diff --git a/go.mod b/go.mod
index 17ac0fd7d..c692215d9 100644
--- a/go.mod
+++ b/go.mod
@@ -57,7 +57,7 @@ require (
 	github.com/ncruces/go-sqlite3 v0.25.0
 	github.com/oklog/ulid v1.3.1
 	github.com/pquerna/otp v1.4.0
-	github.com/prometheus/client_golang v1.21.1
+	github.com/prometheus/client_golang v1.22.0
 	github.com/rivo/uniseg v0.4.7
 	github.com/spf13/cobra v1.9.1
 	github.com/spf13/viper v1.20.1
diff --git a/go.sum b/go.sum
index 450b72a67..a963526b9 100644
--- a/go.sum
+++ b/go.sum
@@ -348,8 +348,8 @@ github.com/pquerna/otp v1.4.0 h1:wZvl1TIVxKRThZIBiwOOHOGP/1+nZyWBil9Y2XNEDzg=
 github.com/pquerna/otp v1.4.0/go.mod h1:dkJfzwRKNiegxyNb54X/3fLwhCynbMspSyWKnvi1AEg=
 github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
 github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
-github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk=
-github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg=
+github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
+github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
 github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
 github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
 github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io=
diff --git a/vendor/github.com/klauspost/compress/.gitattributes b/vendor/github.com/klauspost/compress/.gitattributes
deleted file mode 100644
index 402433593..000000000
--- a/vendor/github.com/klauspost/compress/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-* -text
-*.bin -text -diff
diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore
deleted file mode 100644
index d31b37815..000000000
--- a/vendor/github.com/klauspost/compress/.gitignore
+++ /dev/null
@@ -1,32 +0,0 @@
-# Compiled Object files, Static and Dynamic libs (Shared Objects)
-*.o
-*.a
-*.so
-
-# Folders
-_obj
-_test
-
-# Architecture specific extensions/prefixes
-*.[568vq]
-[568vq].out
-
-*.cgo1.go
-*.cgo2.c
-_cgo_defun.c
-_cgo_gotypes.go
-_cgo_export.*
-
-_testmain.go
-
-*.exe
-*.test
-*.prof
-/s2/cmd/_s2sx/sfx-exe
-
-# Linux perf files
-perf.data
-perf.data.old
-
-# gdb history
-.gdb_history
diff --git a/vendor/github.com/klauspost/compress/.goreleaser.yml b/vendor/github.com/klauspost/compress/.goreleaser.yml
deleted file mode 100644
index 4528059ca..000000000
--- a/vendor/github.com/klauspost/compress/.goreleaser.yml
+++ /dev/null
@@ -1,123 +0,0 @@
-version: 2
-
-before:
-  hooks:
-    - ./gen.sh
-
-builds:
-  -
-    id: "s2c"
-    binary: s2c
-    main: ./s2/cmd/s2c/main.go
-    flags:
-      - -trimpath
-    env:
-      - CGO_ENABLED=0
-    goos:
-      - aix
-      - linux
-      - freebsd
-      - netbsd
-      - windows
-      - darwin
-    goarch:
-      - 386
-      - amd64
-      - arm
-      - arm64
-      - ppc64
-      - ppc64le
-      - mips64
-      - mips64le
-    goarm:
-      - 7
-  -
-    id: "s2d"
-    binary: s2d
-    main: ./s2/cmd/s2d/main.go
-    flags:
-      - -trimpath
-    env:
-      - CGO_ENABLED=0
-    goos:
-      - aix
-      - linux
-      - freebsd
-      - netbsd
-      - windows
-      - darwin
-    goarch:
-      - 386
-      - amd64
-      - arm
-      - arm64
-      - ppc64
-      - ppc64le
-      - mips64
-      - mips64le
-    goarm:
-      - 7
-  -
-    id: "s2sx"
-    binary: s2sx
-    main: ./s2/cmd/_s2sx/main.go
-    flags:
-      - -modfile=s2sx.mod
-      - -trimpath
-    env:
-      - CGO_ENABLED=0
-    goos:
-      - aix
-      - linux
-      - freebsd
-      - netbsd
-      - windows
-      - darwin
-    goarch:
-      - 386
-      - amd64
-      - arm
-      - arm64
-      - ppc64
-      - ppc64le
-      - mips64
-      - mips64le
-    goarm:
-      - 7
-
-archives:
-  -
-    id: s2-binaries
-    name_template: "s2-{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
-    format_overrides:
-      - goos: windows
-        format: zip
-    files:
-      - unpack/*
-      - s2/LICENSE
-      - s2/README.md
-checksum:
-  name_template: 'checksums.txt'
-snapshot:
-  version_template: "{{ .Tag }}-next"
-changelog:
-  sort: asc
-  filters:
-    exclude:
-    - '^doc:'
-    - '^docs:'
-    - '^test:'
-    - '^tests:'
-    - '^Update\sREADME.md'
-
-nfpms:
-  -
-    file_name_template: "s2_package__{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
-    vendor: Klaus Post
-    homepage: https://github.com/klauspost/compress
-    maintainer: Klaus Post <klauspost@gmail.com>
-    description: S2 Compression Tool
-    license: BSD 3-Clause
-    formats:
-      - deb
-      - rpm
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
deleted file mode 100644
index 244ee19c4..000000000
--- a/vendor/github.com/klauspost/compress/README.md
+++ /dev/null
@@ -1,671 +0,0 @@
-# compress
-
-This package provides various compression algorithms.
-
-* [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression in pure Go.
-* [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) is a high performance replacement for Snappy.
-* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
-* [snappy](https://github.com/klauspost/compress/tree/master/snappy) is a drop-in replacement for `github.com/golang/snappy` offering better compression and concurrent streams.
-* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
-* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.
-* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
-
-[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
-[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
-[![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)
-
-# package usage
-
-Use `go get github.com/klauspost/compress@latest` to add it to your project.
-
-This package will support the current Go version and 2 versions back.
-
-* Use the `nounsafe` tag to disable all use of the "unsafe" package.
-* Use the `noasm` tag to disable all assembly across packages.
-
-Use the links above for more information on each.
-
-# changelog
-
-* Feb 19th, 2025 - [1.18.0](https://github.com/klauspost/compress/releases/tag/v1.18.0)
-  * Add unsafe little endian loaders https://github.com/klauspost/compress/pull/1036
-  * fix: check `r.err != nil` but return a nil value error `err` by @alingse in https://github.com/klauspost/compress/pull/1028
-  * flate: Simplify L4-6 loading https://github.com/klauspost/compress/pull/1043
-  * flate: Simplify matchlen (remove asm) https://github.com/klauspost/compress/pull/1045
-  * s2: Improve small block compression speed w/o asm https://github.com/klauspost/compress/pull/1048
-  * flate: Fix matchlen L5+L6 https://github.com/klauspost/compress/pull/1049
-  * flate: Cleanup & reduce casts https://github.com/klauspost/compress/pull/1050
-
-* Oct 11th, 2024 - [1.17.11](https://github.com/klauspost/compress/releases/tag/v1.17.11)
-  * zstd: Fix extra CRC written with multiple Close calls https://github.com/klauspost/compress/pull/1017
-  * s2: Don't use stack for index tables https://github.com/klauspost/compress/pull/1014
-  * gzhttp: No content-type on no body response code by @juliens in https://github.com/klauspost/compress/pull/1011
-  * gzhttp: Do not set the content-type when response has no body by @kevinpollet in https://github.com/klauspost/compress/pull/1013
-
-* Sep 23rd, 2024 - [1.17.10](https://github.com/klauspost/compress/releases/tag/v1.17.10)
-	* gzhttp: Add TransportAlwaysDecompress option. https://github.com/klauspost/compress/pull/978
-	* gzhttp: Add supported decompress request body by @mirecl in https://github.com/klauspost/compress/pull/1002
-	* s2: Add EncodeBuffer buffer recycling callback https://github.com/klauspost/compress/pull/982
-	* zstd: Improve memory usage on small streaming encodes https://github.com/klauspost/compress/pull/1007
-	* flate: read data written with partial flush by @vajexal in https://github.com/klauspost/compress/pull/996
-
-* Jun 12th, 2024 - [1.17.9](https://github.com/klauspost/compress/releases/tag/v1.17.9)
-	* s2: Reduce ReadFrom temporary allocations https://github.com/klauspost/compress/pull/949
-	* flate, zstd: Shave some bytes off amd64 matchLen by @greatroar in https://github.com/klauspost/compress/pull/963
-	* Upgrade zip/zlib to 1.22.4 upstream https://github.com/klauspost/compress/pull/970 https://github.com/klauspost/compress/pull/971
-	* zstd: BuildDict fails with RLE table https://github.com/klauspost/compress/pull/951
-
-* Apr 9th, 2024 - [1.17.8](https://github.com/klauspost/compress/releases/tag/v1.17.8)
-	* zstd: Reject blocks where reserved values are not 0 https://github.com/klauspost/compress/pull/885
-	* zstd: Add RLE detection+encoding https://github.com/klauspost/compress/pull/938
-
-* Feb 21st, 2024 - [1.17.7](https://github.com/klauspost/compress/releases/tag/v1.17.7)
-	* s2: Add AsyncFlush method: Complete the block without flushing by @Jille in https://github.com/klauspost/compress/pull/927
-	* s2: Fix literal+repeat exceeds dst crash https://github.com/klauspost/compress/pull/930
-  
-* Feb 5th, 2024 - [1.17.6](https://github.com/klauspost/compress/releases/tag/v1.17.6)
-	* zstd: Fix incorrect repeat coding in best mode https://github.com/klauspost/compress/pull/923
-	* s2: Fix DecodeConcurrent deadlock on errors https://github.com/klauspost/compress/pull/925
-  
-* Jan 26th, 2024 - [v1.17.5](https://github.com/klauspost/compress/releases/tag/v1.17.5)
-	* flate: Fix reset with dictionary on custom window encodes https://github.com/klauspost/compress/pull/912
-	* zstd: Add Frame header encoding and stripping https://github.com/klauspost/compress/pull/908
-	* zstd: Limit better/best default window to 8MB https://github.com/klauspost/compress/pull/913
-	* zstd: Speed improvements by @greatroar in https://github.com/klauspost/compress/pull/896 https://github.com/klauspost/compress/pull/910
-	* s2: Fix callbacks for skippable blocks and disallow 0xfe (Padding) by @Jille in https://github.com/klauspost/compress/pull/916 https://github.com/klauspost/compress/pull/917
-https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/compress/pull/918
-
-* Dec 1st, 2023 - [v1.17.4](https://github.com/klauspost/compress/releases/tag/v1.17.4)
-	* huff0: Speed up symbol counting by @greatroar in https://github.com/klauspost/compress/pull/887
-	* huff0: Remove byteReader by @greatroar in https://github.com/klauspost/compress/pull/886
-	* gzhttp: Allow overriding decompression on transport https://github.com/klauspost/compress/pull/892
-	* gzhttp: Clamp compression level https://github.com/klauspost/compress/pull/890
-	* gzip: Error out if reserved bits are set https://github.com/klauspost/compress/pull/891
-
-* Nov 15th, 2023 - [v1.17.3](https://github.com/klauspost/compress/releases/tag/v1.17.3)
-	* fse: Fix max header size https://github.com/klauspost/compress/pull/881
-	* zstd: Improve better/best compression https://github.com/klauspost/compress/pull/877
-	* gzhttp: Fix missing content type on Close https://github.com/klauspost/compress/pull/883
-
-* Oct 22nd, 2023 - [v1.17.2](https://github.com/klauspost/compress/releases/tag/v1.17.2)
-	* zstd: Fix rare *CORRUPTION* output in "best" mode. See https://github.com/klauspost/compress/pull/876
-
-* Oct 14th, 2023 - [v1.17.1](https://github.com/klauspost/compress/releases/tag/v1.17.1)
-	* s2: Fix S2 "best" dictionary wrong encoding https://github.com/klauspost/compress/pull/871
-	* flate: Reduce allocations in decompressor and minor code improvements by @fakefloordiv in https://github.com/klauspost/compress/pull/869
-	* s2: Fix EstimateBlockSize on 6&7 length input https://github.com/klauspost/compress/pull/867
-
-* Sept 19th, 2023 - [v1.17.0](https://github.com/klauspost/compress/releases/tag/v1.17.0)
-	* Add experimental dictionary builder  https://github.com/klauspost/compress/pull/853
-	* Add xerial snappy read/writer https://github.com/klauspost/compress/pull/838
-	* flate: Add limited window compression https://github.com/klauspost/compress/pull/843
-	* s2: Do 2 overlapping match checks https://github.com/klauspost/compress/pull/839
-	* flate: Add amd64 assembly matchlen https://github.com/klauspost/compress/pull/837
-	* gzip: Copy bufio.Reader on Reset by @thatguystone in https://github.com/klauspost/compress/pull/860
-
-<details>
-	<summary>See changes to v1.16.x</summary>
-
-   
-* July 1st, 2023 - [v1.16.7](https://github.com/klauspost/compress/releases/tag/v1.16.7)
-	* zstd: Fix default level first dictionary encode https://github.com/klauspost/compress/pull/829
-	* s2: add GetBufferCapacity() method by @GiedriusS in https://github.com/klauspost/compress/pull/832
-
-* June 13, 2023 - [v1.16.6](https://github.com/klauspost/compress/releases/tag/v1.16.6)
-	* zstd: correctly ignore WithEncoderPadding(1) by @ianlancetaylor in https://github.com/klauspost/compress/pull/806
-	* zstd: Add amd64 match length assembly https://github.com/klauspost/compress/pull/824
-	* gzhttp: Handle informational headers by @rtribotte in https://github.com/klauspost/compress/pull/815
-	* s2: Improve Better compression slightly https://github.com/klauspost/compress/pull/663
-
-* Apr 16, 2023 - [v1.16.5](https://github.com/klauspost/compress/releases/tag/v1.16.5)
-	* zstd: readByte needs to use io.ReadFull by @jnoxon in https://github.com/klauspost/compress/pull/802
-	* gzip: Fix WriterTo after initial read https://github.com/klauspost/compress/pull/804
-
-* Apr 5, 2023 - [v1.16.4](https://github.com/klauspost/compress/releases/tag/v1.16.4)
-	* zstd: Improve zstd best efficiency by @greatroar and @klauspost in https://github.com/klauspost/compress/pull/784
-	* zstd: Respect WithAllLitEntropyCompression https://github.com/klauspost/compress/pull/792
-	* zstd: Fix amd64 not always detecting corrupt data https://github.com/klauspost/compress/pull/785
-	* zstd: Various minor improvements by @greatroar in https://github.com/klauspost/compress/pull/788 https://github.com/klauspost/compress/pull/794 https://github.com/klauspost/compress/pull/795
-	* s2: Fix huge block overflow https://github.com/klauspost/compress/pull/779
-	* s2: Allow CustomEncoder fallback https://github.com/klauspost/compress/pull/780
-	* gzhttp: Support ResponseWriter Unwrap() in gzhttp handler by @jgimenez in https://github.com/klauspost/compress/pull/799
-
-* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1)
-	* zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776
-	* gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767
-	* s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766
-	* zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773
-	* huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774
-
-* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0)
-	* s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support.  https://github.com/klauspost/compress/pull/685
-	* s2: Add Compression Size Estimate.  https://github.com/klauspost/compress/pull/752
-	* s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755
-	* s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748
-	* s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747
-	* s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746
-</details>
-
-<details>
-	<summary>See changes to v1.15.x</summary>
-	
-* Jan 21st, 2023 (v1.15.15)
-	* deflate: Improve level 7-9 https://github.com/klauspost/compress/pull/739
-	* zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728
-	* zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745
-	* gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740
-
-* Jan 3rd, 2023 (v1.15.14)
-
-	* flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718
-	* zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720
-	* export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722
-	* s2: Add example for indexing and existing stream https://github.com/klauspost/compress/pull/723
-
-* Dec 11, 2022 (v1.15.13)
-	* zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder  https://github.com/klauspost/compress/pull/691
-	* zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708
-
-* Oct 26, 2022 (v1.15.12)
-
-	* zstd: Tweak decoder allocs. https://github.com/klauspost/compress/pull/680
-	* gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683
-
-* Sept 26, 2022 (v1.15.11)
-
-	* flate: Improve level 1-3 compression  https://github.com/klauspost/compress/pull/678
-	* zstd: Improve "best" compression by @nightwolfz in https://github.com/klauspost/compress/pull/677
-	* zstd: Fix+reduce decompression allocations https://github.com/klauspost/compress/pull/668
-	* zstd: Fix non-effective noescape tag https://github.com/klauspost/compress/pull/667
-
-* Sept 16, 2022 (v1.15.10)
-
-	* zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649
-	* Add Go 1.19 - deprecate Go 1.16  https://github.com/klauspost/compress/pull/651
-	* flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656
-	* zstd: Improve "better" compression  https://github.com/klauspost/compress/pull/657
-	* s2: Improve "best" compression https://github.com/klauspost/compress/pull/658
-	* s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635
-	* s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646
-	* Use arrays for constant size copies https://github.com/klauspost/compress/pull/659
-
-* July 21, 2022 (v1.15.9)
-
-	* zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645
-	* zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644
-	* zstd: Allow single segments up to "max decoded size" https://github.com/klauspost/compress/pull/643
-
-* July 13, 2022 (v1.15.8)
-
-	* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
-	* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
-	* zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
-	* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
-	* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
-	* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
-	* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
-
-* June 29, 2022 (v1.15.7)
-
-	* s2: Fix absolute forward seeks  https://github.com/klauspost/compress/pull/633
-	* zip: Merge upstream  https://github.com/klauspost/compress/pull/631
-	* zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
-	* zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
-	* flate: Faster histograms  https://github.com/klauspost/compress/pull/620
-	* deflate: Use compound hcode  https://github.com/klauspost/compress/pull/622
-
-* June 3, 2022 (v1.15.6)
-	* s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
-	* s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
-	* zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
-	* zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
-	* zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
-	* gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
-	* s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
-	* s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
-	* snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
-	* s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
-
-* May 25, 2022 (v1.15.5)
-	* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
-	* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
-	* huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
-	* zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
-	* zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
-	* zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
-	* zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
-	* huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
-	* flate: Inplace hashing for level 7-9 https://github.com/klauspost/compress/pull/590
-
-
-* May 11, 2022 (v1.15.4)
-	* huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
-	* inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
-	* zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
-	* zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
-
-* May 5, 2022 (v1.15.3)
-	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
-	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
-
-* Apr 26, 2022 (v1.15.2)
-	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
-	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
-	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
-	* Minimum version is Go 1.16, added CI test on 1.18.
-
-* Mar 11, 2022 (v1.15.1)
-	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
-	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
-	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
-	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
-	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
-
-* Mar 3, 2022 (v1.15.0)
-	* zstd: Refactor decoder [#498](https://github.com/klauspost/compress/pull/498)
-	* zstd: Add stream encoding without goroutines [#505](https://github.com/klauspost/compress/pull/505)
-	* huff0: Prevent single blocks exceeding 16 bits by @klauspost in[#507](https://github.com/klauspost/compress/pull/507)
-	* flate: Inline literal emission [#509](https://github.com/klauspost/compress/pull/509)
-	* gzhttp: Add zstd to transport [#400](https://github.com/klauspost/compress/pull/400)
-	* gzhttp: Make content-type optional [#510](https://github.com/klauspost/compress/pull/510)
-
-Both compression and decompression now supports "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
-
-Stream decompression is now faster on asynchronous, since the goroutine allocation much more effectively splits the workload. On typical streams this will typically use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
-
-While the release has been extensively tested, it is recommended to testing when upgrading.
-
-</details>
-
-<details>
-	<summary>See changes to v1.14.x</summary>
-	
-* Feb 22, 2022 (v1.14.4)
-	* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
-	* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
-	* zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501)  #501
-	* huff0: Use static decompression buffer up to 30% faster [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)
-
-* Feb 17, 2022 (v1.14.3)
-	* flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494)  [#478](https://github.com/klauspost/compress/pull/478)
-	* flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483)
-	* s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486)
-
-* Jan 25, 2022 (v1.14.2)
-	* zstd: improve header decoder by @dsnet  [#476](https://github.com/klauspost/compress/pull/476)
-	* zstd: Add bigger default blocks  [#469](https://github.com/klauspost/compress/pull/469)
-	* zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470)
-	* zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472)
-	* flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473)
-	* zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475)
-
-* Jan 11, 2022 (v1.14.1)
-	* s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
-	* flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
-	* zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
-	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
-	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
-</details>
-
-<details>
-	<summary>See changes to v1.13.x</summary>
-	
-* Aug 30, 2021 (v1.13.5)
-	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
-	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
-	* zstd: pooledZipWriter should return Writers to the same pool [#426](https://github.com/klauspost/compress/pull/426)
-	* Removed golang/snappy as external dependency for tests [#421](https://github.com/klauspost/compress/pull/421)
-
-* Aug 12, 2021 (v1.13.4)
-	* Add [snappy replacement package](https://github.com/klauspost/compress/tree/master/snappy).
-	* zstd: Fix incorrect encoding in "best" mode [#415](https://github.com/klauspost/compress/pull/415)
-
-* Aug 3, 2021 (v1.13.3) 
-	* zstd: Improve Best compression [#404](https://github.com/klauspost/compress/pull/404)
-	* zstd: Fix WriteTo error forwarding [#411](https://github.com/klauspost/compress/pull/411)
-	* gzhttp: Return http.HandlerFunc instead of http.Handler. Unlikely breaking change. [#406](https://github.com/klauspost/compress/pull/406)
-	* s2sx: Fix max size error [#399](https://github.com/klauspost/compress/pull/399)
-	* zstd: Add optional stream content size on reset [#401](https://github.com/klauspost/compress/pull/401)
-	* zstd: use SpeedBestCompression for level >= 10 [#410](https://github.com/klauspost/compress/pull/410)
-
-* Jun 14, 2021 (v1.13.1)
-	* s2: Add full Snappy output support  [#396](https://github.com/klauspost/compress/pull/396)
-	* zstd: Add configurable [Decoder window](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithDecoderMaxWindow) size [#394](https://github.com/klauspost/compress/pull/394)
-	* gzhttp: Add header to skip compression  [#389](https://github.com/klauspost/compress/pull/389)
-	* s2: Improve speed with bigger output margin  [#395](https://github.com/klauspost/compress/pull/395)
-
-* Jun 3, 2021 (v1.13.0)
-	* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
-	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
-	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
-</details>
-
-
-<details>
-	<summary>See changes to v1.12.x</summary>
-	
-* May 25, 2021 (v1.12.3)
-	* deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374)
-	* deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375)
-	* zstd: Forward read errors [#373](https://github.com/klauspost/compress/pull/373) 
-
-* Apr 27, 2021 (v1.12.2)
-	* zstd: Improve better/best compression [#360](https://github.com/klauspost/compress/pull/360) [#364](https://github.com/klauspost/compress/pull/364) [#365](https://github.com/klauspost/compress/pull/365)
-	* zstd: Add helpers to compress/decompress zstd inside zip files [#363](https://github.com/klauspost/compress/pull/363)
-	* deflate: Improve level 5+6 compression [#367](https://github.com/klauspost/compress/pull/367)
-	* s2: Improve better/best compression [#358](https://github.com/klauspost/compress/pull/358) [#359](https://github.com/klauspost/compress/pull/358)
-	* s2: Load after checking src limit on amd64. [#362](https://github.com/klauspost/compress/pull/362)
-	* s2sx: Limit max executable size [#368](https://github.com/klauspost/compress/pull/368) 
-
-* Apr 14, 2021 (v1.12.1)
-	* snappy package removed. Upstream added as dependency.
-	* s2: Better compression in "best" mode [#353](https://github.com/klauspost/compress/pull/353)
-	* s2sx: Add stdin input and detect pre-compressed from signature [#352](https://github.com/klauspost/compress/pull/352)
-	* s2c/s2d: Add http as possible input [#348](https://github.com/klauspost/compress/pull/348)
-	* s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352)
-	* zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346)
-	* s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349)
-</details>
-
-<details>
-	<summary>See changes to v1.11.x</summary>
-	
-* Mar 26, 2021 (v1.11.13)
-	* zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345)
-	* zstd: Add [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) encoder option [#336](https://github.com/klauspost/compress/pull/336)
-	* deflate: Improve entropy compression [#338](https://github.com/klauspost/compress/pull/338)
-	* s2: Clean up and minor performance improvement in best [#341](https://github.com/klauspost/compress/pull/341)
-
-* Mar 5, 2021 (v1.11.12)
-	* s2: Add `s2sx` binary that creates [self extracting archives](https://github.com/klauspost/compress/tree/master/s2#s2sx-self-extracting-archives).
-	* s2: Speed up decompression on non-assembly platforms [#328](https://github.com/klauspost/compress/pull/328)
-
-* Mar 1, 2021 (v1.11.9)
-	* s2: Add ARM64 decompression assembly. Around 2x output speed. [#324](https://github.com/klauspost/compress/pull/324)
-	* s2: Improve "better" speed and efficiency. [#325](https://github.com/klauspost/compress/pull/325)
-	* s2: Fix binaries.
-
-* Feb 25, 2021 (v1.11.8)
-	* s2: Fixed occasional out-of-bounds write on amd64. Upgrade recommended.
-	* s2: Add AMD64 assembly for better mode. 25-50% faster. [#315](https://github.com/klauspost/compress/pull/315)
-	* s2: Less upfront decoder allocation. [#322](https://github.com/klauspost/compress/pull/322)
-	* zstd: Faster "compression" of incompressible data. [#314](https://github.com/klauspost/compress/pull/314)
-	* zip: Fix zip64 headers. [#313](https://github.com/klauspost/compress/pull/313)
-  
-* Jan 14, 2021 (v1.11.7)
-	* Use Bytes() interface to get bytes across packages. [#309](https://github.com/klauspost/compress/pull/309)
-	* s2: Add 'best' compression option.  [#310](https://github.com/klauspost/compress/pull/310)
-	* s2: Add ReaderMaxBlockSize, changes `s2.NewReader` signature to include varargs. [#311](https://github.com/klauspost/compress/pull/311)
-	* s2: Fix crash on small better buffers. [#308](https://github.com/klauspost/compress/pull/308)
-	* s2: Clean up decoder. [#312](https://github.com/klauspost/compress/pull/312)
-
-* Jan 7, 2021 (v1.11.6)
-	* zstd: Make decoder allocations smaller [#306](https://github.com/klauspost/compress/pull/306)
-	* zstd: Free Decoder resources when Reset is called with a nil io.Reader  [#305](https://github.com/klauspost/compress/pull/305)
-
-* Dec 20, 2020 (v1.11.4)
-	* zstd: Add Best compression mode [#304](https://github.com/klauspost/compress/pull/304)
-	* Add header decoder [#299](https://github.com/klauspost/compress/pull/299)
-	* s2: Add uncompressed stream option [#297](https://github.com/klauspost/compress/pull/297)
-	* Simplify/speed up small blocks with known max size. [#300](https://github.com/klauspost/compress/pull/300)
-	* zstd: Always reset literal dict encoder [#303](https://github.com/klauspost/compress/pull/303)
-
-* Nov 15, 2020 (v1.11.3)
-	* inflate: 10-15% faster decompression  [#293](https://github.com/klauspost/compress/pull/293)
-	* zstd: Tweak DecodeAll default allocation [#295](https://github.com/klauspost/compress/pull/295)
-
-* Oct 11, 2020 (v1.11.2)
-	* s2: Fix out of bounds read in "better" block compression [#291](https://github.com/klauspost/compress/pull/291)
-
-* Oct 1, 2020 (v1.11.1)
-	* zstd: Set allLitEntropy true in default configuration [#286](https://github.com/klauspost/compress/pull/286)
-
-* Sept 8, 2020 (v1.11.0)
-	* zstd: Add experimental compression [dictionaries](https://github.com/klauspost/compress/tree/master/zstd#dictionaries) [#281](https://github.com/klauspost/compress/pull/281)
-	* zstd: Fix mixed Write and ReadFrom calls [#282](https://github.com/klauspost/compress/pull/282)
-	* inflate/gz: Limit variable shifts, ~5% faster decompression [#274](https://github.com/klauspost/compress/pull/274)
-</details>
-
-<details>
-	<summary>See changes to v1.10.x</summary>
- 
-* July 8, 2020 (v1.10.11) 
-	* zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278)
-	* huff0: Also populate compression table when reading decoding table. [#275](https://github.com/klauspost/compress/pull/275)
-	
-* June 23, 2020 (v1.10.10) 
-	* zstd: Skip entropy compression in fastest mode when no matches. [#270](https://github.com/klauspost/compress/pull/270)
-	
-* June 16, 2020 (v1.10.9): 
-	* zstd: API change for specifying dictionaries. See [#268](https://github.com/klauspost/compress/pull/268)
-	* zip: update CreateHeaderRaw to handle zip64 fields. [#266](https://github.com/klauspost/compress/pull/266)
-	* Fuzzit tests removed. The service has been purchased and is no longer available.
-	
-* June 5, 2020 (v1.10.8): 
-	* 1.15x faster zstd block decompression. [#265](https://github.com/klauspost/compress/pull/265)
-	
-* June 1, 2020 (v1.10.7): 
-	* Added zstd decompression [dictionary support](https://github.com/klauspost/compress/tree/master/zstd#dictionaries)
-	* Increase zstd decompression speed up to 1.19x.  [#259](https://github.com/klauspost/compress/pull/259)
-	* Remove internal reset call in zstd compression and reduce allocations. [#263](https://github.com/klauspost/compress/pull/263)
-	
-* May 21, 2020: (v1.10.6) 
-	* zstd: Reduce allocations while decoding. [#258](https://github.com/klauspost/compress/pull/258), [#252](https://github.com/klauspost/compress/pull/252)
-	* zstd: Stricter decompression checks.
-	
-* April 12, 2020: (v1.10.5)
-	* s2-commands: Flush output when receiving SIGINT. [#239](https://github.com/klauspost/compress/pull/239)
-	
-* Apr 8, 2020: (v1.10.4) 
-	* zstd: Minor/special case optimizations. [#251](https://github.com/klauspost/compress/pull/251),  [#250](https://github.com/klauspost/compress/pull/250),  [#249](https://github.com/klauspost/compress/pull/249),  [#247](https://github.com/klauspost/compress/pull/247)
-* Mar 11, 2020: (v1.10.3) 
-	* s2: Use S2 encoder in pure Go mode for Snappy output as well. [#245](https://github.com/klauspost/compress/pull/245)
-	* s2: Fix pure Go block encoder. [#244](https://github.com/klauspost/compress/pull/244)
-	* zstd: Added "better compression" mode. [#240](https://github.com/klauspost/compress/pull/240)
-	* zstd: Improve speed of fastest compression mode by 5-10% [#241](https://github.com/klauspost/compress/pull/241)
-	* zstd: Skip creating encoders when not needed. [#238](https://github.com/klauspost/compress/pull/238)
-	
-* Feb 27, 2020: (v1.10.2) 
-	* Close to 50% speedup in inflate (gzip/zip decompression). [#236](https://github.com/klauspost/compress/pull/236) [#234](https://github.com/klauspost/compress/pull/234) [#232](https://github.com/klauspost/compress/pull/232)
-	* Reduce deflate level 1-6 memory usage up to 59%. [#227](https://github.com/klauspost/compress/pull/227)
-	
-* Feb 18, 2020: (v1.10.1)
-	* Fix zstd crash when resetting multiple times without sending data. [#226](https://github.com/klauspost/compress/pull/226)
-	* deflate: Fix dictionary use on level 1-6. [#224](https://github.com/klauspost/compress/pull/224)
-	* Remove deflate writer reference when closing. [#224](https://github.com/klauspost/compress/pull/224)
-	
-* Feb 4, 2020: (v1.10.0) 
-	* Add optional dictionary to [stateless deflate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc#StatelessDeflate). Breaking change, send `nil` for previous behaviour. [#216](https://github.com/klauspost/compress/pull/216)
-	* Fix buffer overflow on repeated small block deflate.  [#218](https://github.com/klauspost/compress/pull/218)
-	* Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214)
-	* Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s.  [#186](https://github.com/klauspost/compress/pull/186)
-
-</details>
-
-<details>
-	<summary>See changes prior to v1.10.0</summary>
-
-* Jan 20,2020 (v1.9.8) Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056),  [#206](https://github.com/klauspost/compress/pull/206).
-* Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204) 
-* Jan 5, 2020: (v1.9.7) Fix another zstd regression in v1.9.5 - v1.9.6 removed.
-* Jan 4, 2020: (v1.9.6) Regression in v1.9.5 fixed causing corrupt zstd encodes in rare cases.
-* Jan 4, 2020: Faster IO in [s2c + s2d commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) compression/decompression. [#192](https://github.com/klauspost/compress/pull/192)
-* Dec 29, 2019: Removed v1.9.5 since fuzz tests showed a compatibility problem with the reference zstandard decoder.
-* Dec 29, 2019: (v1.9.5) zstd: 10-20% faster block compression. [#199](https://github.com/klauspost/compress/pull/199)
-* Dec 29, 2019: [zip](https://godoc.org/github.com/klauspost/compress/zip) package updated with latest Go features
-* Dec 29, 2019: zstd: Single segment flag condintions tweaked. [#197](https://github.com/klauspost/compress/pull/197)
-* Dec 18, 2019: s2: Faster compression when ReadFrom is used. [#198](https://github.com/klauspost/compress/pull/198)
-* Dec 10, 2019: s2: Fix repeat length output when just above at 16MB limit.
-* Dec 10, 2019: zstd: Add function to get decoder as io.ReadCloser. [#191](https://github.com/klauspost/compress/pull/191)
-* Dec 3, 2019: (v1.9.4) S2: limit max repeat length. [#188](https://github.com/klauspost/compress/pull/188)
-* Dec 3, 2019: Add [WithNoEntropyCompression](https://godoc.org/github.com/klauspost/compress/zstd#WithNoEntropyCompression) to zstd [#187](https://github.com/klauspost/compress/pull/187)
-* Dec 3, 2019: Reduce memory use for tests. Check for leaked goroutines.
-* Nov 28, 2019 (v1.9.3) Less allocations in stateless deflate.
-* Nov 28, 2019: 5-20% Faster huff0 decode. Impacts zstd as well. [#184](https://github.com/klauspost/compress/pull/184)
-* Nov 12, 2019 (v1.9.2) Added [Stateless Compression](#stateless-compression) for gzip/deflate.
-* Nov 12, 2019: Fixed zstd decompression of large single blocks. [#180](https://github.com/klauspost/compress/pull/180)
-* Nov 11, 2019: Set default  [s2c](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) block size to 4MB.
-* Nov 11, 2019: Reduce inflate memory use by 1KB.
-* Nov 10, 2019: Less allocations in deflate bit writer.
-* Nov 10, 2019: Fix inconsistent error returned by zstd decoder.
-* Oct 28, 2019 (v1.9.1) ztsd: Fix crash when compressing blocks. [#174](https://github.com/klauspost/compress/pull/174)
-* Oct 24, 2019 (v1.9.0) zstd: Fix rare data corruption [#173](https://github.com/klauspost/compress/pull/173)
-* Oct 24, 2019 zstd: Fix huff0 out of buffer write [#171](https://github.com/klauspost/compress/pull/171) and always return errors [#172](https://github.com/klauspost/compress/pull/172) 
-* Oct 10, 2019: Big deflate rewrite, 30-40% faster with better compression [#105](https://github.com/klauspost/compress/pull/105)
-
-</details>
-
-<details>
-	<summary>See changes prior to v1.9.0</summary>
-
-* Oct 10, 2019: (v1.8.6) zstd: Allow partial reads to get flushed data. [#169](https://github.com/klauspost/compress/pull/169)
-* Oct 3, 2019: Fix inconsistent results on broken zstd streams.
-* Sep 25, 2019: Added `-rm` (remove source files) and `-q` (no output except errors) to `s2c` and `s2d` [commands](https://github.com/klauspost/compress/tree/master/s2#commandline-tools)
-* Sep 16, 2019: (v1.8.4) Add `s2c` and `s2d` [commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools).
-* Sep 10, 2019: (v1.8.3) Fix s2 decoder [Skip](https://godoc.org/github.com/klauspost/compress/s2#Reader.Skip).
-* Sep 7, 2019: zstd: Added [WithWindowSize](https://godoc.org/github.com/klauspost/compress/zstd#WithWindowSize), contributed by [ianwilkes](https://github.com/ianwilkes).
-* Sep 5, 2019: (v1.8.2) Add [WithZeroFrames](https://godoc.org/github.com/klauspost/compress/zstd#WithZeroFrames) which adds full zero payload block encoding option.
-* Sep 5, 2019: Lazy initialization of zstandard predefined en/decoder tables.
-* Aug 26, 2019: (v1.8.1) S2: 1-2% compression increase in "better" compression mode.
-* Aug 26, 2019: zstd: Check maximum size of Huffman 1X compressed literals while decoding.
-* Aug 24, 2019: (v1.8.0) Added [S2 compression](https://github.com/klauspost/compress/tree/master/s2#s2-compression), a high performance replacement for Snappy. 
-* Aug 21, 2019: (v1.7.6) Fixed minor issues found by fuzzer. One could lead to zstd not decompressing.
-* Aug 18, 2019: Add [fuzzit](https://fuzzit.dev/) continuous fuzzing.
-* Aug 14, 2019: zstd: Skip incompressible data 2x faster.  [#147](https://github.com/klauspost/compress/pull/147)
-* Aug 4, 2019 (v1.7.5): Better literal compression. [#146](https://github.com/klauspost/compress/pull/146)
-* Aug 4, 2019: Faster zstd compression. [#143](https://github.com/klauspost/compress/pull/143) [#144](https://github.com/klauspost/compress/pull/144)
-* Aug 4, 2019: Faster zstd decompression. [#145](https://github.com/klauspost/compress/pull/145) [#143](https://github.com/klauspost/compress/pull/143) [#142](https://github.com/klauspost/compress/pull/142)
-* July 15, 2019 (v1.7.4): Fix double EOF block in rare cases on zstd encoder.
-* July 15, 2019 (v1.7.3): Minor speedup/compression increase in default zstd encoder.
-* July 14, 2019: zstd decoder: Fix decompression error on multiple uses with mixed content.
-* July 7, 2019 (v1.7.2): Snappy update, zstd decoder potential race fix.
-* June 17, 2019: zstd decompression bugfix.
-* June 17, 2019: fix 32 bit builds.
-* June 17, 2019: Easier use in modules (less dependencies).
-* June 9, 2019: New stronger "default" [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression mode. Matches zstd default compression ratio.
-* June 5, 2019: 20-40% throughput in [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and better compression.
-* June 5, 2019: deflate/gzip compression: Reduce memory usage of lower compression levels.
-* June 2, 2019: Added [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression!
-* May 25, 2019: deflate/gzip: 10% faster bit writer, mostly visible in lower levels.
-* Apr 22, 2019: [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) decompression added.
-* Aug 1, 2018: Added [huff0 README](https://github.com/klauspost/compress/tree/master/huff0#huff0-entropy-compression).
-* Jul 8, 2018: Added [Performance Update 2018](#performance-update-2018) below.
-* Jun 23, 2018: Merged [Go 1.11 inflate optimizations](https://go-review.googlesource.com/c/go/+/102235). Go 1.9 is now required. Backwards compatible version tagged with [v1.3.0](https://github.com/klauspost/compress/releases/tag/v1.3.0).
-* Apr 2, 2018: Added [huff0](https://godoc.org/github.com/klauspost/compress/huff0) en/decoder. Experimental for now, API may change.
-* Mar 4, 2018: Added [FSE Entropy](https://godoc.org/github.com/klauspost/compress/fse) en/decoder. Experimental for now, API may change.
-* Nov 3, 2017: Add compression [Estimate](https://godoc.org/github.com/klauspost/compress#Estimate) function.
-* May 28, 2017: Reduce allocations when resetting decoder.
-* Apr 02, 2017: Change back to official crc32, since changes were merged in Go 1.7.
-* Jan 14, 2017: Reduce stack pressure due to array copies. See [Issue #18625](https://github.com/golang/go/issues/18625).
-* Oct 25, 2016: Level 2-4 have been rewritten and now offers significantly better performance than before.
-* Oct 20, 2016: Port zlib changes from Go 1.7 to fix zlib writer issue. Please update.
-* Oct 16, 2016: Go 1.7 changes merged. Apples to apples this package is a few percent faster, but has a significantly better balance between speed and compression per level. 
-* Mar 24, 2016: Always attempt Huffman encoding on level 4-7. This improves base 64 encoded data compression.
-* Mar 24, 2016: Small speedup for level 1-3.
-* Feb 19, 2016: Faster bit writer, level -2 is 15% faster, level 1 is 4% faster.
-* Feb 19, 2016: Handle small payloads faster in level 1-3.
-* Feb 19, 2016: Added faster level 2 + 3 compression modes.
-* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progression in terms of compression. New default level is 5.
-* Feb 14, 2016: Snappy: Merge upstream changes. 
-* Feb 14, 2016: Snappy: Fix aggressive skipping.
-* Feb 14, 2016: Snappy: Update benchmark.
-* Feb 13, 2016: Deflate: Fixed assembler problem that could lead to sub-optimal compression.
-* Feb 12, 2016: Snappy: Added AMD64 SSE 4.2 optimizations to matching, which makes easy to compress material run faster. Typical speedup is around 25%.
-* Feb 9, 2016: Added Snappy package fork. This version is 5-7% faster, much more on hard to compress content.
-* Jan 30, 2016: Optimize level 1 to 3 by not considering static dictionary or storing uncompressed. ~4-5% speedup.
-* Jan 16, 2016: Optimization on deflate level 1,2,3 compression.
-* Jan 8 2016: Merge [CL 18317](https://go-review.googlesource.com/#/c/18317): fix reading, writing of zip64 archives.
-* Dec 8 2015: Make level 1 and -2 deterministic even if write size differs.
-* Dec 8 2015: Split encoding functions, so hashing and matching can potentially be inlined. 1-3% faster on AMD64. 5% faster on other platforms.
-* Dec 8 2015: Fixed rare [one byte out-of bounds read](https://github.com/klauspost/compress/issues/20). Please update!
-* Nov 23 2015: Optimization on token writer. ~2-4% faster. Contributed by [@dsnet](https://github.com/dsnet).
-* Nov 20 2015: Small optimization to bit writer on 64 bit systems.
-* Nov 17 2015: Fixed out-of-bound errors if the underlying Writer returned an error. See [#15](https://github.com/klauspost/compress/issues/15).
-* Nov 12 2015: Added [io.WriterTo](https://golang.org/pkg/io/#WriterTo) support to gzip/inflate.
-* Nov 11 2015: Merged [CL 16669](https://go-review.googlesource.com/#/c/16669/4): archive/zip: enable overriding (de)compressors per file
-* Oct 15 2015: Added skipping on uncompressible data. Random data speed up >5x.
-
-</details>
-
-# deflate usage
-
-The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:
-
-Typical speed is about 2x of the standard library packages.
-
-| old import       | new import                            | Documentation                                                           |
-|------------------|---------------------------------------|-------------------------------------------------------------------------|
-| `compress/gzip`  | `github.com/klauspost/compress/gzip`  | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc)   |
-| `compress/zlib`  | `github.com/klauspost/compress/zlib`  | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc)   |
-| `archive/zip`    | `github.com/klauspost/compress/zip`   | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc)     |
-| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) |
-
-* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
-
-You may also be interested in [pgzip](https://github.com/klauspost/pgzip), which is a drop in replacement for gzip, which support multithreaded compression on big files and the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages.
-
-The packages contains the same as the standard library, so you can use the godoc for that: [gzip](http://golang.org/pkg/compress/gzip/), [zip](http://golang.org/pkg/archive/zip/),  [zlib](http://golang.org/pkg/compress/zlib/), [flate](http://golang.org/pkg/compress/flate/).
-
-Currently there is only minor speedup on decompression (mostly CRC32 calculation).
-
-Memory usage is typically 1MB for a Writer. stdlib is in the same range. 
-If you expect to have a lot of concurrently allocated Writers consider using 
-the stateless compress described below.
-
-For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).
-
-To disable all assembly add `-tags=noasm`. This works across all packages.
-
-# Stateless compression
-
-This package offers stateless compression as a special option for gzip/deflate. 
-It will do compression but without maintaining any state between Write calls.
-
-This means there will be no memory kept between Write calls, but compression and speed will be suboptimal.
-
-This is only relevant in cases where you expect to run many thousands of compressors concurrently, 
-but with very little activity. This is *not* intended for regular web servers serving individual requests.  
-
-Because of this, the size of actual Write calls will affect output size.
-
-In gzip, specify level `-3` / `gzip.StatelessCompression` to enable.
-
-For direct deflate use, NewStatelessWriter and StatelessDeflate are available. See [documentation](https://godoc.org/github.com/klauspost/compress/flate#NewStatelessWriter)
-
-A `bufio.Writer` can of course be used to control write sizes. For example, to use a 4KB buffer:
-
-```go
-	// replace 'ioutil.Discard' with your output.
-	gzw, err := gzip.NewWriterLevel(ioutil.Discard, gzip.StatelessCompression)
-	if err != nil {
-		return err
-	}
-	defer gzw.Close()
-
-	w := bufio.NewWriterSize(gzw, 4096)
-	defer w.Flush()
-	
-	// Write to 'w' 
-```
-
-This will only use up to 4KB in memory when the writer is idle. 
-
-Compression is almost always worse than the fastest compression level 
-and each write will allocate (a little) memory. 
-
-
-# Other packages
-
-Here are other packages of good quality and pure Go (no cgo wrappers or autoconverted code):
-
-* [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.
-* [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.
-* [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.
-* [github.com/ronanh/intcomp](https://github.com/ronanh/intcomp) - Integer compression.
-* [github.com/spenczar/fpc](https://github.com/spenczar/fpc) - Float compression.
-* [github.com/minio/zipindex](https://github.com/minio/zipindex) - External ZIP directory index.
-* [github.com/ybirader/pzip](https://github.com/ybirader/pzip) - Fast concurrent zip archiver and extractor.
-
-# license
-
-This code is licensed under the same conditions as the original Go code. See LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/SECURITY.md b/vendor/github.com/klauspost/compress/SECURITY.md
deleted file mode 100644
index ca6685e2b..000000000
--- a/vendor/github.com/klauspost/compress/SECURITY.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Security Policy
-
-## Supported Versions
-
-Security updates are applied only to the latest release.
-
-## Vulnerability Definition
-
-A security vulnerability is a bug that with certain input triggers a crash or an infinite loop. Most calls will have varying execution time and only in rare cases will slow operation be considered a security vulnerability.
-
-Corrupted output generally is not considered a security vulnerability, unless independent operations are able to affect each other. Note that not all functionality is re-entrant and safe to use concurrently.
-
-Out-of-memory crashes only applies if the en/decoder uses an abnormal amount of memory, with appropriate options applied, to limit maximum window size, concurrency, etc. However, if you are in doubt you are welcome to file a security issue.
-
-It is assumed that all callers are trusted, meaning internal data exposed through reflection or inspection of returned data structures is not considered a vulnerability.
-
-Vulnerabilities resulting from compiler/assembler errors should be reported upstream. Depending on the severity this package may or may not implement a workaround.
-
-## Reporting a Vulnerability
-
-If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it at [security advisory](https://github.com/klauspost/compress/security/advisories/new). If possible please provide a minimal reproducer. If the issue only applies to a single platform, it would be helpful to provide access to that.
-
-This project is maintained by a team of volunteers on a reasonable-effort basis. As such, vulnerabilities will be disclosed in a best effort base.
diff --git a/vendor/github.com/klauspost/compress/compressible.go b/vendor/github.com/klauspost/compress/compressible.go
deleted file mode 100644
index ea5a692d5..000000000
--- a/vendor/github.com/klauspost/compress/compressible.go
+++ /dev/null
@@ -1,85 +0,0 @@
-package compress
-
-import "math"
-
-// Estimate returns a normalized compressibility estimate of block b.
-// Values close to zero are likely uncompressible.
-// Values above 0.1 are likely to be compressible.
-// Values above 0.5 are very compressible.
-// Very small lengths will return 0.
-func Estimate(b []byte) float64 {
-	if len(b) < 16 {
-		return 0
-	}
-
-	// Correctly predicted order 1
-	hits := 0
-	lastMatch := false
-	var o1 [256]byte
-	var hist [256]int
-	c1 := byte(0)
-	for _, c := range b {
-		if c == o1[c1] {
-			// We only count a hit if there was two correct predictions in a row.
-			if lastMatch {
-				hits++
-			}
-			lastMatch = true
-		} else {
-			lastMatch = false
-		}
-		o1[c1] = c
-		c1 = c
-		hist[c]++
-	}
-
-	// Use x^0.6 to give better spread
-	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
-
-	// Calculate histogram distribution
-	variance := float64(0)
-	avg := float64(len(b)) / 256
-
-	for _, v := range hist {
-		Δ := float64(v) - avg
-		variance += Δ * Δ
-	}
-
-	stddev := math.Sqrt(float64(variance)) / float64(len(b))
-	exp := math.Sqrt(1 / float64(len(b)))
-
-	// Subtract expected stddev
-	stddev -= exp
-	if stddev < 0 {
-		stddev = 0
-	}
-	stddev *= 1 + exp
-
-	// Use x^0.4 to give better spread
-	entropy := math.Pow(stddev, 0.4)
-
-	// 50/50 weight between prediction and histogram distribution
-	return math.Pow((prediction+entropy)/2, 0.9)
-}
-
-// ShannonEntropyBits returns the number of bits minimum required to represent
-// an entropy encoding of the input bytes.
-// https://en.wiktionary.org/wiki/Shannon_entropy
-func ShannonEntropyBits(b []byte) int {
-	if len(b) == 0 {
-		return 0
-	}
-	var hist [256]int
-	for _, c := range b {
-		hist[c]++
-	}
-	shannon := float64(0)
-	invTotal := 1.0 / float64(len(b))
-	for _, v := range hist[:] {
-		if v > 0 {
-			n := float64(v)
-			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
-		}
-	}
-	return int(math.Ceil(shannon))
-}
diff --git a/vendor/github.com/klauspost/compress/fse/README.md b/vendor/github.com/klauspost/compress/fse/README.md
deleted file mode 100644
index ea7324da6..000000000
--- a/vendor/github.com/klauspost/compress/fse/README.md
+++ /dev/null
@@ -1,79 +0,0 @@
-# Finite State Entropy
-
-This package provides Finite State Entropy encoding and decoding.
-            
-Finite State Entropy (also referenced as [tANS](https://en.wikipedia.org/wiki/Asymmetric_numeral_systems#tANS)) 
-encoding provides a fast near-optimal symbol encoding/decoding
-for byte blocks as implemented in [zstandard](https://github.com/facebook/zstd).
-
-This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
-This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
-but it can be used as a secondary step to compressors (like Snappy) that does not do entropy encoding. 
-
-* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/fse)
-
-## News
-
- * Feb 2018: First implementation released. Consider this beta software for now.
-
-# Usage
-
-This package provides a low level interface that allows to compress single independent blocks. 
-
-Each block is separate, and there is no built in integrity checks. 
-This means that the caller should keep track of block sizes and also do checksums if needed.  
-
-Compressing a block is done via the [`Compress`](https://godoc.org/github.com/klauspost/compress/fse#Compress) function.
-You must provide input and will receive the output and maybe an error.
-
-These error values can be returned:
-
-| Error               | Description                                                                 |
-|---------------------|-----------------------------------------------------------------------------|
-| `<nil>`             | Everything ok, output is returned                                           |
-| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
-| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
-| `(error)`           | An internal error occurred.                                                 |
-
-As can be seen above there are errors that will be returned even under normal operation so it is important to handle these.
-
-To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/fse#Scratch) object 
-that can be re-used for successive calls. Both compression and decompression accepts a `Scratch` object, and the same 
-object can be used for both.   
-
-Be aware, that when re-using a `Scratch` object that the *output* buffer is also re-used, so if you are still using this
-you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
-
-Decompressing is done by calling the [`Decompress`](https://godoc.org/github.com/klauspost/compress/fse#Decompress) function.
-You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
-your input was likely corrupted. 
-
-It is important to note that a successful decoding does *not* mean your output matches your original input. 
-There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
-
-For more detailed usage, see examples in the [godoc documentation](https://godoc.org/github.com/klauspost/compress/fse#pkg-examples).
-
-# Performance
-
-A lot of factors are affecting speed. Block sizes and compressibility of the material are primary factors.  
-All compression functions are currently only running on the calling goroutine so only one core will be used per block.  
-
-The compressor is significantly faster if symbols are kept as small as possible. The highest byte value of the input
-is used to reduce some of the processing, so if all your input is above byte value 64 for instance, it may be 
-beneficial to transpose all your input values down by 64.   
-
-With moderate block sizes around 64k speed are typically 200MB/s per core for compression and 
-around 300MB/s decompression speed. 
-
-The same hardware typically does Huffman (deflate) encoding at 125MB/s and decompression at 100MB/s. 
-
-# Plans
-
-At one point, more internals will be exposed to facilitate more "expert" usage of the components. 
-
-A streaming interface is also likely to be implemented. Likely compatible with [FSE stream format](https://github.com/Cyan4973/FiniteStateEntropy/blob/dev/programs/fileio.c#L261).  
-
-# Contributing
-
-Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.  
\ No newline at end of file
diff --git a/vendor/github.com/klauspost/compress/fse/bitreader.go b/vendor/github.com/klauspost/compress/fse/bitreader.go
deleted file mode 100644
index f65eb3909..000000000
--- a/vendor/github.com/klauspost/compress/fse/bitreader.go
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package fse
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-)
-
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReader struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.bitsRead += 8 - uint8(highBits(uint32(v)))
-	return nil
-}
-
-// getBits will return n bits. n can be 0.
-func (b *bitReader) getBits(n uint8) uint16 {
-	if n == 0 || b.bitsRead >= 64 {
-		return 0
-	}
-	return b.getBitsFast(n)
-}
-
-// getBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	b.bitsRead += n
-	return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-	// 2 bounds checks.
-	v := b.in[b.off-4:]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value = (b.value << 8) | uint64(b.in[b.off-1])
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
-	return b.bitsRead >= 64 && b.off == 0
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
-	// Release reference.
-	b.in = nil
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/fse/bitwriter.go b/vendor/github.com/klauspost/compress/fse/bitwriter.go
deleted file mode 100644
index e82fa3bb7..000000000
--- a/vendor/github.com/klauspost/compress/fse/bitwriter.go
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package fse
-
-import "fmt"
-
-// bitWriter will write bits.
-// First bit will be LSB of the first byte of output.
-type bitWriter struct {
-	bitContainer uint64
-	nBits        uint8
-	out          []byte
-}
-
-// bitMask16 is bitmasks. Has extra to avoid bounds check.
-var bitMask16 = [32]uint16{
-	0, 1, 3, 7, 0xF, 0x1F,
-	0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
-	0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
-	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
-	0xFFFF, 0xFFFF} /* up to 16 bits */
-
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// addBits16ZeroNC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
-	if bits == 0 {
-		return
-	}
-	value <<= (16 - bits) & 15
-	value >>= (16 - bits) & 15
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
-	v := b.nBits >> 3
-	switch v {
-	case 0:
-	case 1:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-		)
-	case 2:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-		)
-	case 3:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-		)
-	case 4:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-		)
-	case 5:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-		)
-	case 6:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-		)
-	case 7:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-		)
-	case 8:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-			byte(b.bitContainer>>56),
-		)
-	default:
-		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
-	}
-	b.bitContainer >>= v << 3
-	b.nBits &= 7
-}
-
-// flush32 will flush out, so there are at least 32 bits available for writing.
-func (b *bitWriter) flush32() {
-	if b.nBits < 32 {
-		return
-	}
-	b.out = append(b.out,
-		byte(b.bitContainer),
-		byte(b.bitContainer>>8),
-		byte(b.bitContainer>>16),
-		byte(b.bitContainer>>24))
-	b.nBits -= 32
-	b.bitContainer >>= 32
-}
-
-// flushAlign will flush remaining full bytes and align to next byte boundary.
-func (b *bitWriter) flushAlign() {
-	nbBytes := (b.nBits + 7) >> 3
-	for i := uint8(0); i < nbBytes; i++ {
-		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
-	}
-	b.nBits = 0
-	b.bitContainer = 0
-}
-
-// close will write the alignment bit and write the final byte(s)
-// to the output.
-func (b *bitWriter) close() {
-	// End mark
-	b.addBits16Clean(1, 1)
-	// flush until next byte.
-	b.flushAlign()
-}
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
-	b.bitContainer = 0
-	b.nBits = 0
-	b.out = out
-}
diff --git a/vendor/github.com/klauspost/compress/fse/bytereader.go b/vendor/github.com/klauspost/compress/fse/bytereader.go
deleted file mode 100644
index abade2d60..000000000
--- a/vendor/github.com/klauspost/compress/fse/bytereader.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package fse
-
-// byteReader provides a byte reader that reads
-// little endian values from a byte stream.
-// The input stream is manually advanced.
-// The reader performs no bounds checks.
-type byteReader struct {
-	b   []byte
-	off int
-}
-
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
-	b.b = in
-	b.off = 0
-}
-
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
-	b.off += int(n)
-}
-
-// Uint32 returns a little endian uint32 starting at current offset.
-func (b byteReader) Uint32() uint32 {
-	b2 := b.b[b.off:]
-	b2 = b2[:4]
-	v3 := uint32(b2[3])
-	v2 := uint32(b2[2])
-	v1 := uint32(b2[1])
-	v0 := uint32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
-}
-
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
-	return b.b[b.off:]
-}
-
-// remain will return the number of bytes remaining.
-func (b byteReader) remain() int {
-	return len(b.b) - b.off
-}
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
deleted file mode 100644
index 074018d8f..000000000
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ /dev/null
@@ -1,683 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package fse
-
-import (
-	"errors"
-	"fmt"
-)
-
-// Compress the input bytes. Input must be < 2GB.
-// Provide a Scratch buffer to avoid memory allocations.
-// Note that the output is also kept in the scratch buffer.
-// If input is too hard to compress, ErrIncompressible is returned.
-// If input is a single byte value repeated ErrUseRLE is returned.
-func Compress(in []byte, s *Scratch) ([]byte, error) {
-	if len(in) <= 1 {
-		return nil, ErrIncompressible
-	}
-	if len(in) > (2<<30)-1 {
-		return nil, errors.New("input too big, must be < 2GB")
-	}
-	s, err := s.prepare(in)
-	if err != nil {
-		return nil, err
-	}
-
-	// Create histogram, if none was provided.
-	maxCount := s.maxCount
-	if maxCount == 0 {
-		maxCount = s.countSimple(in)
-	}
-	// Reset for next run.
-	s.clearCount = true
-	s.maxCount = 0
-	if maxCount == len(in) {
-		// One symbol, use RLE
-		return nil, ErrUseRLE
-	}
-	if maxCount == 1 || maxCount < (len(in)>>7) {
-		// Each symbol present maximum once or too well distributed.
-		return nil, ErrIncompressible
-	}
-	s.optimalTableLog()
-	err = s.normalizeCount()
-	if err != nil {
-		return nil, err
-	}
-	err = s.writeCount()
-	if err != nil {
-		return nil, err
-	}
-
-	if false {
-		err = s.validateNorm()
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	err = s.buildCTable()
-	if err != nil {
-		return nil, err
-	}
-	err = s.compress(in)
-	if err != nil {
-		return nil, err
-	}
-	s.Out = s.bw.out
-	// Check if we compressed.
-	if len(s.Out) >= len(in) {
-		return nil, ErrIncompressible
-	}
-	return s.Out, nil
-}
-
-// cState contains the compression state of a stream.
-type cState struct {
-	bw         *bitWriter
-	stateTable []uint16
-	state      uint16
-}
-
-// init will initialize the compression state to the first symbol of the stream.
-func (c *cState) init(bw *bitWriter, ct *cTable, tableLog uint8, first symbolTransform) {
-	c.bw = bw
-	c.stateTable = ct.stateTable
-
-	nbBitsOut := (first.deltaNbBits + (1 << 15)) >> 16
-	im := int32((nbBitsOut << 16) - first.deltaNbBits)
-	lu := (im >> nbBitsOut) + first.deltaFindState
-	c.state = c.stateTable[lu]
-}
-
-// encode the output symbol provided and write it to the bitstream.
-func (c *cState) encode(symbolTT symbolTransform) {
-	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
-	dstState := int32(c.state>>(nbBitsOut&15)) + symbolTT.deltaFindState
-	c.bw.addBits16NC(c.state, uint8(nbBitsOut))
-	c.state = c.stateTable[dstState]
-}
-
-// encode the output symbol provided and write it to the bitstream.
-func (c *cState) encodeZero(symbolTT symbolTransform) {
-	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
-	dstState := int32(c.state>>(nbBitsOut&15)) + symbolTT.deltaFindState
-	c.bw.addBits16ZeroNC(c.state, uint8(nbBitsOut))
-	c.state = c.stateTable[dstState]
-}
-
-// flush will write the tablelog to the output and flush the remaining full bytes.
-func (c *cState) flush(tableLog uint8) {
-	c.bw.flush32()
-	c.bw.addBits16NC(c.state, tableLog)
-	c.bw.flush()
-}
-
-// compress is the main compression loop that will encode the input from the last byte to the first.
-func (s *Scratch) compress(src []byte) error {
-	if len(src) <= 2 {
-		return errors.New("compress: src too small")
-	}
-	tt := s.ct.symbolTT[:256]
-	s.bw.reset(s.Out)
-
-	// Our two states each encodes every second byte.
-	// Last byte encoded (first byte decoded) will always be encoded by c1.
-	var c1, c2 cState
-
-	// Encode so remaining size is divisible by 4.
-	ip := len(src)
-	if ip&1 == 1 {
-		c1.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-1]])
-		c2.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-2]])
-		c1.encodeZero(tt[src[ip-3]])
-		ip -= 3
-	} else {
-		c2.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-1]])
-		c1.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-2]])
-		ip -= 2
-	}
-	if ip&2 != 0 {
-		c2.encodeZero(tt[src[ip-1]])
-		c1.encodeZero(tt[src[ip-2]])
-		ip -= 2
-	}
-	src = src[:ip]
-
-	// Main compression loop.
-	switch {
-	case !s.zeroBits && s.actualTableLog <= 8:
-		// We can encode 4 symbols without requiring a flush.
-		// We do not need to check if any output is 0 bits.
-		for ; len(src) >= 4; src = src[:len(src)-4] {
-			s.bw.flush32()
-			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
-			c2.encode(tt[v0])
-			c1.encode(tt[v1])
-			c2.encode(tt[v2])
-			c1.encode(tt[v3])
-		}
-	case !s.zeroBits:
-		// We do not need to check if any output is 0 bits.
-		for ; len(src) >= 4; src = src[:len(src)-4] {
-			s.bw.flush32()
-			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
-			c2.encode(tt[v0])
-			c1.encode(tt[v1])
-			s.bw.flush32()
-			c2.encode(tt[v2])
-			c1.encode(tt[v3])
-		}
-	case s.actualTableLog <= 8:
-		// We can encode 4 symbols without requiring a flush
-		for ; len(src) >= 4; src = src[:len(src)-4] {
-			s.bw.flush32()
-			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
-			c2.encodeZero(tt[v0])
-			c1.encodeZero(tt[v1])
-			c2.encodeZero(tt[v2])
-			c1.encodeZero(tt[v3])
-		}
-	default:
-		for ; len(src) >= 4; src = src[:len(src)-4] {
-			s.bw.flush32()
-			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
-			c2.encodeZero(tt[v0])
-			c1.encodeZero(tt[v1])
-			s.bw.flush32()
-			c2.encodeZero(tt[v2])
-			c1.encodeZero(tt[v3])
-		}
-	}
-
-	// Flush final state.
-	// Used to initialize state when decoding.
-	c2.flush(s.actualTableLog)
-	c1.flush(s.actualTableLog)
-
-	s.bw.close()
-	return nil
-}
-
-// writeCount will write the normalized histogram count to header.
-// This is read back by readNCount.
-func (s *Scratch) writeCount() error {
-	var (
-		tableLog  = s.actualTableLog
-		tableSize = 1 << tableLog
-		previous0 bool
-		charnum   uint16
-
-		maxHeaderSize = ((int(s.symbolLen)*int(tableLog) + 4 + 2) >> 3) + 3
-
-		// Write Table Size
-		bitStream = uint32(tableLog - minTablelog)
-		bitCount  = uint(4)
-		remaining = int16(tableSize + 1) /* +1 for extra accuracy */
-		threshold = int16(tableSize)
-		nbBits    = uint(tableLog + 1)
-	)
-	if cap(s.Out) < maxHeaderSize {
-		s.Out = make([]byte, 0, s.br.remain()+maxHeaderSize)
-	}
-	outP := uint(0)
-	out := s.Out[:maxHeaderSize]
-
-	// stops at 1
-	for remaining > 1 {
-		if previous0 {
-			start := charnum
-			for s.norm[charnum] == 0 {
-				charnum++
-			}
-			for charnum >= start+24 {
-				start += 24
-				bitStream += uint32(0xFFFF) << bitCount
-				out[outP] = byte(bitStream)
-				out[outP+1] = byte(bitStream >> 8)
-				outP += 2
-				bitStream >>= 16
-			}
-			for charnum >= start+3 {
-				start += 3
-				bitStream += 3 << bitCount
-				bitCount += 2
-			}
-			bitStream += uint32(charnum-start) << bitCount
-			bitCount += 2
-			if bitCount > 16 {
-				out[outP] = byte(bitStream)
-				out[outP+1] = byte(bitStream >> 8)
-				outP += 2
-				bitStream >>= 16
-				bitCount -= 16
-			}
-		}
-
-		count := s.norm[charnum]
-		charnum++
-		max := (2*threshold - 1) - remaining
-		if count < 0 {
-			remaining += count
-		} else {
-			remaining -= count
-		}
-		count++ // +1 for extra accuracy
-		if count >= threshold {
-			count += max // [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[
-		}
-		bitStream += uint32(count) << bitCount
-		bitCount += nbBits
-		if count < max {
-			bitCount--
-		}
-
-		previous0 = count == 1
-		if remaining < 1 {
-			return errors.New("internal error: remaining<1")
-		}
-		for remaining < threshold {
-			nbBits--
-			threshold >>= 1
-		}
-
-		if bitCount > 16 {
-			out[outP] = byte(bitStream)
-			out[outP+1] = byte(bitStream >> 8)
-			outP += 2
-			bitStream >>= 16
-			bitCount -= 16
-		}
-	}
-
-	out[outP] = byte(bitStream)
-	out[outP+1] = byte(bitStream >> 8)
-	outP += (bitCount + 7) / 8
-
-	if charnum > s.symbolLen {
-		return errors.New("internal error: charnum > s.symbolLen")
-	}
-	s.Out = out[:outP]
-	return nil
-}
-
-// symbolTransform contains the state transform for a symbol.
-type symbolTransform struct {
-	deltaFindState int32
-	deltaNbBits    uint32
-}
-
-// String prints values as a human readable string.
-func (s symbolTransform) String() string {
-	return fmt.Sprintf("dnbits: %08x, fs:%d", s.deltaNbBits, s.deltaFindState)
-}
-
-// cTable contains tables used for compression.
-type cTable struct {
-	tableSymbol []byte
-	stateTable  []uint16
-	symbolTT    []symbolTransform
-}
-
-// allocCtable will allocate tables needed for compression.
-// If existing tables a re big enough, they are simply re-used.
-func (s *Scratch) allocCtable() {
-	tableSize := 1 << s.actualTableLog
-	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < tableSize {
-		s.ct.tableSymbol = make([]byte, tableSize)
-	}
-	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
-
-	ctSize := tableSize
-	if cap(s.ct.stateTable) < ctSize {
-		s.ct.stateTable = make([]uint16, ctSize)
-	}
-	s.ct.stateTable = s.ct.stateTable[:ctSize]
-
-	if cap(s.ct.symbolTT) < 256 {
-		s.ct.symbolTT = make([]symbolTransform, 256)
-	}
-	s.ct.symbolTT = s.ct.symbolTT[:256]
-}
-
-// buildCTable will populate the compression table so it is ready to be used.
-func (s *Scratch) buildCTable() error {
-	tableSize := uint32(1 << s.actualTableLog)
-	highThreshold := tableSize - 1
-	var cumul [maxSymbolValue + 2]int16
-
-	s.allocCtable()
-	tableSymbol := s.ct.tableSymbol[:tableSize]
-	// symbol start positions
-	{
-		cumul[0] = 0
-		for ui, v := range s.norm[:s.symbolLen-1] {
-			u := byte(ui) // one less than reference
-			if v == -1 {
-				// Low proba symbol
-				cumul[u+1] = cumul[u] + 1
-				tableSymbol[highThreshold] = u
-				highThreshold--
-			} else {
-				cumul[u+1] = cumul[u] + v
-			}
-		}
-		// Encode last symbol separately to avoid overflowing u
-		u := int(s.symbolLen - 1)
-		v := s.norm[s.symbolLen-1]
-		if v == -1 {
-			// Low proba symbol
-			cumul[u+1] = cumul[u] + 1
-			tableSymbol[highThreshold] = byte(u)
-			highThreshold--
-		} else {
-			cumul[u+1] = cumul[u] + v
-		}
-		if uint32(cumul[s.symbolLen]) != tableSize {
-			return fmt.Errorf("internal error: expected cumul[s.symbolLen] (%d) == tableSize (%d)", cumul[s.symbolLen], tableSize)
-		}
-		cumul[s.symbolLen] = int16(tableSize) + 1
-	}
-	// Spread symbols
-	s.zeroBits = false
-	{
-		step := tableStep(tableSize)
-		tableMask := tableSize - 1
-		var position uint32
-		// if any symbol > largeLimit, we may have 0 bits output.
-		largeLimit := int16(1 << (s.actualTableLog - 1))
-		for ui, v := range s.norm[:s.symbolLen] {
-			symbol := byte(ui)
-			if v > largeLimit {
-				s.zeroBits = true
-			}
-			for nbOccurrences := int16(0); nbOccurrences < v; nbOccurrences++ {
-				tableSymbol[position] = symbol
-				position = (position + step) & tableMask
-				for position > highThreshold {
-					position = (position + step) & tableMask
-				} /* Low proba area */
-			}
-		}
-
-		// Check if we have gone through all positions
-		if position != 0 {
-			return errors.New("position!=0")
-		}
-	}
-
-	// Build table
-	table := s.ct.stateTable
-	{
-		tsi := int(tableSize)
-		for u, v := range tableSymbol {
-			// TableU16 : sorted by symbol order; gives next state value
-			table[cumul[v]] = uint16(tsi + u)
-			cumul[v]++
-		}
-	}
-
-	// Build Symbol Transformation Table
-	{
-		total := int16(0)
-		symbolTT := s.ct.symbolTT[:s.symbolLen]
-		tableLog := s.actualTableLog
-		tl := (uint32(tableLog) << 16) - (1 << tableLog)
-		for i, v := range s.norm[:s.symbolLen] {
-			switch v {
-			case 0:
-			case -1, 1:
-				symbolTT[i].deltaNbBits = tl
-				symbolTT[i].deltaFindState = int32(total - 1)
-				total++
-			default:
-				maxBitsOut := uint32(tableLog) - highBits(uint32(v-1))
-				minStatePlus := uint32(v) << maxBitsOut
-				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
-				symbolTT[i].deltaFindState = int32(total - v)
-				total += v
-			}
-		}
-		if total != int16(tableSize) {
-			return fmt.Errorf("total mismatch %d (got) != %d (want)", total, tableSize)
-		}
-	}
-	return nil
-}
-
-// countSimple will create a simple histogram in s.count.
-// Returns the biggest count.
-// Does not update s.clearCount.
-func (s *Scratch) countSimple(in []byte) (max int) {
-	for _, v := range in {
-		s.count[v]++
-	}
-	m, symlen := uint32(0), s.symbolLen
-	for i, v := range s.count[:] {
-		if v == 0 {
-			continue
-		}
-		if v > m {
-			m = v
-		}
-		symlen = uint16(i) + 1
-	}
-	s.symbolLen = symlen
-	return int(m)
-}
-
-// minTableLog provides the minimum logSize to safely represent a distribution.
-func (s *Scratch) minTableLog() uint8 {
-	minBitsSrc := highBits(uint32(s.br.remain()-1)) + 1
-	minBitsSymbols := highBits(uint32(s.symbolLen-1)) + 2
-	if minBitsSrc < minBitsSymbols {
-		return uint8(minBitsSrc)
-	}
-	return uint8(minBitsSymbols)
-}
-
-// optimalTableLog calculates and sets the optimal tableLog in s.actualTableLog
-func (s *Scratch) optimalTableLog() {
-	tableLog := s.TableLog
-	minBits := s.minTableLog()
-	maxBitsSrc := uint8(highBits(uint32(s.br.remain()-1))) - 2
-	if maxBitsSrc < tableLog {
-		// Accuracy can be reduced
-		tableLog = maxBitsSrc
-	}
-	if minBits > tableLog {
-		tableLog = minBits
-	}
-	// Need a minimum to safely represent all symbol values
-	if tableLog < minTablelog {
-		tableLog = minTablelog
-	}
-	if tableLog > maxTableLog {
-		tableLog = maxTableLog
-	}
-	s.actualTableLog = tableLog
-}
-
-var rtbTable = [...]uint32{0, 473195, 504333, 520860, 550000, 700000, 750000, 830000}
-
-// normalizeCount will normalize the count of the symbols so
-// the total is equal to the table size.
-func (s *Scratch) normalizeCount() error {
-	var (
-		tableLog          = s.actualTableLog
-		scale             = 62 - uint64(tableLog)
-		step              = (1 << 62) / uint64(s.br.remain())
-		vStep             = uint64(1) << (scale - 20)
-		stillToDistribute = int16(1 << tableLog)
-		largest           int
-		largestP          int16
-		lowThreshold      = (uint32)(s.br.remain() >> tableLog)
-	)
-
-	for i, cnt := range s.count[:s.symbolLen] {
-		// already handled
-		// if (count[s] == s.length) return 0;   /* rle special case */
-
-		if cnt == 0 {
-			s.norm[i] = 0
-			continue
-		}
-		if cnt <= lowThreshold {
-			s.norm[i] = -1
-			stillToDistribute--
-		} else {
-			proba := (int16)((uint64(cnt) * step) >> scale)
-			if proba < 8 {
-				restToBeat := vStep * uint64(rtbTable[proba])
-				v := uint64(cnt)*step - (uint64(proba) << scale)
-				if v > restToBeat {
-					proba++
-				}
-			}
-			if proba > largestP {
-				largestP = proba
-				largest = i
-			}
-			s.norm[i] = proba
-			stillToDistribute -= proba
-		}
-	}
-
-	if -stillToDistribute >= (s.norm[largest] >> 1) {
-		// corner case, need another normalization method
-		return s.normalizeCount2()
-	}
-	s.norm[largest] += stillToDistribute
-	return nil
-}
-
-// Secondary normalization method.
-// To be used when primary method fails.
-func (s *Scratch) normalizeCount2() error {
-	const notYetAssigned = -2
-	var (
-		distributed  uint32
-		total        = uint32(s.br.remain())
-		tableLog     = s.actualTableLog
-		lowThreshold = total >> tableLog
-		lowOne       = (total * 3) >> (tableLog + 1)
-	)
-	for i, cnt := range s.count[:s.symbolLen] {
-		if cnt == 0 {
-			s.norm[i] = 0
-			continue
-		}
-		if cnt <= lowThreshold {
-			s.norm[i] = -1
-			distributed++
-			total -= cnt
-			continue
-		}
-		if cnt <= lowOne {
-			s.norm[i] = 1
-			distributed++
-			total -= cnt
-			continue
-		}
-		s.norm[i] = notYetAssigned
-	}
-	toDistribute := (1 << tableLog) - distributed
-
-	if (total / toDistribute) > lowOne {
-		// risk of rounding to zero
-		lowOne = (total * 3) / (toDistribute * 2)
-		for i, cnt := range s.count[:s.symbolLen] {
-			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
-				s.norm[i] = 1
-				distributed++
-				total -= cnt
-				continue
-			}
-		}
-		toDistribute = (1 << tableLog) - distributed
-	}
-	if distributed == uint32(s.symbolLen)+1 {
-		// all values are pretty poor;
-		//   probably incompressible data (should have already been detected);
-		//   find max, then give all remaining points to max
-		var maxV int
-		var maxC uint32
-		for i, cnt := range s.count[:s.symbolLen] {
-			if cnt > maxC {
-				maxV = i
-				maxC = cnt
-			}
-		}
-		s.norm[maxV] += int16(toDistribute)
-		return nil
-	}
-
-	if total == 0 {
-		// all of the symbols were low enough for the lowOne or lowThreshold
-		for i := uint32(0); toDistribute > 0; i = (i + 1) % (uint32(s.symbolLen)) {
-			if s.norm[i] > 0 {
-				toDistribute--
-				s.norm[i]++
-			}
-		}
-		return nil
-	}
-
-	var (
-		vStepLog = 62 - uint64(tableLog)
-		mid      = uint64((1 << (vStepLog - 1)) - 1)
-		rStep    = (((1 << vStepLog) * uint64(toDistribute)) + mid) / uint64(total) // scale on remaining
-		tmpTotal = mid
-	)
-	for i, cnt := range s.count[:s.symbolLen] {
-		if s.norm[i] == notYetAssigned {
-			var (
-				end    = tmpTotal + uint64(cnt)*rStep
-				sStart = uint32(tmpTotal >> vStepLog)
-				sEnd   = uint32(end >> vStepLog)
-				weight = sEnd - sStart
-			)
-			if weight < 1 {
-				return errors.New("weight < 1")
-			}
-			s.norm[i] = int16(weight)
-			tmpTotal = end
-		}
-	}
-	return nil
-}
-
-// validateNorm validates the normalized histogram table.
-func (s *Scratch) validateNorm() (err error) {
-	var total int
-	for _, v := range s.norm[:s.symbolLen] {
-		if v >= 0 {
-			total += int(v)
-		} else {
-			total -= int(v)
-		}
-	}
-	defer func() {
-		if err == nil {
-			return
-		}
-		fmt.Printf("selected TableLog: %d, Symbol length: %d\n", s.actualTableLog, s.symbolLen)
-		for i, v := range s.norm[:s.symbolLen] {
-			fmt.Printf("%3d: %5d -> %4d \n", i, s.count[i], v)
-		}
-	}()
-	if total != (1 << s.actualTableLog) {
-		return fmt.Errorf("warning: Total == %d != %d", total, 1<<s.actualTableLog)
-	}
-	for i, v := range s.count[s.symbolLen:] {
-		if v != 0 {
-			return fmt.Errorf("warning: Found symbol out of range, %d after cut", i)
-		}
-	}
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
deleted file mode 100644
index 0c7dd4ffe..000000000
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ /dev/null
@@ -1,376 +0,0 @@
-package fse
-
-import (
-	"errors"
-	"fmt"
-)
-
-const (
-	tablelogAbsoluteMax = 15
-)
-
-// Decompress a block of data.
-// You can provide a scratch buffer to avoid allocations.
-// If nil is provided a temporary one will be allocated.
-// It is possible, but by no way guaranteed that corrupt data will
-// return an error.
-// It is up to the caller to verify integrity of the returned data.
-// Use a predefined Scratch to set maximum acceptable output size.
-func Decompress(b []byte, s *Scratch) ([]byte, error) {
-	s, err := s.prepare(b)
-	if err != nil {
-		return nil, err
-	}
-	s.Out = s.Out[:0]
-	err = s.readNCount()
-	if err != nil {
-		return nil, err
-	}
-	err = s.buildDtable()
-	if err != nil {
-		return nil, err
-	}
-	err = s.decompress()
-	if err != nil {
-		return nil, err
-	}
-
-	return s.Out, nil
-}
-
-// readNCount will read the symbol distribution so decoding tables can be constructed.
-func (s *Scratch) readNCount() error {
-	var (
-		charnum   uint16
-		previous0 bool
-		b         = &s.br
-	)
-	iend := b.remain()
-	if iend < 4 {
-		return errors.New("input too small")
-	}
-	bitStream := b.Uint32()
-	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
-	if nbBits > tablelogAbsoluteMax {
-		return errors.New("tableLog too large")
-	}
-	bitStream >>= 4
-	bitCount := uint(4)
-
-	s.actualTableLog = uint8(nbBits)
-	remaining := int32((1 << nbBits) + 1)
-	threshold := int32(1 << nbBits)
-	gotTotal := int32(0)
-	nbBits++
-
-	for remaining > 1 {
-		if previous0 {
-			n0 := charnum
-			for (bitStream & 0xFFFF) == 0xFFFF {
-				n0 += 24
-				if b.off < iend-5 {
-					b.advance(2)
-					bitStream = b.Uint32() >> bitCount
-				} else {
-					bitStream >>= 16
-					bitCount += 16
-				}
-			}
-			for (bitStream & 3) == 3 {
-				n0 += 3
-				bitStream >>= 2
-				bitCount += 2
-			}
-			n0 += uint16(bitStream & 3)
-			bitCount += 2
-			if n0 > maxSymbolValue {
-				return errors.New("maxSymbolValue too small")
-			}
-			for charnum < n0 {
-				s.norm[charnum&0xff] = 0
-				charnum++
-			}
-
-			if b.off <= iend-7 || b.off+int(bitCount>>3) <= iend-4 {
-				b.advance(bitCount >> 3)
-				bitCount &= 7
-				bitStream = b.Uint32() >> bitCount
-			} else {
-				bitStream >>= 2
-			}
-		}
-
-		max := (2*(threshold) - 1) - (remaining)
-		var count int32
-
-		if (int32(bitStream) & (threshold - 1)) < max {
-			count = int32(bitStream) & (threshold - 1)
-			bitCount += nbBits - 1
-		} else {
-			count = int32(bitStream) & (2*threshold - 1)
-			if count >= threshold {
-				count -= max
-			}
-			bitCount += nbBits
-		}
-
-		count-- // extra accuracy
-		if count < 0 {
-			// -1 means +1
-			remaining += count
-			gotTotal -= count
-		} else {
-			remaining -= count
-			gotTotal += count
-		}
-		s.norm[charnum&0xff] = int16(count)
-		charnum++
-		previous0 = count == 0
-		for remaining < threshold {
-			nbBits--
-			threshold >>= 1
-		}
-		if b.off <= iend-7 || b.off+int(bitCount>>3) <= iend-4 {
-			b.advance(bitCount >> 3)
-			bitCount &= 7
-		} else {
-			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
-			b.off = len(b.b) - 4
-		}
-		bitStream = b.Uint32() >> (bitCount & 31)
-	}
-	s.symbolLen = charnum
-
-	if s.symbolLen <= 1 {
-		return fmt.Errorf("symbolLen (%d) too small", s.symbolLen)
-	}
-	if s.symbolLen > maxSymbolValue+1 {
-		return fmt.Errorf("symbolLen (%d) too big", s.symbolLen)
-	}
-	if remaining != 1 {
-		return fmt.Errorf("corruption detected (remaining %d != 1)", remaining)
-	}
-	if bitCount > 32 {
-		return fmt.Errorf("corruption detected (bitCount %d > 32)", bitCount)
-	}
-	if gotTotal != 1<<s.actualTableLog {
-		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
-	}
-	b.advance((bitCount + 7) >> 3)
-	return nil
-}
-
-// decSymbol contains information about a state entry,
-// Including the state offset base, the output symbol and
-// the number of bits to read for the low part of the destination state.
-type decSymbol struct {
-	newState uint16
-	symbol   uint8
-	nbBits   uint8
-}
-
-// allocDtable will allocate decoding tables if they are not big enough.
-func (s *Scratch) allocDtable() {
-	tableSize := 1 << s.actualTableLog
-	if cap(s.decTable) < tableSize {
-		s.decTable = make([]decSymbol, tableSize)
-	}
-	s.decTable = s.decTable[:tableSize]
-
-	if cap(s.ct.tableSymbol) < 256 {
-		s.ct.tableSymbol = make([]byte, 256)
-	}
-	s.ct.tableSymbol = s.ct.tableSymbol[:256]
-
-	if cap(s.ct.stateTable) < 256 {
-		s.ct.stateTable = make([]uint16, 256)
-	}
-	s.ct.stateTable = s.ct.stateTable[:256]
-}
-
-// buildDtable will build the decoding table.
-func (s *Scratch) buildDtable() error {
-	tableSize := uint32(1 << s.actualTableLog)
-	highThreshold := tableSize - 1
-	s.allocDtable()
-	symbolNext := s.ct.stateTable[:256]
-
-	// Init, lay down lowprob symbols
-	s.zeroBits = false
-	{
-		largeLimit := int16(1 << (s.actualTableLog - 1))
-		for i, v := range s.norm[:s.symbolLen] {
-			if v == -1 {
-				s.decTable[highThreshold].symbol = uint8(i)
-				highThreshold--
-				symbolNext[i] = 1
-			} else {
-				if v >= largeLimit {
-					s.zeroBits = true
-				}
-				symbolNext[i] = uint16(v)
-			}
-		}
-	}
-	// Spread symbols
-	{
-		tableMask := tableSize - 1
-		step := tableStep(tableSize)
-		position := uint32(0)
-		for ss, v := range s.norm[:s.symbolLen] {
-			for i := 0; i < int(v); i++ {
-				s.decTable[position].symbol = uint8(ss)
-				position = (position + step) & tableMask
-				for position > highThreshold {
-					// lowprob area
-					position = (position + step) & tableMask
-				}
-			}
-		}
-		if position != 0 {
-			// position must reach all cells once, otherwise normalizedCounter is incorrect
-			return errors.New("corrupted input (position != 0)")
-		}
-	}
-
-	// Build Decoding table
-	{
-		tableSize := uint16(1 << s.actualTableLog)
-		for u, v := range s.decTable {
-			symbol := v.symbol
-			nextState := symbolNext[symbol]
-			symbolNext[symbol] = nextState + 1
-			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
-			s.decTable[u].nbBits = nBits
-			newState := (nextState << nBits) - tableSize
-			if newState >= tableSize {
-				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
-			}
-			if newState == uint16(u) && nBits == 0 {
-				// Seems weird that this is possible with nbits > 0.
-				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
-			}
-			s.decTable[u].newState = newState
-		}
-	}
-	return nil
-}
-
-// decompress will decompress the bitstream.
-// If the buffer is over-read an error is returned.
-func (s *Scratch) decompress() error {
-	br := &s.bits
-	if err := br.init(s.br.unread()); err != nil {
-		return err
-	}
-
-	var s1, s2 decoder
-	// Initialize and decode first state and symbol.
-	s1.init(br, s.decTable, s.actualTableLog)
-	s2.init(br, s.decTable, s.actualTableLog)
-
-	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.ct.tableSymbol[:256]
-	var off uint8
-
-	// Main part
-	if !s.zeroBits {
-		for br.off >= 8 {
-			br.fillFast()
-			tmp[off+0] = s1.nextFast()
-			tmp[off+1] = s2.nextFast()
-			br.fillFast()
-			tmp[off+2] = s1.nextFast()
-			tmp[off+3] = s2.nextFast()
-			off += 4
-			// When off is 0, we have overflowed and should write.
-			if off == 0 {
-				s.Out = append(s.Out, tmp...)
-				if len(s.Out) >= s.DecompressLimit {
-					return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
-				}
-			}
-		}
-	} else {
-		for br.off >= 8 {
-			br.fillFast()
-			tmp[off+0] = s1.next()
-			tmp[off+1] = s2.next()
-			br.fillFast()
-			tmp[off+2] = s1.next()
-			tmp[off+3] = s2.next()
-			off += 4
-			if off == 0 {
-				s.Out = append(s.Out, tmp...)
-				// When off is 0, we have overflowed and should write.
-				if len(s.Out) >= s.DecompressLimit {
-					return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
-				}
-			}
-		}
-	}
-	s.Out = append(s.Out, tmp[:off]...)
-
-	// Final bits, a bit more expensive check
-	for {
-		if s1.finished() {
-			s.Out = append(s.Out, s1.final(), s2.final())
-			break
-		}
-		br.fill()
-		s.Out = append(s.Out, s1.next())
-		if s2.finished() {
-			s.Out = append(s.Out, s2.final(), s1.final())
-			break
-		}
-		s.Out = append(s.Out, s2.next())
-		if len(s.Out) >= s.DecompressLimit {
-			return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
-		}
-	}
-	return br.close()
-}
-
-// decoder keeps track of the current state and updates it from the bitstream.
-type decoder struct {
-	state uint16
-	br    *bitReader
-	dt    []decSymbol
-}
-
-// init will initialize the decoder and read the first state from the stream.
-func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) {
-	d.dt = dt
-	d.br = in
-	d.state = in.getBits(tableLog)
-}
-
-// next returns the next symbol and sets the next state.
-// At least tablelog bits must be available in the bit reader.
-func (d *decoder) next() uint8 {
-	n := &d.dt[d.state]
-	lowBits := d.br.getBits(n.nbBits)
-	d.state = n.newState + lowBits
-	return n.symbol
-}
-
-// finished returns true if all bits have been read from the bitstream
-// and the next state would require reading bits from the input.
-func (d *decoder) finished() bool {
-	return d.br.finished() && d.dt[d.state].nbBits > 0
-}
-
-// final returns the current state symbol without decoding the next.
-func (d *decoder) final() uint8 {
-	return d.dt[d.state].symbol
-}
-
-// nextFast returns the next symbol and sets the next state.
-// This can only be used if no symbols are 0 bits.
-// At least tablelog bits must be available in the bit reader.
-func (d *decoder) nextFast() uint8 {
-	n := d.dt[d.state]
-	lowBits := d.br.getBitsFast(n.nbBits)
-	d.state = n.newState + lowBits
-	return n.symbol
-}
diff --git a/vendor/github.com/klauspost/compress/fse/fse.go b/vendor/github.com/klauspost/compress/fse/fse.go
deleted file mode 100644
index 535cbadfd..000000000
--- a/vendor/github.com/klauspost/compress/fse/fse.go
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-// Package fse provides Finite State Entropy encoding and decoding.
-//
-// Finite State Entropy encoding provides a fast near-optimal symbol encoding/decoding
-// for byte blocks as implemented in zstd.
-//
-// See https://github.com/klauspost/compress/tree/master/fse for more information.
-package fse
-
-import (
-	"errors"
-	"fmt"
-	"math/bits"
-)
-
-const (
-	/*!MEMORY_USAGE :
-	 *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
-	 *  Increasing memory usage improves compression ratio
-	 *  Reduced memory usage can improve speed, due to cache effect
-	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-	maxMemoryUsage     = 14
-	defaultMemoryUsage = 13
-
-	maxTableLog     = maxMemoryUsage - 2
-	maxTablesize    = 1 << maxTableLog
-	defaultTablelog = defaultMemoryUsage - 2
-	minTablelog     = 5
-	maxSymbolValue  = 255
-)
-
-var (
-	// ErrIncompressible is returned when input is judged to be too hard to compress.
-	ErrIncompressible = errors.New("input is not compressible")
-
-	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
-	ErrUseRLE = errors.New("input is single value repeated")
-)
-
-// Scratch provides temporary storage for compression and decompression.
-type Scratch struct {
-	// Private
-	count    [maxSymbolValue + 1]uint32
-	norm     [maxSymbolValue + 1]int16
-	br       byteReader
-	bits     bitReader
-	bw       bitWriter
-	ct       cTable      // Compression tables.
-	decTable []decSymbol // Decompression table.
-	maxCount int         // count of the most probable symbol
-
-	// Per block parameters.
-	// These can be used to override compression parameters of the block.
-	// Do not touch, unless you know what you are doing.
-
-	// Out is output buffer.
-	// If the scratch is re-used before the caller is done processing the output,
-	// set this field to nil.
-	// Otherwise the output buffer will be re-used for next Compression/Decompression step
-	// and allocation will be avoided.
-	Out []byte
-
-	// DecompressLimit limits the maximum decoded size acceptable.
-	// If > 0 decompression will stop when approximately this many bytes
-	// has been decoded.
-	// If 0, maximum size will be 2GB.
-	DecompressLimit int
-
-	symbolLen      uint16 // Length of active part of the symbol table.
-	actualTableLog uint8  // Selected tablelog.
-	zeroBits       bool   // no bits has prob > 50%.
-	clearCount     bool   // clear count
-
-	// MaxSymbolValue will override the maximum symbol value of the next block.
-	MaxSymbolValue uint8
-
-	// TableLog will attempt to override the tablelog for the next block.
-	TableLog uint8
-}
-
-// Histogram allows to populate the histogram and skip that step in the compression,
-// It otherwise allows to inspect the histogram when compression is done.
-// To indicate that you have populated the histogram call HistogramFinished
-// with the value of the highest populated symbol, as well as the number of entries
-// in the most populated entry. These are accepted at face value.
-// The returned slice will always be length 256.
-func (s *Scratch) Histogram() []uint32 {
-	return s.count[:]
-}
-
-// HistogramFinished can be called to indicate that the histogram has been populated.
-// maxSymbol is the index of the highest set symbol of the next data segment.
-// maxCount is the number of entries in the most populated entry.
-// These are accepted at face value.
-func (s *Scratch) HistogramFinished(maxSymbol uint8, maxCount int) {
-	s.maxCount = maxCount
-	s.symbolLen = uint16(maxSymbol) + 1
-	s.clearCount = maxCount != 0
-}
-
-// prepare will prepare and allocate scratch tables used for both compression and decompression.
-func (s *Scratch) prepare(in []byte) (*Scratch, error) {
-	if s == nil {
-		s = &Scratch{}
-	}
-	if s.MaxSymbolValue == 0 {
-		s.MaxSymbolValue = 255
-	}
-	if s.TableLog == 0 {
-		s.TableLog = defaultTablelog
-	}
-	if s.TableLog > maxTableLog {
-		return nil, fmt.Errorf("tableLog (%d) > maxTableLog (%d)", s.TableLog, maxTableLog)
-	}
-	if cap(s.Out) == 0 {
-		s.Out = make([]byte, 0, len(in))
-	}
-	if s.clearCount && s.maxCount == 0 {
-		for i := range s.count {
-			s.count[i] = 0
-		}
-		s.clearCount = false
-	}
-	s.br.init(in)
-	if s.DecompressLimit == 0 {
-		// Max size 2GB.
-		s.DecompressLimit = (2 << 30) - 1
-	}
-
-	return s, nil
-}
-
-// tableStep returns the next table index.
-func tableStep(tableSize uint32) uint32 {
-	return (tableSize >> 1) + (tableSize >> 3) + 3
-}
-
-func highBits(val uint32) (n uint32) {
-	return uint32(bits.Len32(val) - 1)
-}
diff --git a/vendor/github.com/klauspost/compress/gen.sh b/vendor/github.com/klauspost/compress/gen.sh
deleted file mode 100644
index aff942205..000000000
--- a/vendor/github.com/klauspost/compress/gen.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-cd s2/cmd/_s2sx/ || exit 1
-go generate .
diff --git a/vendor/github.com/klauspost/compress/huff0/.gitignore b/vendor/github.com/klauspost/compress/huff0/.gitignore
deleted file mode 100644
index b3d262958..000000000
--- a/vendor/github.com/klauspost/compress/huff0/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/huff0-fuzz.zip
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
deleted file mode 100644
index 8b6e5c663..000000000
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Huff0 entropy compression
-
-This package provides Huff0 encoding and decoding as used in zstd.
-            
-[Huff0](https://github.com/Cyan4973/FiniteStateEntropy#new-generation-entropy-coders), 
-a Huffman codec designed for modern CPU, featuring OoO (Out of Order) operations on multiple ALU 
-(Arithmetic Logic Unit), achieving extremely fast compression and decompression speeds.
-
-This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
-This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
-but it can be used as a secondary step to compressors (like Snappy) that does not do entropy encoding. 
-
-* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
-
-## News
-
-This is used as part of the [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression package.
-
-This ensures that most functionality is well tested.
-
-# Usage
-
-This package provides a low level interface that allows to compress single independent blocks. 
-
-Each block is separate, and there is no built in integrity checks. 
-This means that the caller should keep track of block sizes and also do checksums if needed.  
-
-Compressing a block is done via the [`Compress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress1X) and 
-[`Compress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress4X) functions.
-You must provide input and will receive the output and maybe an error.
-
-These error values can be returned:
-
-| Error               | Description                                                                 |
-|---------------------|-----------------------------------------------------------------------------|
-| `<nil>`             | Everything ok, output is returned                                           |
-| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
-| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
-| `ErrTooBig`         | Returned if the input block exceeds the maximum allowed size (128 Kib)      |
-| `(error)`           | An internal error occurred.                                                 |
-
-
-As can be seen above some of there are errors that will be returned even under normal operation so it is important to handle these.
-
-To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object 
-that can be re-used for successive calls. Both compression and decompression accepts a `Scratch` object, and the same 
-object can be used for both.   
-
-Be aware, that when re-using a `Scratch` object that the *output* buffer is also re-used, so if you are still using this
-you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
-
-The `Scratch` object will retain state that allows to re-use previous tables for encoding and decoding.  
-
-## Tables and re-use
-
-Huff0 allows for reusing tables from the previous block to save space if that is expected to give better/faster results. 
-
-The Scratch object allows you to set a [`ReusePolicy`](https://godoc.org/github.com/klauspost/compress/huff0#ReusePolicy) 
-that controls this behaviour. See the documentation for details. This can be altered between each block.
-
-Do however note that this information is *not* stored in the output block and it is up to the users of the package to
-record whether [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable) should be called,
-based on the boolean reported back from the CompressXX call. 
-
-If you want to store the table separate from the data, you can access them as `OutData` and `OutTable` on the 
-[`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object.
-
-## Decompressing
-
-The first part of decoding is to initialize the decoding table through [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable).
-This will initialize the decoding tables. 
-You can supply the complete block to `ReadTable` and it will return the data part of the block 
-which can be given to the decompressor. 
-
-Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
-or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
-
-For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
-
-You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
-your input was likely corrupted. 
-
-It is important to note that a successful decoding does *not* mean your output matches your original input. 
-There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
-
-# Contributing
-
-Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
deleted file mode 100644
index bfc7a523d..000000000
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ /dev/null
@@ -1,224 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package huff0
-
-import (
-	"errors"
-	"fmt"
-	"io"
-
-	"github.com/klauspost/compress/internal/le"
-)
-
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReaderBytes struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReaderBytes) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.advance(8 - uint8(highBit32(uint32(v))))
-	return nil
-}
-
-// peekByteFast requires that at least one byte is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReaderBytes) peekByteFast() uint8 {
-	got := uint8(b.value >> 56)
-	return got
-}
-
-func (b *bitReaderBytes) advance(n uint8) {
-	b.bitsRead += n
-	b.value <<= n & 63
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReaderBytes) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	low := le.Load32(b.in, b.off-4)
-	b.value |= uint64(low) << (b.bitsRead - 32)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-// fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read.
-func (b *bitReaderBytes) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = le.Load64(b.in, b.off-8)
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReaderBytes) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off >= 4 {
-		low := le.Load32(b.in, b.off-4)
-		b.value |= uint64(low) << (b.bitsRead - 32)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderBytes) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
-func (b *bitReaderBytes) remaining() uint {
-	return b.off*8 + uint(64-b.bitsRead)
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReaderBytes) close() error {
-	// Release reference.
-	b.in = nil
-	if b.remaining() > 0 {
-		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
-	}
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
-// bitReaderShifted reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReaderShifted struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReaderShifted) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.advance(8 - uint8(highBit32(uint32(v))))
-	return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
-	return uint16(b.value >> ((64 - n) & 63))
-}
-
-func (b *bitReaderShifted) advance(n uint8) {
-	b.bitsRead += n
-	b.value <<= n & 63
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReaderShifted) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	low := le.Load32(b.in, b.off-4)
-	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-// fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read.
-func (b *bitReaderShifted) fillFastStart() {
-	b.value = le.Load64(b.in, b.off-8)
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReaderShifted) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		low := le.Load32(b.in, b.off-4)
-		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-func (b *bitReaderShifted) remaining() uint {
-	return b.off*8 + uint(64-b.bitsRead)
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReaderShifted) close() error {
-	// Release reference.
-	b.in = nil
-	if b.remaining() > 0 {
-		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
-	}
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
deleted file mode 100644
index 0ebc9aaac..000000000
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package huff0
-
-// bitWriter will write bits.
-// First bit will be LSB of the first byte of output.
-type bitWriter struct {
-	bitContainer uint64
-	nBits        uint8
-	out          []byte
-}
-
-// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// encSymbol will add up to 16 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
-	enc := ct[symbol]
-	b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
-	if false {
-		if enc.nBits == 0 {
-			panic("nbits 0")
-		}
-	}
-	b.nBits += enc.nBits
-}
-
-// encTwoSymbols will add up to 32 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
-	encA := ct[av]
-	encB := ct[bv]
-	sh := b.nBits & 63
-	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
-	b.bitContainer |= combined << sh
-	if false {
-		if encA.nBits == 0 {
-			panic("nbitsA 0")
-		}
-		if encB.nBits == 0 {
-			panic("nbitsB 0")
-		}
-	}
-	b.nBits += encA.nBits + encB.nBits
-}
-
-// encFourSymbols adds up to 32 bits from four symbols.
-// It will not check if there is space for them,
-// so the caller must ensure that b has been flushed recently.
-func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
-	bitsA := encA.nBits
-	bitsB := bitsA + encB.nBits
-	bitsC := bitsB + encC.nBits
-	bitsD := bitsC + encD.nBits
-	combined := uint64(encA.val) |
-		(uint64(encB.val) << (bitsA & 63)) |
-		(uint64(encC.val) << (bitsB & 63)) |
-		(uint64(encD.val) << (bitsC & 63))
-	b.bitContainer |= combined << (b.nBits & 63)
-	b.nBits += bitsD
-}
-
-// flush32 will flush out, so there are at least 32 bits available for writing.
-func (b *bitWriter) flush32() {
-	if b.nBits < 32 {
-		return
-	}
-	b.out = append(b.out,
-		byte(b.bitContainer),
-		byte(b.bitContainer>>8),
-		byte(b.bitContainer>>16),
-		byte(b.bitContainer>>24))
-	b.nBits -= 32
-	b.bitContainer >>= 32
-}
-
-// flushAlign will flush remaining full bytes and align to next byte boundary.
-func (b *bitWriter) flushAlign() {
-	nbBytes := (b.nBits + 7) >> 3
-	for i := uint8(0); i < nbBytes; i++ {
-		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
-	}
-	b.nBits = 0
-	b.bitContainer = 0
-}
-
-// close will write the alignment bit and write the final byte(s)
-// to the output.
-func (b *bitWriter) close() {
-	// End mark
-	b.addBits16Clean(1, 1)
-	// flush until next byte.
-	b.flushAlign()
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
deleted file mode 100644
index 84aa3d12f..000000000
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ /dev/null
@@ -1,742 +0,0 @@
-package huff0
-
-import (
-	"fmt"
-	"math"
-	"runtime"
-	"sync"
-)
-
-// Compress1X will compress the input.
-// The output can be decoded using Decompress1X.
-// Supply a Scratch object. The scratch object contains state about re-use,
-// So when sharing across independent encodes, be sure to set the re-use policy.
-func Compress1X(in []byte, s *Scratch) (out []byte, reUsed bool, err error) {
-	s, err = s.prepare(in)
-	if err != nil {
-		return nil, false, err
-	}
-	return compress(in, s, s.compress1X)
-}
-
-// Compress4X will compress the input. The input is split into 4 independent blocks
-// and compressed similar to Compress1X.
-// The output can be decoded using Decompress4X.
-// Supply a Scratch object. The scratch object contains state about re-use,
-// So when sharing across independent encodes, be sure to set the re-use policy.
-func Compress4X(in []byte, s *Scratch) (out []byte, reUsed bool, err error) {
-	s, err = s.prepare(in)
-	if err != nil {
-		return nil, false, err
-	}
-	if false {
-		// TODO: compress4Xp only slightly faster.
-		const parallelThreshold = 8 << 10
-		if len(in) < parallelThreshold || runtime.GOMAXPROCS(0) == 1 {
-			return compress(in, s, s.compress4X)
-		}
-		return compress(in, s, s.compress4Xp)
-	}
-	return compress(in, s, s.compress4X)
-}
-
-func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)) (out []byte, reUsed bool, err error) {
-	// Nuke previous table if we cannot reuse anyway.
-	if s.Reuse == ReusePolicyNone {
-		s.prevTable = s.prevTable[:0]
-	}
-
-	// Create histogram, if none was provided.
-	maxCount := s.maxCount
-	var canReuse = false
-	if maxCount == 0 {
-		maxCount, canReuse = s.countSimple(in)
-	} else {
-		canReuse = s.canUseTable(s.prevTable)
-	}
-
-	// We want the output size to be less than this:
-	wantSize := len(in)
-	if s.WantLogLess > 0 {
-		wantSize -= wantSize >> s.WantLogLess
-	}
-
-	// Reset for next run.
-	s.clearCount = true
-	s.maxCount = 0
-	if maxCount >= len(in) {
-		if maxCount > len(in) {
-			return nil, false, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
-		}
-		if len(in) == 1 {
-			return nil, false, ErrIncompressible
-		}
-		// One symbol, use RLE
-		return nil, false, ErrUseRLE
-	}
-	if maxCount == 1 || maxCount < (len(in)>>7) {
-		// Each symbol present maximum once or too well distributed.
-		return nil, false, ErrIncompressible
-	}
-	if s.Reuse == ReusePolicyMust && !canReuse {
-		// We must reuse, but we can't.
-		return nil, false, ErrIncompressible
-	}
-	if (s.Reuse == ReusePolicyPrefer || s.Reuse == ReusePolicyMust) && canReuse {
-		keepTable := s.cTable
-		keepTL := s.actualTableLog
-		s.cTable = s.prevTable
-		s.actualTableLog = s.prevTableLog
-		s.Out, err = compressor(in)
-		s.cTable = keepTable
-		s.actualTableLog = keepTL
-		if err == nil && len(s.Out) < wantSize {
-			s.OutData = s.Out
-			return s.Out, true, nil
-		}
-		if s.Reuse == ReusePolicyMust {
-			return nil, false, ErrIncompressible
-		}
-		// Do not attempt to re-use later.
-		s.prevTable = s.prevTable[:0]
-	}
-
-	// Calculate new table.
-	err = s.buildCTable()
-	if err != nil {
-		return nil, false, err
-	}
-
-	if false && !s.canUseTable(s.cTable) {
-		panic("invalid table generated")
-	}
-
-	if s.Reuse == ReusePolicyAllow && canReuse {
-		hSize := len(s.Out)
-		oldSize := s.prevTable.estimateSize(s.count[:s.symbolLen])
-		newSize := s.cTable.estimateSize(s.count[:s.symbolLen])
-		if oldSize <= hSize+newSize || hSize+12 >= wantSize {
-			// Retain cTable even if we re-use.
-			keepTable := s.cTable
-			keepTL := s.actualTableLog
-
-			s.cTable = s.prevTable
-			s.actualTableLog = s.prevTableLog
-			s.Out, err = compressor(in)
-
-			// Restore ctable.
-			s.cTable = keepTable
-			s.actualTableLog = keepTL
-			if err != nil {
-				return nil, false, err
-			}
-			if len(s.Out) >= wantSize {
-				return nil, false, ErrIncompressible
-			}
-			s.OutData = s.Out
-			return s.Out, true, nil
-		}
-	}
-
-	// Use new table
-	err = s.cTable.write(s)
-	if err != nil {
-		s.OutTable = nil
-		return nil, false, err
-	}
-	s.OutTable = s.Out
-
-	// Compress using new table
-	s.Out, err = compressor(in)
-	if err != nil {
-		s.OutTable = nil
-		return nil, false, err
-	}
-	if len(s.Out) >= wantSize {
-		s.OutTable = nil
-		return nil, false, ErrIncompressible
-	}
-	// Move current table into previous.
-	s.prevTable, s.prevTableLog, s.cTable = s.cTable, s.actualTableLog, s.prevTable[:0]
-	s.OutData = s.Out[len(s.OutTable):]
-	return s.Out, false, nil
-}
-
-// EstimateSizes will estimate the data sizes
-func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
-	s, err = s.prepare(in)
-	if err != nil {
-		return 0, 0, 0, err
-	}
-
-	// Create histogram, if none was provided.
-	tableSz, dataSz, reuseSz = -1, -1, -1
-	maxCount := s.maxCount
-	var canReuse = false
-	if maxCount == 0 {
-		maxCount, canReuse = s.countSimple(in)
-	} else {
-		canReuse = s.canUseTable(s.prevTable)
-	}
-
-	// We want the output size to be less than this:
-	wantSize := len(in)
-	if s.WantLogLess > 0 {
-		wantSize -= wantSize >> s.WantLogLess
-	}
-
-	// Reset for next run.
-	s.clearCount = true
-	s.maxCount = 0
-	if maxCount >= len(in) {
-		if maxCount > len(in) {
-			return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
-		}
-		if len(in) == 1 {
-			return 0, 0, 0, ErrIncompressible
-		}
-		// One symbol, use RLE
-		return 0, 0, 0, ErrUseRLE
-	}
-	if maxCount == 1 || maxCount < (len(in)>>7) {
-		// Each symbol present maximum once or too well distributed.
-		return 0, 0, 0, ErrIncompressible
-	}
-
-	// Calculate new table.
-	err = s.buildCTable()
-	if err != nil {
-		return 0, 0, 0, err
-	}
-
-	if false && !s.canUseTable(s.cTable) {
-		panic("invalid table generated")
-	}
-
-	tableSz, err = s.cTable.estTableSize(s)
-	if err != nil {
-		return 0, 0, 0, err
-	}
-	if canReuse {
-		reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
-	}
-	dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
-
-	// Restore
-	return tableSz, dataSz, reuseSz, nil
-}
-
-func (s *Scratch) compress1X(src []byte) ([]byte, error) {
-	return s.compress1xDo(s.Out, src), nil
-}
-
-func (s *Scratch) compress1xDo(dst, src []byte) []byte {
-	var bw = bitWriter{out: dst}
-
-	// N is length divisible by 4.
-	n := len(src)
-	n -= n & 3
-	cTable := s.cTable[:256]
-
-	// Encode last bytes.
-	for i := len(src) & 3; i > 0; i-- {
-		bw.encSymbol(cTable, src[n+i-1])
-	}
-	n -= 4
-	if s.actualTableLog <= 8 {
-		for ; n >= 0; n -= 4 {
-			tmp := src[n : n+4]
-			// tmp should be len 4
-			bw.flush32()
-			bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
-		}
-	} else {
-		for ; n >= 0; n -= 4 {
-			tmp := src[n : n+4]
-			// tmp should be len 4
-			bw.flush32()
-			bw.encTwoSymbols(cTable, tmp[3], tmp[2])
-			bw.flush32()
-			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
-		}
-	}
-	bw.close()
-	return bw.out
-}
-
-var sixZeros [6]byte
-
-func (s *Scratch) compress4X(src []byte) ([]byte, error) {
-	if len(src) < 12 {
-		return nil, ErrIncompressible
-	}
-	segmentSize := (len(src) + 3) / 4
-
-	// Add placeholder for output length
-	offsetIdx := len(s.Out)
-	s.Out = append(s.Out, sixZeros[:]...)
-
-	for i := 0; i < 4; i++ {
-		toDo := src
-		if len(toDo) > segmentSize {
-			toDo = toDo[:segmentSize]
-		}
-		src = src[len(toDo):]
-
-		idx := len(s.Out)
-		s.Out = s.compress1xDo(s.Out, toDo)
-		if len(s.Out)-idx > math.MaxUint16 {
-			// We cannot store the size in the jump table
-			return nil, ErrIncompressible
-		}
-		// Write compressed length as little endian before block.
-		if i < 3 {
-			// Last length is not written.
-			length := len(s.Out) - idx
-			s.Out[i*2+offsetIdx] = byte(length)
-			s.Out[i*2+offsetIdx+1] = byte(length >> 8)
-		}
-	}
-
-	return s.Out, nil
-}
-
-// compress4Xp will compress 4 streams using separate goroutines.
-func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
-	if len(src) < 12 {
-		return nil, ErrIncompressible
-	}
-	// Add placeholder for output length
-	s.Out = s.Out[:6]
-
-	segmentSize := (len(src) + 3) / 4
-	var wg sync.WaitGroup
-	wg.Add(4)
-	for i := 0; i < 4; i++ {
-		toDo := src
-		if len(toDo) > segmentSize {
-			toDo = toDo[:segmentSize]
-		}
-		src = src[len(toDo):]
-
-		// Separate goroutine for each block.
-		go func(i int) {
-			s.tmpOut[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
-			wg.Done()
-		}(i)
-	}
-	wg.Wait()
-	for i := 0; i < 4; i++ {
-		o := s.tmpOut[i]
-		if len(o) > math.MaxUint16 {
-			// We cannot store the size in the jump table
-			return nil, ErrIncompressible
-		}
-		// Write compressed length as little endian before block.
-		if i < 3 {
-			// Last length is not written.
-			s.Out[i*2] = byte(len(o))
-			s.Out[i*2+1] = byte(len(o) >> 8)
-		}
-
-		// Write output.
-		s.Out = append(s.Out, o...)
-	}
-	return s.Out, nil
-}
-
-// countSimple will create a simple histogram in s.count.
-// Returns the biggest count.
-// Does not update s.clearCount.
-func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
-	reuse = true
-	_ = s.count // Assert that s != nil to speed up the following loop.
-	for _, v := range in {
-		s.count[v]++
-	}
-	m := uint32(0)
-	if len(s.prevTable) > 0 {
-		for i, v := range s.count[:] {
-			if v == 0 {
-				continue
-			}
-			if v > m {
-				m = v
-			}
-			s.symbolLen = uint16(i) + 1
-			if i >= len(s.prevTable) {
-				reuse = false
-			} else if s.prevTable[i].nBits == 0 {
-				reuse = false
-			}
-		}
-		return int(m), reuse
-	}
-	for i, v := range s.count[:] {
-		if v == 0 {
-			continue
-		}
-		if v > m {
-			m = v
-		}
-		s.symbolLen = uint16(i) + 1
-	}
-	return int(m), false
-}
-
-func (s *Scratch) canUseTable(c cTable) bool {
-	if len(c) < int(s.symbolLen) {
-		return false
-	}
-	for i, v := range s.count[:s.symbolLen] {
-		if v != 0 && c[i].nBits == 0 {
-			return false
-		}
-	}
-	return true
-}
-
-//lint:ignore U1000 used for debugging
-func (s *Scratch) validateTable(c cTable) bool {
-	if len(c) < int(s.symbolLen) {
-		return false
-	}
-	for i, v := range s.count[:s.symbolLen] {
-		if v != 0 {
-			if c[i].nBits == 0 {
-				return false
-			}
-			if c[i].nBits > s.actualTableLog {
-				return false
-			}
-		}
-	}
-	return true
-}
-
-// minTableLog provides the minimum logSize to safely represent a distribution.
-func (s *Scratch) minTableLog() uint8 {
-	minBitsSrc := highBit32(uint32(s.srcLen)) + 1
-	minBitsSymbols := highBit32(uint32(s.symbolLen-1)) + 2
-	if minBitsSrc < minBitsSymbols {
-		return uint8(minBitsSrc)
-	}
-	return uint8(minBitsSymbols)
-}
-
-// optimalTableLog calculates and sets the optimal tableLog in s.actualTableLog
-func (s *Scratch) optimalTableLog() {
-	tableLog := s.TableLog
-	minBits := s.minTableLog()
-	maxBitsSrc := uint8(highBit32(uint32(s.srcLen-1))) - 1
-	if maxBitsSrc < tableLog {
-		// Accuracy can be reduced
-		tableLog = maxBitsSrc
-	}
-	if minBits > tableLog {
-		tableLog = minBits
-	}
-	// Need a minimum to safely represent all symbol values
-	if tableLog < minTablelog {
-		tableLog = minTablelog
-	}
-	if tableLog > tableLogMax {
-		tableLog = tableLogMax
-	}
-	s.actualTableLog = tableLog
-}
-
-type cTableEntry struct {
-	val   uint16
-	nBits uint8
-	// We have 8 bits extra
-}
-
-const huffNodesMask = huffNodesLen - 1
-
-func (s *Scratch) buildCTable() error {
-	s.optimalTableLog()
-	s.huffSort()
-	if cap(s.cTable) < maxSymbolValue+1 {
-		s.cTable = make([]cTableEntry, s.symbolLen, maxSymbolValue+1)
-	} else {
-		s.cTable = s.cTable[:s.symbolLen]
-		for i := range s.cTable {
-			s.cTable[i] = cTableEntry{}
-		}
-	}
-
-	var startNode = int16(s.symbolLen)
-	nonNullRank := s.symbolLen - 1
-
-	nodeNb := startNode
-	huffNode := s.nodes[1 : huffNodesLen+1]
-
-	// This overlays the slice above, but allows "-1" index lookups.
-	// Different from reference implementation.
-	huffNode0 := s.nodes[0 : huffNodesLen+1]
-
-	for huffNode[nonNullRank].count() == 0 {
-		nonNullRank--
-	}
-
-	lowS := int16(nonNullRank)
-	nodeRoot := nodeNb + lowS - 1
-	lowN := nodeNb
-	huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
-	huffNode[lowS].setParent(nodeNb)
-	huffNode[lowS-1].setParent(nodeNb)
-	nodeNb++
-	lowS -= 2
-	for n := nodeNb; n <= nodeRoot; n++ {
-		huffNode[n].setCount(1 << 30)
-	}
-	// fake entry, strong barrier
-	huffNode0[0].setCount(1 << 31)
-
-	// create parents
-	for nodeNb <= nodeRoot {
-		var n1, n2 int16
-		if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
-			n1 = lowS
-			lowS--
-		} else {
-			n1 = lowN
-			lowN++
-		}
-		if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
-			n2 = lowS
-			lowS--
-		} else {
-			n2 = lowN
-			lowN++
-		}
-
-		huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
-		huffNode0[n1+1].setParent(nodeNb)
-		huffNode0[n2+1].setParent(nodeNb)
-		nodeNb++
-	}
-
-	// distribute weights (unlimited tree height)
-	huffNode[nodeRoot].setNbBits(0)
-	for n := nodeRoot - 1; n >= startNode; n-- {
-		huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
-	}
-	for n := uint16(0); n <= nonNullRank; n++ {
-		huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
-	}
-	s.actualTableLog = s.setMaxHeight(int(nonNullRank))
-	maxNbBits := s.actualTableLog
-
-	// fill result into tree (val, nbBits)
-	if maxNbBits > tableLogMax {
-		return fmt.Errorf("internal error: maxNbBits (%d) > tableLogMax (%d)", maxNbBits, tableLogMax)
-	}
-	var nbPerRank [tableLogMax + 1]uint16
-	var valPerRank [16]uint16
-	for _, v := range huffNode[:nonNullRank+1] {
-		nbPerRank[v.nbBits()]++
-	}
-	// determine stating value per rank
-	{
-		min := uint16(0)
-		for n := maxNbBits; n > 0; n-- {
-			// get starting value within each rank
-			valPerRank[n] = min
-			min += nbPerRank[n]
-			min >>= 1
-		}
-	}
-
-	// push nbBits per symbol, symbol order
-	for _, v := range huffNode[:nonNullRank+1] {
-		s.cTable[v.symbol()].nBits = v.nbBits()
-	}
-
-	// assign value within rank, symbol order
-	t := s.cTable[:s.symbolLen]
-	for n, val := range t {
-		nbits := val.nBits & 15
-		v := valPerRank[nbits]
-		t[n].val = v
-		valPerRank[nbits] = v + 1
-	}
-
-	return nil
-}
-
-// huffSort will sort symbols, decreasing order.
-func (s *Scratch) huffSort() {
-	type rankPos struct {
-		base    uint32
-		current uint32
-	}
-
-	// Clear nodes
-	nodes := s.nodes[:huffNodesLen+1]
-	s.nodes = nodes
-	nodes = nodes[1 : huffNodesLen+1]
-
-	// Sort into buckets based on length of symbol count.
-	var rank [32]rankPos
-	for _, v := range s.count[:s.symbolLen] {
-		r := highBit32(v+1) & 31
-		rank[r].base++
-	}
-	// maxBitLength is log2(BlockSizeMax) + 1
-	const maxBitLength = 18 + 1
-	for n := maxBitLength; n > 0; n-- {
-		rank[n-1].base += rank[n].base
-	}
-	for n := range rank[:maxBitLength] {
-		rank[n].current = rank[n].base
-	}
-	for n, c := range s.count[:s.symbolLen] {
-		r := (highBit32(c+1) + 1) & 31
-		pos := rank[r].current
-		rank[r].current++
-		prev := nodes[(pos-1)&huffNodesMask]
-		for pos > rank[r].base && c > prev.count() {
-			nodes[pos&huffNodesMask] = prev
-			pos--
-			prev = nodes[(pos-1)&huffNodesMask]
-		}
-		nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
-	}
-}
-
-func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
-	maxNbBits := s.actualTableLog
-	huffNode := s.nodes[1 : huffNodesLen+1]
-	//huffNode = huffNode[: huffNodesLen]
-
-	largestBits := huffNode[lastNonNull].nbBits()
-
-	// early exit : no elt > maxNbBits
-	if largestBits <= maxNbBits {
-		return largestBits
-	}
-	totalCost := int(0)
-	baseCost := int(1) << (largestBits - maxNbBits)
-	n := uint32(lastNonNull)
-
-	for huffNode[n].nbBits() > maxNbBits {
-		totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
-		huffNode[n].setNbBits(maxNbBits)
-		n--
-	}
-	// n stops at huffNode[n].nbBits <= maxNbBits
-
-	for huffNode[n].nbBits() == maxNbBits {
-		n--
-	}
-	// n end at index of smallest symbol using < maxNbBits
-
-	// renorm totalCost
-	totalCost >>= largestBits - maxNbBits /* note : totalCost is necessarily a multiple of baseCost */
-
-	// repay normalized cost
-	{
-		const noSymbol = 0xF0F0F0F0
-		var rankLast [tableLogMax + 2]uint32
-
-		for i := range rankLast[:] {
-			rankLast[i] = noSymbol
-		}
-
-		// Get pos of last (smallest) symbol per rank
-		{
-			currentNbBits := maxNbBits
-			for pos := int(n); pos >= 0; pos-- {
-				if huffNode[pos].nbBits() >= currentNbBits {
-					continue
-				}
-				currentNbBits = huffNode[pos].nbBits() // < maxNbBits
-				rankLast[maxNbBits-currentNbBits] = uint32(pos)
-			}
-		}
-
-		for totalCost > 0 {
-			nBitsToDecrease := uint8(highBit32(uint32(totalCost))) + 1
-
-			for ; nBitsToDecrease > 1; nBitsToDecrease-- {
-				highPos := rankLast[nBitsToDecrease]
-				lowPos := rankLast[nBitsToDecrease-1]
-				if highPos == noSymbol {
-					continue
-				}
-				if lowPos == noSymbol {
-					break
-				}
-				highTotal := huffNode[highPos].count()
-				lowTotal := 2 * huffNode[lowPos].count()
-				if highTotal <= lowTotal {
-					break
-				}
-			}
-			// only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !)
-			// HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary
-			// FIXME: try to remove
-			for (nBitsToDecrease <= tableLogMax) && (rankLast[nBitsToDecrease] == noSymbol) {
-				nBitsToDecrease++
-			}
-			totalCost -= 1 << (nBitsToDecrease - 1)
-			if rankLast[nBitsToDecrease-1] == noSymbol {
-				// this rank is no longer empty
-				rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
-			}
-			huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
-				huffNode[rankLast[nBitsToDecrease]].nbBits())
-			if rankLast[nBitsToDecrease] == 0 {
-				/* special case, reached largest symbol */
-				rankLast[nBitsToDecrease] = noSymbol
-			} else {
-				rankLast[nBitsToDecrease]--
-				if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
-					rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
-				}
-			}
-		}
-
-		for totalCost < 0 { /* Sometimes, cost correction overshoot */
-			if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
-				for huffNode[n].nbBits() == maxNbBits {
-					n--
-				}
-				huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
-				rankLast[1] = n + 1
-				totalCost++
-				continue
-			}
-			huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
-			rankLast[1]++
-			totalCost++
-		}
-	}
-	return maxNbBits
-}
-
-// A nodeElt is the fields
-//
-//	count  uint32
-//	parent uint16
-//	symbol byte
-//	nbBits uint8
-//
-// in some order, all squashed into an integer so that the compiler
-// always loads and stores entire nodeElts instead of separate fields.
-type nodeElt uint64
-
-func makeNodeElt(count uint32, symbol byte) nodeElt {
-	return nodeElt(count) | nodeElt(symbol)<<48
-}
-
-func (e *nodeElt) count() uint32  { return uint32(*e) }
-func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
-func (e *nodeElt) symbol() byte   { return byte(*e >> 48) }
-func (e *nodeElt) nbBits() uint8  { return uint8(*e >> 56) }
-
-func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
-func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
-func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
deleted file mode 100644
index 0f56b02d7..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ /dev/null
@@ -1,1167 +0,0 @@
-package huff0
-
-import (
-	"errors"
-	"fmt"
-	"io"
-	"sync"
-
-	"github.com/klauspost/compress/fse"
-)
-
-type dTable struct {
-	single []dEntrySingle
-}
-
-// single-symbols decoding
-type dEntrySingle struct {
-	entry uint16
-}
-
-// Uses special code for all tables that are < 8 bits.
-const use8BitTables = true
-
-// ReadTable will read a table from the input.
-// The size of the input may be larger than the table definition.
-// Any content remaining after the table definition will be returned.
-// If no Scratch is provided a new one is allocated.
-// The returned Scratch can be used for encoding or decoding input using this table.
-func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
-	s, err = s.prepare(nil)
-	if err != nil {
-		return s, nil, err
-	}
-	if len(in) <= 1 {
-		return s, nil, errors.New("input too small for table")
-	}
-	iSize := in[0]
-	in = in[1:]
-	if iSize >= 128 {
-		// Uncompressed
-		oSize := iSize - 127
-		iSize = (oSize + 1) / 2
-		if int(iSize) > len(in) {
-			return s, nil, errors.New("input too small for table")
-		}
-		for n := uint8(0); n < oSize; n += 2 {
-			v := in[n/2]
-			s.huffWeight[n] = v >> 4
-			s.huffWeight[n+1] = v & 15
-		}
-		s.symbolLen = uint16(oSize)
-		in = in[iSize:]
-	} else {
-		if len(in) < int(iSize) {
-			return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
-		}
-		// FSE compressed weights
-		s.fse.DecompressLimit = 255
-		hw := s.huffWeight[:]
-		s.fse.Out = hw
-		b, err := fse.Decompress(in[:iSize], s.fse)
-		s.fse.Out = nil
-		if err != nil {
-			return s, nil, fmt.Errorf("fse decompress returned: %w", err)
-		}
-		if len(b) > 255 {
-			return s, nil, errors.New("corrupt input: output table too large")
-		}
-		s.symbolLen = uint16(len(b))
-		in = in[iSize:]
-	}
-
-	// collect weight stats
-	var rankStats [16]uint32
-	weightTotal := uint32(0)
-	for _, v := range s.huffWeight[:s.symbolLen] {
-		if v > tableLogMax {
-			return s, nil, errors.New("corrupt input: weight too large")
-		}
-		v2 := v & 15
-		rankStats[v2]++
-		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
-		weightTotal += (1 << v2) >> 1
-	}
-	if weightTotal == 0 {
-		return s, nil, errors.New("corrupt input: weights zero")
-	}
-
-	// get last non-null symbol weight (implied, total must be 2^n)
-	{
-		tableLog := highBit32(weightTotal) + 1
-		if tableLog > tableLogMax {
-			return s, nil, errors.New("corrupt input: tableLog too big")
-		}
-		s.actualTableLog = uint8(tableLog)
-		// determine last weight
-		{
-			total := uint32(1) << tableLog
-			rest := total - weightTotal
-			verif := uint32(1) << highBit32(rest)
-			lastWeight := highBit32(rest) + 1
-			if verif != rest {
-				// last value must be a clean power of 2
-				return s, nil, errors.New("corrupt input: last value not power of two")
-			}
-			s.huffWeight[s.symbolLen] = uint8(lastWeight)
-			s.symbolLen++
-			rankStats[lastWeight]++
-		}
-	}
-
-	if (rankStats[1] < 2) || (rankStats[1]&1 != 0) {
-		// by construction : at least 2 elts of rank 1, must be even
-		return s, nil, errors.New("corrupt input: min elt size, even check failed ")
-	}
-
-	// TODO: Choose between single/double symbol decoding
-
-	// Calculate starting value for each rank
-	{
-		var nextRankStart uint32
-		for n := uint8(1); n < s.actualTableLog+1; n++ {
-			current := nextRankStart
-			nextRankStart += rankStats[n] << (n - 1)
-			rankStats[n] = current
-		}
-	}
-
-	// fill DTable (always full size)
-	tSize := 1 << tableLogMax
-	if len(s.dt.single) != tSize {
-		s.dt.single = make([]dEntrySingle, tSize)
-	}
-	cTable := s.prevTable
-	if cap(cTable) < maxSymbolValue+1 {
-		cTable = make([]cTableEntry, 0, maxSymbolValue+1)
-	}
-	cTable = cTable[:maxSymbolValue+1]
-	s.prevTable = cTable[:s.symbolLen]
-	s.prevTableLog = s.actualTableLog
-
-	for n, w := range s.huffWeight[:s.symbolLen] {
-		if w == 0 {
-			cTable[n] = cTableEntry{
-				val:   0,
-				nBits: 0,
-			}
-			continue
-		}
-		length := (uint32(1) << w) >> 1
-		d := dEntrySingle{
-			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
-		}
-
-		rank := &rankStats[w]
-		cTable[n] = cTableEntry{
-			val:   uint16(*rank >> (w - 1)),
-			nBits: uint8(d.entry),
-		}
-
-		single := s.dt.single[*rank : *rank+length]
-		for i := range single {
-			single[i] = d
-		}
-		*rank += length
-	}
-
-	return s, in, nil
-}
-
-// Decompress1X will decompress a 1X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// Before this is called, the table must be initialized with ReadTable unless
-// the encoder re-used the table.
-// deprecated: Use the stateless Decoder() to get a concurrent version.
-func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
-	if cap(s.Out) < s.MaxDecodedSize {
-		s.Out = make([]byte, s.MaxDecodedSize)
-	}
-	s.Out = s.Out[:0:s.MaxDecodedSize]
-	s.Out, err = s.Decoder().Decompress1X(s.Out, in)
-	return s.Out, err
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// Before this is called, the table must be initialized with ReadTable unless
-// the encoder re-used the table.
-// The length of the supplied input must match the end of a block exactly.
-// The destination size of the uncompressed data must be known and provided.
-// deprecated: Use the stateless Decoder() to get a concurrent version.
-func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
-	if dstSize > s.MaxDecodedSize {
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	if cap(s.Out) < dstSize {
-		s.Out = make([]byte, s.MaxDecodedSize)
-	}
-	s.Out = s.Out[:0:dstSize]
-	s.Out, err = s.Decoder().Decompress4X(s.Out, in)
-	return s.Out, err
-}
-
-// Decoder will return a stateless decoder that can be used by multiple
-// decompressors concurrently.
-// Before this is called, the table must be initialized with ReadTable.
-// The Decoder is still linked to the scratch buffer so that cannot be reused.
-// However, it is safe to discard the scratch.
-func (s *Scratch) Decoder() *Decoder {
-	return &Decoder{
-		dt:             s.dt,
-		actualTableLog: s.actualTableLog,
-		bufs:           &s.decPool,
-	}
-}
-
-// Decoder provides stateless decoding.
-type Decoder struct {
-	dt             dTable
-	actualTableLog uint8
-	bufs           *sync.Pool
-}
-
-func (d *Decoder) buffer() *[4][256]byte {
-	buf, ok := d.bufs.Get().(*[4][256]byte)
-	if ok {
-		return buf
-	}
-	return &[4][256]byte{}
-}
-
-// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
-	if d.actualTableLog == 8 {
-		return d.decompress1X8BitExactly(dst, src)
-	}
-	var br bitReaderBytes
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	dt := d.dt.single[:256]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	switch d.actualTableLog {
-	case 8:
-		const shift = 0
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					br.close()
-					d.bufs.Put(bufs)
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 7:
-		const shift = 8 - 7
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					br.close()
-					d.bufs.Put(bufs)
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 6:
-		const shift = 8 - 6
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 5:
-		const shift = 8 - 5
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 4:
-		const shift = 8 - 4
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 3:
-		const shift = 8 - 3
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 2:
-		const shift = 8 - 2
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 1:
-		const shift = 8 - 1
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	default:
-		d.bufs.Put(bufs)
-		return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 4, so uint8 is fine
-	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
-	shift := (8 - d.actualTableLog) & 7
-
-	for bitsLeft > 0 {
-		if br.bitsRead >= 64-8 {
-			for br.off > 0 {
-				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-				br.bitsRead -= 8
-				br.off--
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			br.close()
-			d.bufs.Put(bufs)
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := dt[br.peekByteFast()>>shift]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= int8(nBits)
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
-
-// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
-	var br bitReaderBytes
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	dt := d.dt.single[:256]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	const shift = 56
-
-	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
-	for br.off >= 4 {
-		br.fillFast()
-		v := dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		v = dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				d.bufs.Put(bufs)
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			dst = append(dst, buf[:]...)
-		}
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 4, so uint8 is fine
-	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
-	for bitsLeft > 0 {
-		if br.bitsRead >= 64-8 {
-			for br.off > 0 {
-				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-				br.bitsRead -= 8
-				br.off--
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			d.bufs.Put(bufs)
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := dt[br.peekByteFast()]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= int8(nBits)
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
-	if d.actualTableLog == 8 {
-		return d.decompress4X8bitExactly(dst, src)
-	}
-
-	var br [4]bitReaderBytes
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	shift := (56 + (8 - d.actualTableLog)) & 63
-
-	const tlSize = 1 << 8
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 4 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			// Interleave 2 decodes.
-			const stream = 0
-			const stream2 = 1
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		off += 4
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			// There must at least be 3 buffers left.
-			if len(out)-bufoff < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-			//copy(out, buf[0][:])
-			//copy(out[dstEvery:], buf[1][:])
-			//copy(out[dstEvery*2:], buf[2][:])
-			*(*[bufoff]byte)(out) = buf[0]
-			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
-			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
-			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
-			out = out[bufoff:]
-			decoded += bufoff * 4
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			if br.finished() {
-				d.bufs.Put(buf)
-				return nil, io.ErrUnexpectedEOF
-			}
-			if br.bitsRead >= 56 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value |= uint64(low) << (br.bitsRead - 32)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			v := single[uint8(br.value>>shift)].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			d.bufs.Put(buf)
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
-	var br [4]bitReaderBytes
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const shift = 56
-	const tlSize = 1 << 8
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 4 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			// Interleave 2 decodes.
-			const stream = 0
-			const stream2 = 1
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		off += 4
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			// There must at least be 3 buffers left.
-			if len(out)-bufoff < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-
-			//copy(out, buf[0][:])
-			//copy(out[dstEvery:], buf[1][:])
-			//copy(out[dstEvery*2:], buf[2][:])
-			// copy(out[dstEvery*3:], buf[3][:])
-			*(*[bufoff]byte)(out) = buf[0]
-			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
-			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
-			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
-			out = out[bufoff:]
-			decoded += bufoff * 4
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			if br.finished() {
-				d.bufs.Put(buf)
-				return nil, io.ErrUnexpectedEOF
-			}
-			if br.bitsRead >= 56 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value |= uint64(low) << (br.bitsRead - 32)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			v := single[br.peekByteFast()].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			d.bufs.Put(buf)
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// matches will compare a decoding table to a coding table.
-// Errors are written to the writer.
-// Nothing will be written if table is ok.
-func (s *Scratch) matches(ct cTable, w io.Writer) {
-	if s == nil || len(s.dt.single) == 0 {
-		return
-	}
-	dt := s.dt.single[:1<<s.actualTableLog]
-	tablelog := s.actualTableLog
-	ok := 0
-	broken := 0
-	for sym, enc := range ct {
-		errs := 0
-		broken++
-		if enc.nBits == 0 {
-			for _, dec := range dt {
-				if uint8(dec.entry>>8) == byte(sym) {
-					fmt.Fprintf(w, "symbol %x has decoder, but no encoder\n", sym)
-					errs++
-					break
-				}
-			}
-			if errs == 0 {
-				broken--
-			}
-			continue
-		}
-		// Unused bits in input
-		ub := tablelog - enc.nBits
-		top := enc.val << ub
-		// decoder looks at top bits.
-		dec := dt[top]
-		if uint8(dec.entry) != enc.nBits {
-			fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", sym, enc.nBits, uint8(dec.entry))
-			errs++
-		}
-		if uint8(dec.entry>>8) != uint8(sym) {
-			fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", sym, sym, uint8(dec.entry>>8))
-			errs++
-		}
-		if errs > 0 {
-			fmt.Fprintf(w, "%d errors in base, stopping\n", errs)
-			continue
-		}
-		// Ensure that all combinations are covered.
-		for i := uint16(0); i < (1 << ub); i++ {
-			vval := top | i
-			dec := dt[vval]
-			if uint8(dec.entry) != enc.nBits {
-				fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", vval, enc.nBits, uint8(dec.entry))
-				errs++
-			}
-			if uint8(dec.entry>>8) != uint8(sym) {
-				fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", vval, sym, uint8(dec.entry>>8))
-				errs++
-			}
-			if errs > 20 {
-				fmt.Fprintf(w, "%d errors, stopping\n", errs)
-				break
-			}
-		}
-		if errs == 0 {
-			ok++
-			broken--
-		}
-	}
-	if broken > 0 {
-		fmt.Fprintf(w, "%d broken, %d ok\n", broken, ok)
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
deleted file mode 100644
index ba7e8e6b0..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ /dev/null
@@ -1,226 +0,0 @@
-//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
-
-// This file contains the specialisation of Decoder.Decompress4X
-// and Decoder.Decompress1X that use an asm implementation of thir main loops.
-package huff0
-
-import (
-	"errors"
-	"fmt"
-
-	"github.com/klauspost/compress/internal/cpuinfo"
-)
-
-// decompress4x_main_loop_x86 is an x86 assembler implementation
-// of Decompress4X when tablelog > 8.
-//
-//go:noescape
-func decompress4x_main_loop_amd64(ctx *decompress4xContext)
-
-// decompress4x_8b_loop_x86 is an x86 assembler implementation
-// of Decompress4X when tablelog <= 8 which decodes 4 entries
-// per loop.
-//
-//go:noescape
-func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
-
-// fallback8BitSize is the size where using Go version is faster.
-const fallback8BitSize = 800
-
-type decompress4xContext struct {
-	pbr      *[4]bitReaderShifted
-	peekBits uint8
-	out      *byte
-	dstEvery int
-	tbl      *dEntrySingle
-	decoded  int
-	limit    *byte
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-
-	use8BitTables := d.actualTableLog <= 8
-	if cap(dst) < fallback8BitSize && use8BitTables {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	// Decode "jump table"
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	var decoded int
-
-	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
-		ctx := decompress4xContext{
-			pbr:      &br,
-			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
-			out:      &out[0],
-			dstEvery: dstEvery,
-			tbl:      &single[0],
-			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
-		}
-		if use8BitTables {
-			decompress4x_8b_main_loop_amd64(&ctx)
-		} else {
-			decompress4x_main_loop_amd64(&ctx)
-		}
-
-		decoded = ctx.decoded
-		out = out[decoded/4:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			br.fill()
-			if offset >= endsAt {
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// decompress4x_main_loop_x86 is an x86 assembler implementation
-// of Decompress1X when tablelog > 8.
-//
-//go:noescape
-func decompress1x_main_loop_amd64(ctx *decompress1xContext)
-
-// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
-// of Decompress1X when tablelog > 8.
-//
-//go:noescape
-func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
-
-type decompress1xContext struct {
-	pbr      *bitReaderShifted
-	peekBits uint8
-	out      *byte
-	outCap   int
-	tbl      *dEntrySingle
-	decoded  int
-}
-
-// Error reported by asm implementations
-const error_max_decoded_size_exeeded = -1
-
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	var br bitReaderShifted
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:maxDecodedSize]
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-
-	if maxDecodedSize >= 4 {
-		ctx := decompress1xContext{
-			pbr:      &br,
-			out:      &dst[0],
-			outCap:   maxDecodedSize,
-			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
-			tbl:      &d.dt.single[0],
-		}
-
-		if cpuinfo.HasBMI2() {
-			decompress1x_main_loop_bmi2(&ctx)
-		} else {
-			decompress1x_main_loop_amd64(&ctx)
-		}
-		if ctx.decoded == error_max_decoded_size_exeeded {
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-
-		dst = dst[:ctx.decoded]
-	}
-
-	// br < 8, so uint8 is fine
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
-	for bitsLeft > 0 {
-		br.fill()
-		if len(dst) >= maxDecodedSize {
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= nBits
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	return dst, br.close()
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
deleted file mode 100644
index c4c7ab2d1..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ /dev/null
@@ -1,830 +0,0 @@
-// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
-
-//go:build amd64 && !appengine && !noasm && gc
-
-// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
-TEXT ·decompress4x_main_loop_amd64(SB), $0-8
-	// Preload values
-	MOVQ    ctx+0(FP), AX
-	MOVBQZX 8(AX), DI
-	MOVQ    16(AX), BX
-	MOVQ    48(AX), SI
-	MOVQ    24(AX), R8
-	MOVQ    32(AX), R9
-	MOVQ    (AX), R10
-
-	// Main loop
-main_loop:
-	XORL  DX, DX
-	CMPQ  BX, SI
-	SETGE DL
-
-	// br0.fillFast32()
-	MOVQ    32(R10), R11
-	MOVBQZX 40(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill0
-	MOVQ    24(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    (R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 24(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br0.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill0:
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (BX)
-
-	// update the bitreader structure
-	MOVQ R11, 32(R10)
-	MOVB R12, 40(R10)
-
-	// br1.fillFast32()
-	MOVQ    80(R10), R11
-	MOVBQZX 88(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill1
-	MOVQ    72(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    48(R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 72(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br1.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill1:
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (BX)(R8*1)
-
-	// update the bitreader structure
-	MOVQ R11, 80(R10)
-	MOVB R12, 88(R10)
-
-	// br2.fillFast32()
-	MOVQ    128(R10), R11
-	MOVBQZX 136(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill2
-	MOVQ    120(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    96(R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 120(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br2.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill2:
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (BX)(R8*2)
-
-	// update the bitreader structure
-	MOVQ R11, 128(R10)
-	MOVB R12, 136(R10)
-
-	// br3.fillFast32()
-	MOVQ    176(R10), R11
-	MOVBQZX 184(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill3
-	MOVQ    168(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    144(R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 168(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br3.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill3:
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	LEAQ (R8)(R8*2), CX
-	MOVW AX, (BX)(CX*1)
-
-	// update the bitreader structure
-	MOVQ  R11, 176(R10)
-	MOVB  R12, 184(R10)
-	ADDQ  $0x02, BX
-	TESTB DL, DL
-	JZ    main_loop
-	MOVQ  ctx+0(FP), AX
-	SUBQ  16(AX), BX
-	SHLQ  $0x02, BX
-	MOVQ  BX, 40(AX)
-	RET
-
-// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
-TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
-	// Preload values
-	MOVQ    ctx+0(FP), CX
-	MOVBQZX 8(CX), DI
-	MOVQ    16(CX), BX
-	MOVQ    48(CX), SI
-	MOVQ    24(CX), R8
-	MOVQ    32(CX), R9
-	MOVQ    (CX), R10
-
-	// Main loop
-main_loop:
-	XORL  DX, DX
-	CMPQ  BX, SI
-	SETGE DL
-
-	// br0.fillFast32()
-	MOVQ    32(R10), R11
-	MOVBQZX 40(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill0
-	MOVQ    24(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    (R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 24(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br0.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill0:
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (BX)
-
-	// update the bitreader structure
-	MOVQ R11, 32(R10)
-	MOVB R12, 40(R10)
-
-	// br1.fillFast32()
-	MOVQ    80(R10), R11
-	MOVBQZX 88(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill1
-	MOVQ    72(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    48(R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 72(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br1.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill1:
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (BX)(R8*1)
-
-	// update the bitreader structure
-	MOVQ R11, 80(R10)
-	MOVB R12, 88(R10)
-
-	// br2.fillFast32()
-	MOVQ    128(R10), R11
-	MOVBQZX 136(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill2
-	MOVQ    120(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    96(R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 120(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br2.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill2:
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (BX)(R8*2)
-
-	// update the bitreader structure
-	MOVQ R11, 128(R10)
-	MOVB R12, 136(R10)
-
-	// br3.fillFast32()
-	MOVQ    176(R10), R11
-	MOVBQZX 184(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill3
-	MOVQ    168(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    144(R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 168(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br3.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill3:
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	LEAQ (R8)(R8*2), CX
-	MOVL AX, (BX)(CX*1)
-
-	// update the bitreader structure
-	MOVQ  R11, 176(R10)
-	MOVB  R12, 184(R10)
-	ADDQ  $0x04, BX
-	TESTB DL, DL
-	JZ    main_loop
-	MOVQ  ctx+0(FP), AX
-	SUBQ  16(AX), BX
-	SHLQ  $0x02, BX
-	MOVQ  BX, 40(AX)
-	RET
-
-// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
-TEXT ·decompress1x_main_loop_amd64(SB), $0-8
-	MOVQ    ctx+0(FP), CX
-	MOVQ    16(CX), DX
-	MOVQ    24(CX), BX
-	CMPQ    BX, $0x04
-	JB      error_max_decoded_size_exceeded
-	LEAQ    (DX)(BX*1), BX
-	MOVQ    (CX), SI
-	MOVQ    (SI), R8
-	MOVQ    24(SI), R9
-	MOVQ    32(SI), R10
-	MOVBQZX 40(SI), R11
-	MOVQ    32(CX), SI
-	MOVBQZX 8(CX), DI
-	JMP     loop_condition
-
-main_loop:
-	// Check if we have room for 4 bytes in the output buffer
-	LEAQ 4(DX), CX
-	CMPQ CX, BX
-	JGE  error_max_decoded_size_exceeded
-
-	// Decode 4 values
-	CMPQ R11, $0x20
-	JL   bitReader_fillFast_1_end
-	SUBQ $0x20, R11
-	SUBQ $0x04, R9
-	MOVL (R8)(R9*1), R12
-	MOVQ R11, CX
-	SHLQ CL, R12
-	ORQ  R12, R10
-
-bitReader_fillFast_1_end:
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	BSWAPL  AX
-	CMPQ    R11, $0x20
-	JL      bitReader_fillFast_2_end
-	SUBQ    $0x20, R11
-	SUBQ    $0x04, R9
-	MOVL    (R8)(R9*1), R12
-	MOVQ    R11, CX
-	SHLQ    CL, R12
-	ORQ     R12, R10
-
-bitReader_fillFast_2_end:
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	BSWAPL  AX
-
-	// Store the decoded values
-	MOVL AX, (DX)
-	ADDQ $0x04, DX
-
-loop_condition:
-	CMPQ R9, $0x08
-	JGE  main_loop
-
-	// Update ctx structure
-	MOVQ ctx+0(FP), AX
-	SUBQ 16(AX), DX
-	MOVQ DX, 40(AX)
-	MOVQ (AX), AX
-	MOVQ R9, 24(AX)
-	MOVQ R10, 32(AX)
-	MOVB R11, 40(AX)
-	RET
-
-	// Report error
-error_max_decoded_size_exceeded:
-	MOVQ ctx+0(FP), AX
-	MOVQ $-1, CX
-	MOVQ CX, 40(AX)
-	RET
-
-// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
-// Requires: BMI2
-TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
-	MOVQ    ctx+0(FP), CX
-	MOVQ    16(CX), DX
-	MOVQ    24(CX), BX
-	CMPQ    BX, $0x04
-	JB      error_max_decoded_size_exceeded
-	LEAQ    (DX)(BX*1), BX
-	MOVQ    (CX), SI
-	MOVQ    (SI), R8
-	MOVQ    24(SI), R9
-	MOVQ    32(SI), R10
-	MOVBQZX 40(SI), R11
-	MOVQ    32(CX), SI
-	MOVBQZX 8(CX), DI
-	JMP     loop_condition
-
-main_loop:
-	// Check if we have room for 4 bytes in the output buffer
-	LEAQ 4(DX), CX
-	CMPQ CX, BX
-	JGE  error_max_decoded_size_exceeded
-
-	// Decode 4 values
-	CMPQ  R11, $0x20
-	JL    bitReader_fillFast_1_end
-	SUBQ  $0x20, R11
-	SUBQ  $0x04, R9
-	MOVL  (R8)(R9*1), CX
-	SHLXQ R11, CX, CX
-	ORQ   CX, R10
-
-bitReader_fillFast_1_end:
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	BSWAPL  AX
-	CMPQ    R11, $0x20
-	JL      bitReader_fillFast_2_end
-	SUBQ    $0x20, R11
-	SUBQ    $0x04, R9
-	MOVL    (R8)(R9*1), CX
-	SHLXQ   R11, CX, CX
-	ORQ     CX, R10
-
-bitReader_fillFast_2_end:
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	BSWAPL  AX
-
-	// Store the decoded values
-	MOVL AX, (DX)
-	ADDQ $0x04, DX
-
-loop_condition:
-	CMPQ R9, $0x08
-	JGE  main_loop
-
-	// Update ctx structure
-	MOVQ ctx+0(FP), AX
-	SUBQ 16(AX), DX
-	MOVQ DX, 40(AX)
-	MOVQ (AX), AX
-	MOVQ R9, 24(AX)
-	MOVQ R10, 32(AX)
-	MOVB R11, 40(AX)
-	RET
-
-	// Report error
-error_max_decoded_size_exceeded:
-	MOVQ ctx+0(FP), AX
-	MOVQ $-1, CX
-	MOVQ CX, 40(AX)
-	RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
deleted file mode 100644
index 908c17de6..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ /dev/null
@@ -1,299 +0,0 @@
-//go:build !amd64 || appengine || !gc || noasm
-// +build !amd64 appengine !gc noasm
-
-// This file contains a generic implementation of Decoder.Decompress4X.
-package huff0
-
-import (
-	"errors"
-	"fmt"
-)
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	// Decode "jump table"
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			// There must at least be 3 buffers left.
-			if len(out)-bufoff < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-			//copy(out, buf[0][:])
-			//copy(out[dstEvery:], buf[1][:])
-			//copy(out[dstEvery*2:], buf[2][:])
-			//copy(out[dstEvery*3:], buf[3][:])
-			*(*[bufoff]byte)(out) = buf[0]
-			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
-			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
-			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
-			out = out[bufoff:]
-			decoded += bufoff * 4
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			br.fill()
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress1X8Bit(dst, src)
-	}
-	var br bitReaderShifted
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	dt := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	for br.off >= 8 {
-		br.fillFast()
-		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		// Refill
-		br.fillFast()
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				br.close()
-				d.bufs.Put(bufs)
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			dst = append(dst, buf[:]...)
-		}
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 8, so uint8 is fine
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
-	for bitsLeft > 0 {
-		br.fill()
-		if false && br.bitsRead >= 32 {
-			if br.off >= 4 {
-				v := br.in[br.off-4:]
-				v = v[:4]
-				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-				br.value = (br.value << 32) | uint64(low)
-				br.bitsRead -= 32
-				br.off -= 4
-			} else {
-				for br.off > 0 {
-					br.value = (br.value << 8) | uint64(br.in[br.off-1])
-					br.bitsRead -= 8
-					br.off--
-				}
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			d.bufs.Put(bufs)
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= nBits
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
deleted file mode 100644
index 77ecd68e0..000000000
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ /dev/null
@@ -1,337 +0,0 @@
-// Package huff0 provides fast huffman encoding as used in zstd.
-//
-// See README.md at https://github.com/klauspost/compress/tree/master/huff0 for details.
-package huff0
-
-import (
-	"errors"
-	"fmt"
-	"math"
-	"math/bits"
-	"sync"
-
-	"github.com/klauspost/compress/fse"
-)
-
-const (
-	maxSymbolValue = 255
-
-	// zstandard limits tablelog to 11, see:
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-description
-	tableLogMax     = 11
-	tableLogDefault = 11
-	minTablelog     = 5
-	huffNodesLen    = 512
-
-	// BlockSizeMax is maximum input size for a single block uncompressed.
-	BlockSizeMax = 1<<18 - 1
-)
-
-var (
-	// ErrIncompressible is returned when input is judged to be too hard to compress.
-	ErrIncompressible = errors.New("input is not compressible")
-
-	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
-	ErrUseRLE = errors.New("input is single value repeated")
-
-	// ErrTooBig is return if input is too large for a single block.
-	ErrTooBig = errors.New("input too big")
-
-	// ErrMaxDecodedSizeExceeded is return if input is too large for a single block.
-	ErrMaxDecodedSizeExceeded = errors.New("maximum output size exceeded")
-)
-
-type ReusePolicy uint8
-
-const (
-	// ReusePolicyAllow will allow reuse if it produces smaller output.
-	ReusePolicyAllow ReusePolicy = iota
-
-	// ReusePolicyPrefer will re-use aggressively if possible.
-	// This will not check if a new table will produce smaller output,
-	// except if the current table is impossible to use or
-	// compressed output is bigger than input.
-	ReusePolicyPrefer
-
-	// ReusePolicyNone will disable re-use of tables.
-	// This is slightly faster than ReusePolicyAllow but may produce larger output.
-	ReusePolicyNone
-
-	// ReusePolicyMust must allow reuse and produce smaller output.
-	ReusePolicyMust
-)
-
-type Scratch struct {
-	count [maxSymbolValue + 1]uint32
-
-	// Per block parameters.
-	// These can be used to override compression parameters of the block.
-	// Do not touch, unless you know what you are doing.
-
-	// Out is output buffer.
-	// If the scratch is re-used before the caller is done processing the output,
-	// set this field to nil.
-	// Otherwise the output buffer will be re-used for next Compression/Decompression step
-	// and allocation will be avoided.
-	Out []byte
-
-	// OutTable will contain the table data only, if a new table has been generated.
-	// Slice of the returned data.
-	OutTable []byte
-
-	// OutData will contain the compressed data.
-	// Slice of the returned data.
-	OutData []byte
-
-	// MaxDecodedSize will set the maximum allowed output size.
-	// This value will automatically be set to BlockSizeMax if not set.
-	// Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded.
-	MaxDecodedSize int
-
-	srcLen int
-
-	// MaxSymbolValue will override the maximum symbol value of the next block.
-	MaxSymbolValue uint8
-
-	// TableLog will attempt to override the tablelog for the next block.
-	// Must be <= 11 and >= 5.
-	TableLog uint8
-
-	// Reuse will specify the reuse policy
-	Reuse ReusePolicy
-
-	// WantLogLess allows to specify a log 2 reduction that should at least be achieved,
-	// otherwise the block will be returned as incompressible.
-	// The reduction should then at least be (input size >> WantLogLess)
-	// If WantLogLess == 0 any improvement will do.
-	WantLogLess uint8
-
-	symbolLen      uint16 // Length of active part of the symbol table.
-	maxCount       int    // count of the most probable symbol
-	clearCount     bool   // clear count
-	actualTableLog uint8  // Selected tablelog.
-	prevTableLog   uint8  // Tablelog for previous table
-	prevTable      cTable // Table used for previous compression.
-	cTable         cTable // compression table
-	dt             dTable // decompression table
-	nodes          []nodeElt
-	tmpOut         [4][]byte
-	fse            *fse.Scratch
-	decPool        sync.Pool // *[4][256]byte buffers.
-	huffWeight     [maxSymbolValue + 1]byte
-}
-
-// TransferCTable will transfer the previously used compression table.
-func (s *Scratch) TransferCTable(src *Scratch) {
-	if cap(s.prevTable) < len(src.prevTable) {
-		s.prevTable = make(cTable, 0, maxSymbolValue+1)
-	}
-	s.prevTable = s.prevTable[:len(src.prevTable)]
-	copy(s.prevTable, src.prevTable)
-	s.prevTableLog = src.prevTableLog
-}
-
-func (s *Scratch) prepare(in []byte) (*Scratch, error) {
-	if len(in) > BlockSizeMax {
-		return nil, ErrTooBig
-	}
-	if s == nil {
-		s = &Scratch{}
-	}
-	if s.MaxSymbolValue == 0 {
-		s.MaxSymbolValue = maxSymbolValue
-	}
-	if s.TableLog == 0 {
-		s.TableLog = tableLogDefault
-	}
-	if s.TableLog > tableLogMax || s.TableLog < minTablelog {
-		return nil, fmt.Errorf(" invalid tableLog %d (%d -> %d)", s.TableLog, minTablelog, tableLogMax)
-	}
-	if s.MaxDecodedSize <= 0 || s.MaxDecodedSize > BlockSizeMax {
-		s.MaxDecodedSize = BlockSizeMax
-	}
-	if s.clearCount && s.maxCount == 0 {
-		for i := range s.count {
-			s.count[i] = 0
-		}
-		s.clearCount = false
-	}
-	if cap(s.Out) == 0 {
-		s.Out = make([]byte, 0, len(in))
-	}
-	s.Out = s.Out[:0]
-
-	s.OutTable = nil
-	s.OutData = nil
-	if cap(s.nodes) < huffNodesLen+1 {
-		s.nodes = make([]nodeElt, 0, huffNodesLen+1)
-	}
-	s.nodes = s.nodes[:0]
-	if s.fse == nil {
-		s.fse = &fse.Scratch{}
-	}
-	s.srcLen = len(in)
-
-	return s, nil
-}
-
-type cTable []cTableEntry
-
-func (c cTable) write(s *Scratch) error {
-	var (
-		// precomputed conversion table
-		bitsToWeight [tableLogMax + 1]byte
-		huffLog      = s.actualTableLog
-		// last weight is not saved.
-		maxSymbolValue = uint8(s.symbolLen - 1)
-		huffWeight     = s.huffWeight[:256]
-	)
-	const (
-		maxFSETableLog = 6
-	)
-	// convert to weight
-	bitsToWeight[0] = 0
-	for n := uint8(1); n < huffLog+1; n++ {
-		bitsToWeight[n] = huffLog + 1 - n
-	}
-
-	// Acquire histogram for FSE.
-	hist := s.fse.Histogram()
-	hist = hist[:256]
-	for i := range hist[:16] {
-		hist[i] = 0
-	}
-	for n := uint8(0); n < maxSymbolValue; n++ {
-		v := bitsToWeight[c[n].nBits] & 15
-		huffWeight[n] = v
-		hist[v]++
-	}
-
-	// FSE compress if feasible.
-	if maxSymbolValue >= 2 {
-		huffMaxCnt := uint32(0)
-		huffMax := uint8(0)
-		for i, v := range hist[:16] {
-			if v == 0 {
-				continue
-			}
-			huffMax = byte(i)
-			if v > huffMaxCnt {
-				huffMaxCnt = v
-			}
-		}
-		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
-		s.fse.TableLog = maxFSETableLog
-		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
-		if err == nil && len(b) < int(s.symbolLen>>1) {
-			s.Out = append(s.Out, uint8(len(b)))
-			s.Out = append(s.Out, b...)
-			return nil
-		}
-		// Unable to compress (RLE/uncompressible)
-	}
-	// write raw values as 4-bits (max : 15)
-	if maxSymbolValue > (256 - 128) {
-		// should not happen : likely means source cannot be compressed
-		return ErrIncompressible
-	}
-	op := s.Out
-	// special case, pack weights 4 bits/weight.
-	op = append(op, 128|(maxSymbolValue-1))
-	// be sure it doesn't cause msan issue in final combination
-	huffWeight[maxSymbolValue] = 0
-	for n := uint16(0); n < uint16(maxSymbolValue); n += 2 {
-		op = append(op, (huffWeight[n]<<4)|huffWeight[n+1])
-	}
-	s.Out = op
-	return nil
-}
-
-func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
-	var (
-		// precomputed conversion table
-		bitsToWeight [tableLogMax + 1]byte
-		huffLog      = s.actualTableLog
-		// last weight is not saved.
-		maxSymbolValue = uint8(s.symbolLen - 1)
-		huffWeight     = s.huffWeight[:256]
-	)
-	const (
-		maxFSETableLog = 6
-	)
-	// convert to weight
-	bitsToWeight[0] = 0
-	for n := uint8(1); n < huffLog+1; n++ {
-		bitsToWeight[n] = huffLog + 1 - n
-	}
-
-	// Acquire histogram for FSE.
-	hist := s.fse.Histogram()
-	hist = hist[:256]
-	for i := range hist[:16] {
-		hist[i] = 0
-	}
-	for n := uint8(0); n < maxSymbolValue; n++ {
-		v := bitsToWeight[c[n].nBits] & 15
-		huffWeight[n] = v
-		hist[v]++
-	}
-
-	// FSE compress if feasible.
-	if maxSymbolValue >= 2 {
-		huffMaxCnt := uint32(0)
-		huffMax := uint8(0)
-		for i, v := range hist[:16] {
-			if v == 0 {
-				continue
-			}
-			huffMax = byte(i)
-			if v > huffMaxCnt {
-				huffMaxCnt = v
-			}
-		}
-		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
-		s.fse.TableLog = maxFSETableLog
-		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
-		if err == nil && len(b) < int(s.symbolLen>>1) {
-			sz += 1 + len(b)
-			return sz, nil
-		}
-		// Unable to compress (RLE/uncompressible)
-	}
-	// write raw values as 4-bits (max : 15)
-	if maxSymbolValue > (256 - 128) {
-		// should not happen : likely means source cannot be compressed
-		return 0, ErrIncompressible
-	}
-	// special case, pack weights 4 bits/weight.
-	sz += 1 + int(maxSymbolValue/2)
-	return sz, nil
-}
-
-// estimateSize returns the estimated size in bytes of the input represented in the
-// histogram supplied.
-func (c cTable) estimateSize(hist []uint32) int {
-	nbBits := uint32(7)
-	for i, v := range c[:len(hist)] {
-		nbBits += uint32(v.nBits) * hist[i]
-	}
-	return int(nbBits >> 3)
-}
-
-// minSize returns the minimum possible size considering the shannon limit.
-func (s *Scratch) minSize(total int) int {
-	nbBits := float64(7)
-	fTotal := float64(total)
-	for _, v := range s.count[:s.symbolLen] {
-		n := float64(v)
-		if n > 0 {
-			nbBits += math.Log2(fTotal/n) * n
-		}
-	}
-	return int(nbBits) >> 3
-}
-
-func highBit32(val uint32) (n uint32) {
-	return uint32(bits.Len32(val) - 1)
-}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
deleted file mode 100644
index 3954c5121..000000000
--- a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
+++ /dev/null
@@ -1,34 +0,0 @@
-// Package cpuinfo gives runtime info about the current CPU.
-//
-// This is a very limited module meant for use internally
-// in this project. For more versatile solution check
-// https://github.com/klauspost/cpuid.
-package cpuinfo
-
-// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
-func HasBMI1() bool {
-	return hasBMI1
-}
-
-// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
-func HasBMI2() bool {
-	return hasBMI2
-}
-
-// DisableBMI2 will disable BMI2, for testing purposes.
-// Call returned function to restore previous state.
-func DisableBMI2() func() {
-	old := hasBMI2
-	hasBMI2 = false
-	return func() {
-		hasBMI2 = old
-	}
-}
-
-// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
-func HasBMI() bool {
-	return HasBMI1() && HasBMI2()
-}
-
-var hasBMI1 bool
-var hasBMI2 bool
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
deleted file mode 100644
index e802579c4..000000000
--- a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
-
-package cpuinfo
-
-// go:noescape
-func x86extensions() (bmi1, bmi2 bool)
-
-func init() {
-	hasBMI1, hasBMI2 = x86extensions()
-}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
deleted file mode 100644
index 4465fbe9e..000000000
--- a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
+++ /dev/null
@@ -1,36 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-TEXT ·x86extensions(SB), NOSPLIT, $0
-	// 1. determine max EAX value
-	XORQ AX, AX
-	CPUID
-
-	CMPQ AX, $7
-	JB   unsupported
-
-	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
-	MOVQ $7, AX
-	MOVQ $0, CX
-	CPUID
-
-	BTQ   $3, BX // bit 3 = BMI1
-	SETCS AL
-
-	BTQ   $8, BX // bit 8 = BMI2
-	SETCS AH
-
-	MOVB AL, bmi1+0(FP)
-	MOVB AH, bmi2+1(FP)
-	RET
-
-unsupported:
-	XORQ AX, AX
-	MOVB AL, bmi1+0(FP)
-	MOVB AL, bmi2+1(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/LICENSE b/vendor/github.com/klauspost/compress/internal/snapref/LICENSE
deleted file mode 100644
index 6050c10f4..000000000
--- a/vendor/github.com/klauspost/compress/internal/snapref/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/decode.go b/vendor/github.com/klauspost/compress/internal/snapref/decode.go
deleted file mode 100644
index 40796a49d..000000000
--- a/vendor/github.com/klauspost/compress/internal/snapref/decode.go
+++ /dev/null
@@ -1,264 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package snapref
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-)
-
-var (
-	// ErrCorrupt reports that the input is invalid.
-	ErrCorrupt = errors.New("snappy: corrupt input")
-	// ErrTooLarge reports that the uncompressed length is too large.
-	ErrTooLarge = errors.New("snappy: decoded block is too large")
-	// ErrUnsupported reports that the input isn't supported.
-	ErrUnsupported = errors.New("snappy: unsupported input")
-
-	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
-)
-
-// DecodedLen returns the length of the decoded block.
-func DecodedLen(src []byte) (int, error) {
-	v, _, err := decodedLen(src)
-	return v, err
-}
-
-// decodedLen returns the length of the decoded block and the number of bytes
-// that the length header occupied.
-func decodedLen(src []byte) (blockLen, headerLen int, err error) {
-	v, n := binary.Uvarint(src)
-	if n <= 0 || v > 0xffffffff {
-		return 0, 0, ErrCorrupt
-	}
-
-	const wordSize = 32 << (^uint(0) >> 32 & 1)
-	if wordSize == 32 && v > 0x7fffffff {
-		return 0, 0, ErrTooLarge
-	}
-	return int(v), n, nil
-}
-
-const (
-	decodeErrCodeCorrupt                  = 1
-	decodeErrCodeUnsupportedLiteralLength = 2
-)
-
-// Decode returns the decoded form of src. The returned slice may be a sub-
-// slice of dst if dst was large enough to hold the entire decoded block.
-// Otherwise, a newly allocated slice will be returned.
-//
-// The dst and src must not overlap. It is valid to pass a nil dst.
-//
-// Decode handles the Snappy block format, not the Snappy stream format.
-func Decode(dst, src []byte) ([]byte, error) {
-	dLen, s, err := decodedLen(src)
-	if err != nil {
-		return nil, err
-	}
-	if dLen <= len(dst) {
-		dst = dst[:dLen]
-	} else {
-		dst = make([]byte, dLen)
-	}
-	switch decode(dst, src[s:]) {
-	case 0:
-		return dst, nil
-	case decodeErrCodeUnsupportedLiteralLength:
-		return nil, errUnsupportedLiteralLength
-	}
-	return nil, ErrCorrupt
-}
-
-// NewReader returns a new Reader that decompresses from r, using the framing
-// format described at
-// https://github.com/google/snappy/blob/master/framing_format.txt
-func NewReader(r io.Reader) *Reader {
-	return &Reader{
-		r:       r,
-		decoded: make([]byte, maxBlockSize),
-		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
-	}
-}
-
-// Reader is an io.Reader that can read Snappy-compressed bytes.
-//
-// Reader handles the Snappy stream format, not the Snappy block format.
-type Reader struct {
-	r       io.Reader
-	err     error
-	decoded []byte
-	buf     []byte
-	// decoded[i:j] contains decoded bytes that have not yet been passed on.
-	i, j       int
-	readHeader bool
-}
-
-// Reset discards any buffered data, resets all state, and switches the Snappy
-// reader to read from r. This permits reusing a Reader rather than allocating
-// a new one.
-func (r *Reader) Reset(reader io.Reader) {
-	r.r = reader
-	r.err = nil
-	r.i = 0
-	r.j = 0
-	r.readHeader = false
-}
-
-func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
-	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
-		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
-			r.err = ErrCorrupt
-		}
-		return false
-	}
-	return true
-}
-
-func (r *Reader) fill() error {
-	for r.i >= r.j {
-		if !r.readFull(r.buf[:4], true) {
-			return r.err
-		}
-		chunkType := r.buf[0]
-		if !r.readHeader {
-			if chunkType != chunkTypeStreamIdentifier {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			r.readHeader = true
-		}
-		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
-		if chunkLen > len(r.buf) {
-			r.err = ErrUnsupported
-			return r.err
-		}
-
-		// The chunk types are specified at
-		// https://github.com/google/snappy/blob/master/framing_format.txt
-		switch chunkType {
-		case chunkTypeCompressedData:
-			// Section 4.2. Compressed data (chunk type 0x00).
-			if chunkLen < checksumSize {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			buf := r.buf[:chunkLen]
-			if !r.readFull(buf, false) {
-				return r.err
-			}
-			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
-			buf = buf[checksumSize:]
-
-			n, err := DecodedLen(buf)
-			if err != nil {
-				r.err = err
-				return r.err
-			}
-			if n > len(r.decoded) {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			if _, err := Decode(r.decoded, buf); err != nil {
-				r.err = err
-				return r.err
-			}
-			if crc(r.decoded[:n]) != checksum {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			r.i, r.j = 0, n
-			continue
-
-		case chunkTypeUncompressedData:
-			// Section 4.3. Uncompressed data (chunk type 0x01).
-			if chunkLen < checksumSize {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			buf := r.buf[:checksumSize]
-			if !r.readFull(buf, false) {
-				return r.err
-			}
-			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
-			// Read directly into r.decoded instead of via r.buf.
-			n := chunkLen - checksumSize
-			if n > len(r.decoded) {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			if !r.readFull(r.decoded[:n], false) {
-				return r.err
-			}
-			if crc(r.decoded[:n]) != checksum {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			r.i, r.j = 0, n
-			continue
-
-		case chunkTypeStreamIdentifier:
-			// Section 4.1. Stream identifier (chunk type 0xff).
-			if chunkLen != len(magicBody) {
-				r.err = ErrCorrupt
-				return r.err
-			}
-			if !r.readFull(r.buf[:len(magicBody)], false) {
-				return r.err
-			}
-			for i := 0; i < len(magicBody); i++ {
-				if r.buf[i] != magicBody[i] {
-					r.err = ErrCorrupt
-					return r.err
-				}
-			}
-			continue
-		}
-
-		if chunkType <= 0x7f {
-			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
-			r.err = ErrUnsupported
-			return r.err
-		}
-		// Section 4.4 Padding (chunk type 0xfe).
-		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
-		if !r.readFull(r.buf[:chunkLen], false) {
-			return r.err
-		}
-	}
-
-	return nil
-}
-
-// Read satisfies the io.Reader interface.
-func (r *Reader) Read(p []byte) (int, error) {
-	if r.err != nil {
-		return 0, r.err
-	}
-
-	if err := r.fill(); err != nil {
-		return 0, err
-	}
-
-	n := copy(p, r.decoded[r.i:r.j])
-	r.i += n
-	return n, nil
-}
-
-// ReadByte satisfies the io.ByteReader interface.
-func (r *Reader) ReadByte() (byte, error) {
-	if r.err != nil {
-		return 0, r.err
-	}
-
-	if err := r.fill(); err != nil {
-		return 0, err
-	}
-
-	c := r.decoded[r.i]
-	r.i++
-	return c, nil
-}
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go
deleted file mode 100644
index 77395a6b8..000000000
--- a/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package snapref
-
-// decode writes the decoding of src to dst. It assumes that the varint-encoded
-// length of the decompressed bytes has already been read, and that len(dst)
-// equals that length.
-//
-// It returns 0 on success or a decodeErrCodeXxx error code on failure.
-func decode(dst, src []byte) int {
-	var d, s, offset, length int
-	for s < len(src) {
-		switch src[s] & 0x03 {
-		case tagLiteral:
-			x := uint32(src[s] >> 2)
-			switch {
-			case x < 60:
-				s++
-			case x == 60:
-				s += 2
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-1])
-			case x == 61:
-				s += 3
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-2]) | uint32(src[s-1])<<8
-			case x == 62:
-				s += 4
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-			case x == 63:
-				s += 5
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-			}
-			length = int(x) + 1
-			if length <= 0 {
-				return decodeErrCodeUnsupportedLiteralLength
-			}
-			if length > len(dst)-d || length > len(src)-s {
-				return decodeErrCodeCorrupt
-			}
-			copy(dst[d:], src[s:s+length])
-			d += length
-			s += length
-			continue
-
-		case tagCopy1:
-			s += 2
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return decodeErrCodeCorrupt
-			}
-			length = 4 + int(src[s-2])>>2&0x7
-			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
-
-		case tagCopy2:
-			s += 3
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return decodeErrCodeCorrupt
-			}
-			length = 1 + int(src[s-3])>>2
-			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
-
-		case tagCopy4:
-			s += 5
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return decodeErrCodeCorrupt
-			}
-			length = 1 + int(src[s-5])>>2
-			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
-		}
-
-		if offset <= 0 || d < offset || length > len(dst)-d {
-			return decodeErrCodeCorrupt
-		}
-		// Copy from an earlier sub-slice of dst to a later sub-slice.
-		// If no overlap, use the built-in copy:
-		if offset >= length {
-			copy(dst[d:d+length], dst[d-offset:])
-			d += length
-			continue
-		}
-
-		// Unlike the built-in copy function, this byte-by-byte copy always runs
-		// forwards, even if the slices overlap. Conceptually, this is:
-		//
-		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
-		//
-		// We align the slices into a and b and show the compiler they are the same size.
-		// This allows the loop to run without bounds checks.
-		a := dst[d : d+length]
-		b := dst[d-offset:]
-		b = b[:len(a)]
-		for i := range a {
-			a[i] = b[i]
-		}
-		d += length
-	}
-	if d != len(dst) {
-		return decodeErrCodeCorrupt
-	}
-	return 0
-}
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/encode.go b/vendor/github.com/klauspost/compress/internal/snapref/encode.go
deleted file mode 100644
index 13c6040a5..000000000
--- a/vendor/github.com/klauspost/compress/internal/snapref/encode.go
+++ /dev/null
@@ -1,289 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package snapref
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-)
-
-// Encode returns the encoded form of src. The returned slice may be a sub-
-// slice of dst if dst was large enough to hold the entire encoded block.
-// Otherwise, a newly allocated slice will be returned.
-//
-// The dst and src must not overlap. It is valid to pass a nil dst.
-//
-// Encode handles the Snappy block format, not the Snappy stream format.
-func Encode(dst, src []byte) []byte {
-	if n := MaxEncodedLen(len(src)); n < 0 {
-		panic(ErrTooLarge)
-	} else if len(dst) < n {
-		dst = make([]byte, n)
-	}
-
-	// The block starts with the varint-encoded length of the decompressed bytes.
-	d := binary.PutUvarint(dst, uint64(len(src)))
-
-	for len(src) > 0 {
-		p := src
-		src = nil
-		if len(p) > maxBlockSize {
-			p, src = p[:maxBlockSize], p[maxBlockSize:]
-		}
-		if len(p) < minNonLiteralBlockSize {
-			d += emitLiteral(dst[d:], p)
-		} else {
-			d += encodeBlock(dst[d:], p)
-		}
-	}
-	return dst[:d]
-}
-
-// inputMargin is the minimum number of extra input bytes to keep, inside
-// encodeBlock's inner loop. On some architectures, this margin lets us
-// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
-// literals can be implemented as a single load to and store from a 16-byte
-// register. That literal's actual length can be as short as 1 byte, so this
-// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
-// the encoding loop will fix up the copy overrun, and this inputMargin ensures
-// that we don't overrun the dst and src buffers.
-const inputMargin = 16 - 1
-
-// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
-// could be encoded with a copy tag. This is the minimum with respect to the
-// algorithm used by encodeBlock, not a minimum enforced by the file format.
-//
-// The encoded output must start with at least a 1 byte literal, as there are
-// no previous bytes to copy. A minimal (1 byte) copy after that, generated
-// from an emitCopy call in encodeBlock's main loop, would require at least
-// another inputMargin bytes, for the reason above: we want any emitLiteral
-// calls inside encodeBlock's main loop to use the fast path if possible, which
-// requires being able to overrun by inputMargin bytes. Thus,
-// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
-//
-// The C++ code doesn't use this exact threshold, but it could, as discussed at
-// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
-// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
-// optimization. It should not affect the encoded form. This is tested by
-// TestSameEncodingAsCppShortCopies.
-const minNonLiteralBlockSize = 1 + 1 + inputMargin
-
-// MaxEncodedLen returns the maximum length of a snappy block, given its
-// uncompressed length.
-//
-// It will return a negative value if srcLen is too large to encode.
-func MaxEncodedLen(srcLen int) int {
-	n := uint64(srcLen)
-	if n > 0xffffffff {
-		return -1
-	}
-	// Compressed data can be defined as:
-	//    compressed := item* literal*
-	//    item       := literal* copy
-	//
-	// The trailing literal sequence has a space blowup of at most 62/60
-	// since a literal of length 60 needs one tag byte + one extra byte
-	// for length information.
-	//
-	// Item blowup is trickier to measure. Suppose the "copy" op copies
-	// 4 bytes of data. Because of a special check in the encoding code,
-	// we produce a 4-byte copy only if the offset is < 65536. Therefore
-	// the copy op takes 3 bytes to encode, and this type of item leads
-	// to at most the 62/60 blowup for representing literals.
-	//
-	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
-	// enough, it will take 5 bytes to encode the copy op. Therefore the
-	// worst case here is a one-byte literal followed by a five-byte copy.
-	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
-	//
-	// This last factor dominates the blowup, so the final estimate is:
-	n = 32 + n + n/6
-	if n > 0xffffffff {
-		return -1
-	}
-	return int(n)
-}
-
-var errClosed = errors.New("snappy: Writer is closed")
-
-// NewWriter returns a new Writer that compresses to w.
-//
-// The Writer returned does not buffer writes. There is no need to Flush or
-// Close such a Writer.
-//
-// Deprecated: the Writer returned is not suitable for many small writes, only
-// for few large writes. Use NewBufferedWriter instead, which is efficient
-// regardless of the frequency and shape of the writes, and remember to Close
-// that Writer when done.
-func NewWriter(w io.Writer) *Writer {
-	return &Writer{
-		w:    w,
-		obuf: make([]byte, obufLen),
-	}
-}
-
-// NewBufferedWriter returns a new Writer that compresses to w, using the
-// framing format described at
-// https://github.com/google/snappy/blob/master/framing_format.txt
-//
-// The Writer returned buffers writes. Users must call Close to guarantee all
-// data has been forwarded to the underlying io.Writer. They may also call
-// Flush zero or more times before calling Close.
-func NewBufferedWriter(w io.Writer) *Writer {
-	return &Writer{
-		w:    w,
-		ibuf: make([]byte, 0, maxBlockSize),
-		obuf: make([]byte, obufLen),
-	}
-}
-
-// Writer is an io.Writer that can write Snappy-compressed bytes.
-//
-// Writer handles the Snappy stream format, not the Snappy block format.
-type Writer struct {
-	w   io.Writer
-	err error
-
-	// ibuf is a buffer for the incoming (uncompressed) bytes.
-	//
-	// Its use is optional. For backwards compatibility, Writers created by the
-	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
-	// therefore do not need to be Flush'ed or Close'd.
-	ibuf []byte
-
-	// obuf is a buffer for the outgoing (compressed) bytes.
-	obuf []byte
-
-	// wroteStreamHeader is whether we have written the stream header.
-	wroteStreamHeader bool
-}
-
-// Reset discards the writer's state and switches the Snappy writer to write to
-// w. This permits reusing a Writer rather than allocating a new one.
-func (w *Writer) Reset(writer io.Writer) {
-	w.w = writer
-	w.err = nil
-	if w.ibuf != nil {
-		w.ibuf = w.ibuf[:0]
-	}
-	w.wroteStreamHeader = false
-}
-
-// Write satisfies the io.Writer interface.
-func (w *Writer) Write(p []byte) (nRet int, errRet error) {
-	if w.ibuf == nil {
-		// Do not buffer incoming bytes. This does not perform or compress well
-		// if the caller of Writer.Write writes many small slices. This
-		// behavior is therefore deprecated, but still supported for backwards
-		// compatibility with code that doesn't explicitly Flush or Close.
-		return w.write(p)
-	}
-
-	// The remainder of this method is based on bufio.Writer.Write from the
-	// standard library.
-
-	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
-		var n int
-		if len(w.ibuf) == 0 {
-			// Large write, empty buffer.
-			// Write directly from p to avoid copy.
-			n, _ = w.write(p)
-		} else {
-			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
-			w.ibuf = w.ibuf[:len(w.ibuf)+n]
-			w.Flush()
-		}
-		nRet += n
-		p = p[n:]
-	}
-	if w.err != nil {
-		return nRet, w.err
-	}
-	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
-	w.ibuf = w.ibuf[:len(w.ibuf)+n]
-	nRet += n
-	return nRet, nil
-}
-
-func (w *Writer) write(p []byte) (nRet int, errRet error) {
-	if w.err != nil {
-		return 0, w.err
-	}
-	for len(p) > 0 {
-		obufStart := len(magicChunk)
-		if !w.wroteStreamHeader {
-			w.wroteStreamHeader = true
-			copy(w.obuf, magicChunk)
-			obufStart = 0
-		}
-
-		var uncompressed []byte
-		if len(p) > maxBlockSize {
-			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
-		} else {
-			uncompressed, p = p, nil
-		}
-		checksum := crc(uncompressed)
-
-		// Compress the buffer, discarding the result if the improvement
-		// isn't at least 12.5%.
-		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
-		chunkType := uint8(chunkTypeCompressedData)
-		chunkLen := 4 + len(compressed)
-		obufEnd := obufHeaderLen + len(compressed)
-		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
-			chunkType = chunkTypeUncompressedData
-			chunkLen = 4 + len(uncompressed)
-			obufEnd = obufHeaderLen
-		}
-
-		// Fill in the per-chunk header that comes before the body.
-		w.obuf[len(magicChunk)+0] = chunkType
-		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
-		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
-		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
-		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
-		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
-		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
-		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
-
-		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
-			w.err = err
-			return nRet, err
-		}
-		if chunkType == chunkTypeUncompressedData {
-			if _, err := w.w.Write(uncompressed); err != nil {
-				w.err = err
-				return nRet, err
-			}
-		}
-		nRet += len(uncompressed)
-	}
-	return nRet, nil
-}
-
-// Flush flushes the Writer to its underlying io.Writer.
-func (w *Writer) Flush() error {
-	if w.err != nil {
-		return w.err
-	}
-	if len(w.ibuf) == 0 {
-		return nil
-	}
-	w.write(w.ibuf)
-	w.ibuf = w.ibuf[:0]
-	return w.err
-}
-
-// Close calls Flush and then closes the Writer.
-func (w *Writer) Close() error {
-	w.Flush()
-	ret := w.err
-	if w.err == nil {
-		w.err = errClosed
-	}
-	return ret
-}
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
deleted file mode 100644
index 2754bac6f..000000000
--- a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
+++ /dev/null
@@ -1,250 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package snapref
-
-func load32(b []byte, i int) uint32 {
-	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-//
-// It assumes that:
-//
-//	dst is long enough to hold the encoded bytes
-//	1 <= len(lit) && len(lit) <= 65536
-func emitLiteral(dst, lit []byte) int {
-	i, n := 0, uint(len(lit)-1)
-	switch {
-	case n < 60:
-		dst[0] = uint8(n)<<2 | tagLiteral
-		i = 1
-	case n < 1<<8:
-		dst[0] = 60<<2 | tagLiteral
-		dst[1] = uint8(n)
-		i = 2
-	default:
-		dst[0] = 61<<2 | tagLiteral
-		dst[1] = uint8(n)
-		dst[2] = uint8(n >> 8)
-		i = 3
-	}
-	return i + copy(dst[i:], lit)
-}
-
-// emitCopy writes a copy chunk and returns the number of bytes written.
-//
-// It assumes that:
-//
-//	dst is long enough to hold the encoded bytes
-//	1 <= offset && offset <= 65535
-//	4 <= length && length <= 65535
-func emitCopy(dst []byte, offset, length int) int {
-	i := 0
-	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
-	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
-	// length emitted down below is a little lower (at 60 = 64 - 4), because
-	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
-	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
-	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
-	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
-	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
-	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
-	for length >= 68 {
-		// Emit a length 64 copy, encoded as 3 bytes.
-		dst[i+0] = 63<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= 64
-	}
-	if length > 64 {
-		// Emit a length 60 copy, encoded as 3 bytes.
-		dst[i+0] = 59<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= 60
-	}
-	if length >= 12 || offset >= 2048 {
-		// Emit the remaining copy, encoded as 3 bytes.
-		dst[i+0] = uint8(length-1)<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		return i + 3
-	}
-	// Emit the remaining copy, encoded as 2 bytes.
-	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
-	dst[i+1] = uint8(offset)
-	return i + 2
-}
-
-func hash(u, shift uint32) uint32 {
-	return (u * 0x1e35a7bd) >> shift
-}
-
-// EncodeBlockInto exposes encodeBlock but checks dst size.
-func EncodeBlockInto(dst, src []byte) (d int) {
-	if MaxEncodedLen(len(src)) > len(dst) {
-		return 0
-	}
-
-	// encodeBlock breaks on too big blocks, so split.
-	for len(src) > 0 {
-		p := src
-		src = nil
-		if len(p) > maxBlockSize {
-			p, src = p[:maxBlockSize], p[maxBlockSize:]
-		}
-		if len(p) < minNonLiteralBlockSize {
-			d += emitLiteral(dst[d:], p)
-		} else {
-			d += encodeBlock(dst[d:], p)
-		}
-	}
-	return d
-}
-
-// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
-// assumes that the varint-encoded length of the decompressed bytes has already
-// been written.
-//
-// It also assumes that:
-//
-//	len(dst) >= MaxEncodedLen(len(src)) &&
-//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const (
-		maxTableSize = 1 << 14
-		// tableMask is redundant, but helps the compiler eliminate bounds
-		// checks.
-		tableMask = maxTableSize - 1
-	)
-	shift := uint32(32 - 8)
-	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-		shift--
-	}
-	// In Go, all array elements are zero-initialized, so there is no advantage
-	// to a smaller tableSize per se. However, it matches the C++ algorithm,
-	// and in the asm versions of this code, we can get away with zeroing only
-	// the first tableSize elements.
-	var table [maxTableSize]uint16
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := len(src) - inputMargin
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := 0
-
-	// The encoded form must start with a literal, as there are no previous
-	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
-	nextHash := hash(load32(src, s), shift)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := 32
-
-		nextS := s
-		candidate := 0
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidate = int(table[nextHash&tableMask])
-			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
-				break
-			}
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		d += emitLiteral(dst[d:], src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-			base := s
-
-			// Extend the 4-byte match as long as possible.
-			//
-			// This is an inlined version of:
-			//	s = extendMatch(src, candidate+4, s+4)
-			s += 4
-			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
-			}
-
-			d += emitCopy(dst[d:], base-candidate, s-base)
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load64(src, s-1)
-			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash&tableMask])
-			table[currHash&tableMask] = uint16(s)
-			if uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x>>16), shift)
-				s++
-				break
-			}
-		}
-	}
-
-emitRemainder:
-	if nextEmit < len(src) {
-		d += emitLiteral(dst[d:], src[nextEmit:])
-	}
-	return d
-}
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/snappy.go b/vendor/github.com/klauspost/compress/internal/snapref/snappy.go
deleted file mode 100644
index 34d01f4aa..000000000
--- a/vendor/github.com/klauspost/compress/internal/snapref/snappy.go
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package snapref implements the Snappy compression format. It aims for very
-// high speeds and reasonable compression.
-//
-// There are actually two Snappy formats: block and stream. They are related,
-// but different: trying to decompress block-compressed data as a Snappy stream
-// will fail, and vice versa. The block format is the Decode and Encode
-// functions and the stream format is the Reader and Writer types.
-//
-// The block format, the more common case, is used when the complete size (the
-// number of bytes) of the original data is known upfront, at the time
-// compression starts. The stream format, also known as the framing format, is
-// for when that isn't always true.
-//
-// The canonical, C++ implementation is at https://github.com/google/snappy and
-// it only implements the block format.
-package snapref
-
-import (
-	"hash/crc32"
-)
-
-/*
-Each encoded block begins with the varint-encoded length of the decoded data,
-followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
-first byte of each chunk is broken into its 2 least and 6 most significant bits
-called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
-Zero means a literal tag. All other values mean a copy tag.
-
-For literal tags:
-  - If m < 60, the next 1 + m bytes are literal bytes.
-  - Otherwise, let n be the little-endian unsigned integer denoted by the next
-    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
-
-For copy tags, length bytes are copied from offset bytes ago, in the style of
-Lempel-Ziv compression algorithms. In particular:
-  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
-    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
-    of the offset. The next byte is bits 0-7 of the offset.
-  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
-    The length is 1 + m. The offset is the little-endian unsigned integer
-    denoted by the next 2 bytes.
-  - For l == 3, this tag is a legacy format that is no longer issued by most
-    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
-    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
-    integer denoted by the next 4 bytes.
-*/
-const (
-	tagLiteral = 0x00
-	tagCopy1   = 0x01
-	tagCopy2   = 0x02
-	tagCopy4   = 0x03
-)
-
-const (
-	checksumSize    = 4
-	chunkHeaderSize = 4
-	magicChunk      = "\xff\x06\x00\x00" + magicBody
-	magicBody       = "sNaPpY"
-
-	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
-	// part of the wire format per se, but some parts of the encoder assume
-	// that an offset fits into a uint16.
-	//
-	// Also, for the framing format (Writer type instead of Encode function),
-	// https://github.com/google/snappy/blob/master/framing_format.txt says
-	// that "the uncompressed data in a chunk must be no longer than 65536
-	// bytes".
-	maxBlockSize = 65536
-
-	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
-	// hard coded to be a const instead of a variable, so that obufLen can also
-	// be a const. Their equivalence is confirmed by
-	// TestMaxEncodedLenOfMaxBlockSize.
-	maxEncodedLenOfMaxBlockSize = 76490
-
-	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
-	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
-)
-
-const (
-	chunkTypeCompressedData   = 0x00
-	chunkTypeUncompressedData = 0x01
-	chunkTypePadding          = 0xfe
-	chunkTypeStreamIdentifier = 0xff
-)
-
-var crcTable = crc32.MakeTable(crc32.Castagnoli)
-
-// crc implements the checksum specified in section 3 of
-// https://github.com/google/snappy/blob/master/framing_format.txt
-func crc(b []byte) uint32 {
-	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
-}
diff --git a/vendor/github.com/klauspost/compress/s2sx.mod b/vendor/github.com/klauspost/compress/s2sx.mod
deleted file mode 100644
index 81bda5e29..000000000
--- a/vendor/github.com/klauspost/compress/s2sx.mod
+++ /dev/null
@@ -1,3 +0,0 @@
-module github.com/klauspost/compress
-
-go 1.22
diff --git a/vendor/github.com/klauspost/compress/s2sx.sum b/vendor/github.com/klauspost/compress/s2sx.sum
deleted file mode 100644
index e69de29bb..000000000
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
deleted file mode 100644
index c11d7fa28..000000000
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ /dev/null
@@ -1,441 +0,0 @@
-# zstd 
-
-[Zstandard](https://facebook.github.io/zstd/) is a real-time compression algorithm, providing high compression ratios. 
-It offers a very wide range of compression / speed trade-off, while being backed by a very fast decoder.
-A high performance compression algorithm is implemented. For now focused on speed. 
-
-This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
-
-This package is pure Go. Use `noasm` and `nounsafe` to disable relevant features.
-
-The `zstd` package is provided as open source software using a Go standard license.
-
-Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
-
-For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
-
-## Installation
-
-Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
-
-[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/compress/zstd.svg)](https://pkg.go.dev/github.com/klauspost/compress/zstd)
-
-## Compressor
-
-### Status: 
-
-STABLE - there may always be subtle bugs, a wide variety of content has been tested and the library is actively 
-used by several projects. This library is being [fuzz-tested](https://github.com/klauspost/compress-fuzz) for all updates.
-
-There may still be specific combinations of data types/size/settings that could lead to edge cases, 
-so as always, testing is recommended.  
-
-For now, a high speed (fastest) and medium-fast (default) compressor has been implemented. 
-
-* The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
-* The "Default" compression ratio is roughly equivalent to zstd level 3 (default).
-* The "Better" compression ratio is roughly equivalent to zstd level 7.
-* The "Best" compression ratio is roughly equivalent to zstd level 11.
-
-In terms of speed, it is typically 2x as fast as the stdlib deflate/gzip in its fastest mode. 
-The compression ratio compared to stdlib is around level 3, but usually 3x as fast.
-
- 
-### Usage
-
-An Encoder can be used for either compressing a stream via the
-`io.WriteCloser` interface supported by the Encoder or as multiple independent
-tasks via the `EncodeAll` function.
-Smaller encodes are encouraged to use the EncodeAll function.
-Use `NewWriter` to create a new instance that can be used for both.
-
-To create a writer with default options, do like this:
-
-```Go
-// Compress input to output.
-func Compress(in io.Reader, out io.Writer) error {
-    enc, err := zstd.NewWriter(out)
-    if err != nil {
-        return err
-    }
-    _, err = io.Copy(enc, in)
-    if err != nil {
-        enc.Close()
-        return err
-    }
-    return enc.Close()
-}
-```
-
-Now you can encode by writing data to `enc`. The output will be finished writing when `Close()` is called.
-Even if your encode fails, you should still call `Close()` to release any resources that may be held up.  
-
-The above is fine for big encodes. However, whenever possible try to *reuse* the writer.
-
-To reuse the encoder, you can use the `Reset(io.Writer)` function to change to another output. 
-This will allow the encoder to reuse all resources and avoid wasteful allocations. 
-
-Currently stream encoding has 'light' concurrency, meaning up to 2 goroutines can be working on part 
-of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is likely to change 
-in the future. So if you want to limit concurrency for future updates, specify the concurrency
-you would like.
-
-If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)`
-which will compress input as each block is completed, blocking on writes until each has completed.
-
-You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined 
-compression settings can be specified.
-
-#### Future Compatibility Guarantees
-
-This will be an evolving project. When using this package it is important to note that both the compression efficiency and speed may change.
-
-The goal will be to keep the default efficiency at the default zstd (level 3). 
-However the encoding should never be assumed to remain the same, 
-and you should not use hashes of compressed output for similarity checks.
-
-The Encoder can be assumed to produce the same output from the exact same code version.
-However, the may be modes in the future that break this, 
-although they will not be enabled without an explicit option.   
-
-This encoder is not designed to (and will probably never) output the exact same bitstream as the reference encoder.
-
-Also note, that the cgo decompressor currently does not [report all errors on invalid input](https://github.com/DataDog/zstd/issues/59),
-[omits error checks](https://github.com/DataDog/zstd/issues/61), [ignores checksums](https://github.com/DataDog/zstd/issues/43) 
-and seems to ignore concatenated streams, even though [it is part of the spec](https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frames).
-
-#### Blocks
-
-For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
-
-`EncodeAll` will encode all input in src and append it to dst.
-This function can be called concurrently. 
-Each call will only run on a same goroutine as the caller.
-
-Encoded blocks can be concatenated and the result will be the combined input stream.
-Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
-
-Especially when encoding blocks you should take special care to reuse the encoder. 
-This will effectively make it run without allocations after a warmup period. 
-To make it run completely without allocations, supply a destination buffer with space for all content.   
-
-```Go
-import "github.com/klauspost/compress/zstd"
-
-// Create a writer that caches compressors.
-// For this operation type we supply a nil Reader.
-var encoder, _ = zstd.NewWriter(nil)
-
-// Compress a buffer. 
-// If you have a destination buffer, the allocation in the call can also be eliminated.
-func Compress(src []byte) []byte {
-    return encoder.EncodeAll(src, make([]byte, 0, len(src)))
-} 
-```
-
-You can control the maximum number of concurrent encodes using the `WithEncoderConcurrency(n)` 
-option when creating the writer.
-
-Using the Encoder for both a stream and individual blocks concurrently is safe. 
-
-### Performance
-
-I have collected some speed examples to compare speed and compression against other compressors.
-
-* `file` is the input file.
-* `out` is the compressor used. `zskp` is this package. `zstd` is the Datadog cgo library. `gzstd/gzkp` is gzip standard and this library.
-* `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default"; 3 is "better", 4 is "best".
-* `insize`/`outsize` is the input/output size.
-* `millis` is the number of milliseconds used for compression.
-* `mb/s` is megabytes (2^20 bytes) per second.
-
-```
-Silesia Corpus:
-http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
-
-This package:
-file    out     level   insize      outsize     millis  mb/s
-silesia.tar zskp    1   211947520   73821326    634     318.47
-silesia.tar zskp    2   211947520   67655404    1508    133.96
-silesia.tar zskp    3   211947520   64746933    3000    67.37
-silesia.tar zskp    4   211947520   60073508    16926   11.94
-
-cgo zstd:
-silesia.tar zstd    1   211947520   73605392    543     371.56
-silesia.tar zstd    3   211947520   66793289    864     233.68
-silesia.tar zstd    6   211947520   62916450    1913    105.66
-silesia.tar zstd    9   211947520   60212393    5063    39.92
-
-gzip, stdlib/this package:
-silesia.tar gzstd   1   211947520   80007735    1498    134.87
-silesia.tar gzkp    1   211947520   80088272    1009    200.31
-
-GOB stream of binary data. Highly compressible.
-https://files.klauspost.com/compress/gob-stream.7z
-
-file        out     level   insize  outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  233948096   3230    564.34
-gob-stream  zskp    2   1911399616  203997694   4997    364.73
-gob-stream  zskp    3   1911399616  173526523   13435   135.68
-gob-stream  zskp    4   1911399616  162195235   47559   38.33
-
-gob-stream  zstd    1   1911399616  249810424   2637    691.26
-gob-stream  zstd    3   1911399616  208192146   3490    522.31
-gob-stream  zstd    6   1911399616  193632038   6687    272.56
-gob-stream  zstd    9   1911399616  177620386   16175   112.70
-
-gob-stream  gzstd   1   1911399616  357382013   9046    201.49
-gob-stream  gzkp    1   1911399616  359136669   4885    373.08
-
-The test data for the Large Text Compression Benchmark is the first
-10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
-http://mattmahoney.net/dc/textdata.html
-
-file    out level   insize      outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343833605   3687    258.64
-enwik9  zskp    2   1000000000  317001237   7672    124.29
-enwik9  zskp    3   1000000000  291915823   15923   59.89
-enwik9  zskp    4   1000000000  261710291   77697   12.27
-
-enwik9  zstd    1   1000000000  358072021   3110    306.65
-enwik9  zstd    3   1000000000  313734672   4784    199.35
-enwik9  zstd    6   1000000000  295138875   10290   92.68
-enwik9  zstd    9   1000000000  278348700   28549   33.40
-
-enwik9  gzstd   1   1000000000  382578136   8608    110.78
-enwik9  gzkp    1   1000000000  382781160   5628    169.45
-
-Highly compressible JSON file.
-https://files.klauspost.com/compress/github-june-2days-2019.json.zst
-
-file                        out level   insize      outsize     millis  mb/s
-github-june-2days-2019.json zskp    1   6273951764  697439532   9789    611.17
-github-june-2days-2019.json zskp    2   6273951764  610876538   18553   322.49
-github-june-2days-2019.json zskp    3   6273951764  517662858   44186   135.41
-github-june-2days-2019.json zskp    4   6273951764  464617114   165373  36.18
-
-github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
-github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
-github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
-github-june-2days-2019.json zstd    9   6273951764  601974523   52413   114.16
-
-github-june-2days-2019.json gzstd   1   6273951764  1164397768  26793   223.32
-github-june-2days-2019.json gzkp    1   6273951764  1120631856  17693   338.16
-
-VM Image, Linux mint with a few installed applications:
-https://files.klauspost.com/compress/rawstudio-mint14.7z
-
-file                    out level   insize      outsize     millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3718400221  18206   448.29
-rawstudio-mint14.tar    zskp    2   8558382592  3326118337  37074   220.15
-rawstudio-mint14.tar    zskp    3   8558382592  3163842361  87306   93.49
-rawstudio-mint14.tar    zskp    4   8558382592  2970480650  783862  10.41
-
-rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
-rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
-rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
-rawstudio-mint14.tar    zstd    9   8558382592  3160778861  140946  57.91
-
-rawstudio-mint14.tar    gzstd   1   8558382592  3926234992  51345   158.96
-rawstudio-mint14.tar    gzkp    1   8558382592  3960117298  36722   222.26
-
-CSV data:
-https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
-
-file                    out level   insize      outsize     millis  mb/s
-nyc-taxi-data-10M.csv   zskp    1   3325605752  641319332   9462    335.17
-nyc-taxi-data-10M.csv   zskp    2   3325605752  588976126   17570   180.50
-nyc-taxi-data-10M.csv   zskp    3   3325605752  529329260   32432   97.79
-nyc-taxi-data-10M.csv   zskp    4   3325605752  474949772   138025  22.98
-
-nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
-nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
-nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
-nyc-taxi-data-10M.csv   zstd    9   3325605752  517554797   64565   49.12
-
-nyc-taxi-data-10M.csv   gzstd   1   3325605752  928654908   21270   149.11
-nyc-taxi-data-10M.csv   gzkp    1   3325605752  922273214   13929   227.68
-```
-
-## Decompressor
-
-Status: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
-
-This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
-kindly supplied by [fuzzit.dev](https://fuzzit.dev/). 
-The main purpose of the fuzz testing is to ensure that it is not possible to crash the decoder, 
-or run it past its limits with ANY input provided.  
- 
-### Usage
-
-The package has been designed for two main usages, big streams of data and smaller in-memory buffers. 
-There are two main usages of the package for these. Both of them are accessed by creating a `Decoder`.
-
-For streaming use a simple setup could look like this:
-
-```Go
-import "github.com/klauspost/compress/zstd"
-
-func Decompress(in io.Reader, out io.Writer) error {
-    d, err := zstd.NewReader(in)
-    if err != nil {
-        return err
-    }
-    defer d.Close()
-    
-    // Copy content...
-    _, err = io.Copy(out, d)
-    return err
-}
-```
-
-It is important to use the "Close" function when you no longer need the Reader to stop running goroutines, 
-when running with default settings.
-Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream.
-
-Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput.
-However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)` which will decompress data 
-as it is being requested only.
-
-For decoding buffers, it could look something like this:
-
-```Go
-import "github.com/klauspost/compress/zstd"
-
-// Create a reader that caches decompressors.
-// For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
-
-// Decompress a buffer. We don't supply a destination buffer,
-// so it will be allocated by the decoder.
-func Decompress(src []byte) ([]byte, error) {
-    return decoder.DecodeAll(src, nil)
-} 
-```
-
-Both of these cases should provide the functionality needed. 
-The decoder can be used for *concurrent* decompression of multiple buffers.
-By default 4 decompressors will be created. 
-
-It will only allow a certain number of concurrent operations to run. 
-To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
-It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders.
-
-### Dictionaries
-
-Data compressed with [dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression) can be decompressed.
-
-Dictionaries are added individually to Decoders.
-Dictionaries are generated by the `zstd --train` command and contains an initial state for the decoder.
-To add a dictionary use the `WithDecoderDicts(dicts ...[]byte)` option with the dictionary data.
-Several dictionaries can be added at once.
-
-The dictionary will be used automatically for the data that specifies them.
-A re-used Decoder will still contain the dictionaries registered.
-
-When registering multiple dictionaries with the same ID, the last one will be used.
-
-It is possible to use dictionaries when compressing data.
-
-To enable a dictionary use `WithEncoderDict(dict []byte)`. Here only one dictionary will be used 
-and it will likely be used even if it doesn't improve compression. 
-
-The used dictionary must be used to decompress the content.
-
-For any real gains, the dictionary should be built with similar data. 
-If an unsuitable dictionary is used the output may be slightly larger than using no dictionary.
-Use the [zstd commandline tool](https://github.com/facebook/zstd/releases) to build a dictionary from sample data.
-For information see [zstd dictionary information](https://github.com/facebook/zstd#the-case-for-small-data-compression). 
-
-For now there is a fixed startup performance penalty for compressing content with dictionaries. 
-This will likely be improved over time. Just be aware to test performance when implementing.  
-
-### Allocation-less operation
-
-The decoder has been designed to operate without allocations after a warmup. 
-
-This means that you should *store* the decoder for best performance. 
-To re-use a stream decoder, use the `Reset(r io.Reader) error` to switch to another stream.
-A decoder can safely be re-used even if the previous stream failed.
-
-To release the resources, you must call the `Close()` function on a decoder.
-After this it can *no longer be reused*, but all running goroutines will be stopped.
-So you *must* use this if you will no longer need the Reader.
-
-For decompressing smaller buffers a single decoder can be used.
-When decoding buffers, you can supply a destination slice with length 0 and your expected capacity.
-In this case no unneeded allocations should be made. 
-
-### Concurrency
-
-The buffer decoder does everything on the same goroutine and does nothing concurrently.
-It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
-
-The stream decoder will create goroutines that:
-
-1) Reads input and splits the input into blocks.
-2) Decompression of literals.
-3) Decompression of sequences.
-4) Reconstruction of output stream.
-
-So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
-
-The concurrency level will, for streams, determine how many blocks ahead the compression will start.
-
-Since "blocks" are quite dependent on the output of the previous block stream decoding will only have limited concurrency.
-
-In practice this means that concurrency is often limited to utilizing about 3 cores effectively.
-  
-### Benchmarks
-
-The first two are streaming decodes and the last are smaller inputs. 
-
-Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
-
-```
-BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
-BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
-
-Concurrent blocks, performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
-```
-
-This reflects the performance around May 2022, but this may be out of date.
-
-## Zstd inside ZIP files
-
-It is possible to use zstandard to compress individual files inside zip archives.
-While this isn't widely supported it can be useful for internal files.
-
-To support the compression and decompression of these files you must register a compressor and decompressor.
-
-It is highly recommended registering the (de)compressors on individual zip Reader/Writer and NOT
-use the global registration functions. The main reason for this is that 2 registrations from 
-different packages will result in a panic.
-
-It is a good idea to only have a single compressor and decompressor, since they can be used for multiple zip
-files concurrently, and using a single instance will allow reusing some resources.
-
-See [this example](https://pkg.go.dev/github.com/klauspost/compress/zstd#example-ZipCompressor) for 
-how to compress and decompress files inside zip archives.
-
-# Contributions
-
-Contributions are always welcome. 
-For new features/fixes, remember to add tests and for performance enhancements include benchmarks.
-
-For general feedback and experience reports, feel free to open an issue or write me on [Twitter](https://twitter.com/sh0dan).
-
-This package includes the excellent [`github.com/cespare/xxhash`](https://github.com/cespare/xxhash) package Copyright (c) 2016 Caleb Spare.
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
deleted file mode 100644
index d41e3e170..000000000
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"io"
-	"math/bits"
-
-	"github.com/klauspost/compress/internal/le"
-)
-
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReader struct {
-	in       []byte
-	value    uint64 // Maybe use [16]byte, but shifting is awkward.
-	cursor   int    // offset where next read should end
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.cursor = len(in)
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.bitsRead += 8 - uint8(highBits(uint32(v)))
-	return nil
-}
-
-// getBits will return n bits. n can be 0.
-func (b *bitReader) getBits(n uint8) int {
-	if n == 0 /*|| b.bitsRead >= 64 */ {
-		return 0
-	}
-	return int(b.get32BitsFast(n))
-}
-
-// get32BitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) get32BitsFast(n uint8) uint32 {
-	const regMask = 64 - 1
-	v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	b.bitsRead += n
-	return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-	b.cursor -= 4
-	b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor))
-	b.bitsRead -= 32
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
-	b.cursor -= 8
-	b.value = le.Load64(b.in, b.cursor)
-	b.bitsRead = 0
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.cursor >= 4 {
-		b.cursor -= 4
-		b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor))
-		b.bitsRead -= 32
-		return
-	}
-
-	b.bitsRead -= uint8(8 * b.cursor)
-	for b.cursor > 0 {
-		b.cursor -= 1
-		b.value = (b.value << 8) | uint64(b.in[b.cursor])
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
-	return b.cursor == 0 && b.bitsRead >= 64
-}
-
-// overread returns true if more bits have been requested than is on the stream.
-func (b *bitReader) overread() bool {
-	return b.bitsRead > 64
-}
-
-// remain returns the number of bits remaining.
-func (b *bitReader) remain() uint {
-	return 8*uint(b.cursor) + 64 - uint(b.bitsRead)
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
-	// Release reference.
-	b.in = nil
-	b.cursor = 0
-	if !b.finished() {
-		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
-	}
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
-func highBits(val uint32) (n uint32) {
-	return uint32(bits.Len32(val) - 1)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
deleted file mode 100644
index 1952f175b..000000000
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package zstd
-
-// bitWriter will write bits.
-// First bit will be LSB of the first byte of output.
-type bitWriter struct {
-	bitContainer uint64
-	nBits        uint8
-	out          []byte
-}
-
-// bitMask16 is bitmasks. Has extra to avoid bounds check.
-var bitMask16 = [32]uint16{
-	0, 1, 3, 7, 0xF, 0x1F,
-	0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
-	0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
-	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
-	0xFFFF, 0xFFFF} /* up to 16 bits */
-
-var bitMask32 = [32]uint32{
-	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
-	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
-	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
-	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
-} // up to 32 bits
-
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// addBits32NC will add up to 31 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
-	b.bitContainer |= uint64(value&bitMask32[bits&31]) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// addBits64NC will add up to 64 bits.
-// There must be space for 32 bits.
-func (b *bitWriter) addBits64NC(value uint64, bits uint8) {
-	if bits <= 31 {
-		b.addBits32Clean(uint32(value), bits)
-		return
-	}
-	b.addBits32Clean(uint32(value), 32)
-	b.flush32()
-	b.addBits32Clean(uint32(value>>32), bits-32)
-}
-
-// addBits32Clean will add up to 32 bits.
-// It will not check if there is space for them.
-// The input must not contain more bits than specified.
-func (b *bitWriter) addBits32Clean(value uint32, bits uint8) {
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// flush32 will flush out, so there are at least 32 bits available for writing.
-func (b *bitWriter) flush32() {
-	if b.nBits < 32 {
-		return
-	}
-	b.out = append(b.out,
-		byte(b.bitContainer),
-		byte(b.bitContainer>>8),
-		byte(b.bitContainer>>16),
-		byte(b.bitContainer>>24))
-	b.nBits -= 32
-	b.bitContainer >>= 32
-}
-
-// flushAlign will flush remaining full bytes and align to next byte boundary.
-func (b *bitWriter) flushAlign() {
-	nbBytes := (b.nBits + 7) >> 3
-	for i := uint8(0); i < nbBytes; i++ {
-		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
-	}
-	b.nBits = 0
-	b.bitContainer = 0
-}
-
-// close will write the alignment bit and write the final byte(s)
-// to the output.
-func (b *bitWriter) close() {
-	// End mark
-	b.addBits16Clean(1, 1)
-	// flush until next byte.
-	b.flushAlign()
-}
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
-	b.bitContainer = 0
-	b.nBits = 0
-	b.out = out
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
deleted file mode 100644
index 0dd742fd2..000000000
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ /dev/null
@@ -1,712 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"hash/crc32"
-	"io"
-	"sync"
-
-	"github.com/klauspost/compress/huff0"
-	"github.com/klauspost/compress/zstd/internal/xxhash"
-)
-
-type blockType uint8
-
-//go:generate stringer -type=blockType,literalsBlockType,seqCompMode,tableIndex
-
-const (
-	blockTypeRaw blockType = iota
-	blockTypeRLE
-	blockTypeCompressed
-	blockTypeReserved
-)
-
-type literalsBlockType uint8
-
-const (
-	literalsBlockRaw literalsBlockType = iota
-	literalsBlockRLE
-	literalsBlockCompressed
-	literalsBlockTreeless
-)
-
-const (
-	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
-	maxCompressedBlockSize = 128 << 10
-
-	compressedBlockOverAlloc    = 16
-	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
-
-	// Maximum possible block size (all Raw+Uncompressed).
-	maxBlockSize = (1 << 21) - 1
-
-	maxMatchLen  = 131074
-	maxSequences = 0x7f00 + 0xffff
-
-	// We support slightly less than the reference decoder to be able to
-	// use ints on 32 bit archs.
-	maxOffsetBits = 30
-)
-
-var (
-	huffDecoderPool = sync.Pool{New: func() interface{} {
-		return &huff0.Scratch{}
-	}}
-
-	fseDecoderPool = sync.Pool{New: func() interface{} {
-		return &fseDecoder{}
-	}}
-)
-
-type blockDec struct {
-	// Raw source data of the block.
-	data        []byte
-	dataStorage []byte
-
-	// Destination of the decoded data.
-	dst []byte
-
-	// Buffer for literals data.
-	literalBuf []byte
-
-	// Window size of the block.
-	WindowSize uint64
-
-	err error
-
-	// Check against this crc, if hasCRC is true.
-	checkCRC uint32
-	hasCRC   bool
-
-	// Frame to use for singlethreaded decoding.
-	// Should not be used by the decoder itself since parent may be another frame.
-	localFrame *frameDec
-
-	sequence []seqVals
-
-	async struct {
-		newHist  *history
-		literals []byte
-		seqData  []byte
-		seqSize  int // Size of uncompressed sequences
-		fcs      uint64
-	}
-
-	// Block is RLE, this is the size.
-	RLESize uint32
-
-	Type blockType
-
-	// Is this the last block of a frame?
-	Last bool
-
-	// Use less memory
-	lowMem bool
-}
-
-func (b *blockDec) String() string {
-	if b == nil {
-		return "<nil>"
-	}
-	return fmt.Sprintf("Steam Size: %d, Type: %v, Last: %t, Window: %d", len(b.data), b.Type, b.Last, b.WindowSize)
-}
-
-func newBlockDec(lowMem bool) *blockDec {
-	b := blockDec{
-		lowMem: lowMem,
-	}
-	return &b
-}
-
-// reset will reset the block.
-// Input must be a start of a block and will be at the end of the block when returned.
-func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
-	b.WindowSize = windowSize
-	tmp, err := br.readSmall(3)
-	if err != nil {
-		println("Reading block header:", err)
-		return err
-	}
-	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
-	b.Last = bh&1 != 0
-	b.Type = blockType((bh >> 1) & 3)
-	// find size.
-	cSize := int(bh >> 3)
-	maxSize := maxCompressedBlockSizeAlloc
-	switch b.Type {
-	case blockTypeReserved:
-		return ErrReservedBlockType
-	case blockTypeRLE:
-		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
-			if debugDecoder {
-				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
-			}
-			return ErrWindowSizeExceeded
-		}
-		b.RLESize = uint32(cSize)
-		if b.lowMem {
-			maxSize = cSize
-		}
-		cSize = 1
-	case blockTypeCompressed:
-		if debugDecoder {
-			println("Data size on stream:", cSize)
-		}
-		b.RLESize = 0
-		maxSize = maxCompressedBlockSizeAlloc
-		if windowSize < maxCompressedBlockSize && b.lowMem {
-			maxSize = int(windowSize) + compressedBlockOverAlloc
-		}
-		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
-			if debugDecoder {
-				printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
-			}
-			return ErrCompressedSizeTooBig
-		}
-		// Empty compressed blocks must at least be 2 bytes
-		// for Literals_Block_Type and one for Sequences_Section_Header.
-		if cSize < 2 {
-			return ErrBlockTooSmall
-		}
-	case blockTypeRaw:
-		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
-			if debugDecoder {
-				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
-			}
-			return ErrWindowSizeExceeded
-		}
-
-		b.RLESize = 0
-		// We do not need a destination for raw blocks.
-		maxSize = -1
-	default:
-		panic("Invalid block type")
-	}
-
-	// Read block data.
-	if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
-		// byteBuf doesn't need a destination buffer.
-		if b.lowMem || cSize > maxCompressedBlockSize {
-			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
-		} else {
-			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
-		}
-	}
-	b.data, err = br.readBig(cSize, b.dataStorage)
-	if err != nil {
-		if debugDecoder {
-			println("Reading block:", err, "(", cSize, ")", len(b.data))
-			printf("%T", br)
-		}
-		return err
-	}
-	if cap(b.dst) <= maxSize {
-		b.dst = make([]byte, 0, maxSize+1)
-	}
-	return nil
-}
-
-// sendEOF will make the decoder send EOF on this frame.
-func (b *blockDec) sendErr(err error) {
-	b.Last = true
-	b.Type = blockTypeReserved
-	b.err = err
-}
-
-// Close will release resources.
-// Closed blockDec cannot be reset.
-func (b *blockDec) Close() {
-}
-
-// decodeBuf
-func (b *blockDec) decodeBuf(hist *history) error {
-	switch b.Type {
-	case blockTypeRLE:
-		if cap(b.dst) < int(b.RLESize) {
-			if b.lowMem {
-				b.dst = make([]byte, b.RLESize)
-			} else {
-				b.dst = make([]byte, maxCompressedBlockSize)
-			}
-		}
-		b.dst = b.dst[:b.RLESize]
-		v := b.data[0]
-		for i := range b.dst {
-			b.dst[i] = v
-		}
-		hist.appendKeep(b.dst)
-		return nil
-	case blockTypeRaw:
-		hist.appendKeep(b.data)
-		return nil
-	case blockTypeCompressed:
-		saved := b.dst
-		// Append directly to history
-		if hist.ignoreBuffer == 0 {
-			b.dst = hist.b
-			hist.b = nil
-		} else {
-			b.dst = b.dst[:0]
-		}
-		err := b.decodeCompressed(hist)
-		if debugDecoder {
-			println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
-		}
-		if hist.ignoreBuffer == 0 {
-			hist.b = b.dst
-			b.dst = saved
-		} else {
-			hist.appendKeep(b.dst)
-		}
-		return err
-	case blockTypeReserved:
-		// Used for returning errors.
-		return b.err
-	default:
-		panic("Invalid block type")
-	}
-}
-
-func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) {
-	// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
-	if len(in) < 2 {
-		return in, ErrBlockTooSmall
-	}
-
-	litType := literalsBlockType(in[0] & 3)
-	var litRegenSize int
-	var litCompSize int
-	sizeFormat := (in[0] >> 2) & 3
-	var fourStreams bool
-	var literals []byte
-	switch litType {
-	case literalsBlockRaw, literalsBlockRLE:
-		switch sizeFormat {
-		case 0, 2:
-			// Regenerated_Size uses 5 bits (0-31). Literals_Section_Header uses 1 byte.
-			litRegenSize = int(in[0] >> 3)
-			in = in[1:]
-		case 1:
-			// Regenerated_Size uses 12 bits (0-4095). Literals_Section_Header uses 2 bytes.
-			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4)
-			in = in[2:]
-		case 3:
-			//  Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
-			if len(in) < 3 {
-				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return in, ErrBlockTooSmall
-			}
-			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
-			in = in[3:]
-		}
-	case literalsBlockCompressed, literalsBlockTreeless:
-		switch sizeFormat {
-		case 0, 1:
-			// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
-			if len(in) < 3 {
-				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return in, ErrBlockTooSmall
-			}
-			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
-			litRegenSize = int(n & 1023)
-			litCompSize = int(n >> 10)
-			fourStreams = sizeFormat == 1
-			in = in[3:]
-		case 2:
-			fourStreams = true
-			if len(in) < 4 {
-				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return in, ErrBlockTooSmall
-			}
-			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
-			litRegenSize = int(n & 16383)
-			litCompSize = int(n >> 14)
-			in = in[4:]
-		case 3:
-			fourStreams = true
-			if len(in) < 5 {
-				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return in, ErrBlockTooSmall
-			}
-			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
-			litRegenSize = int(n & 262143)
-			litCompSize = int(n >> 18)
-			in = in[5:]
-		}
-	}
-	if debugDecoder {
-		println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
-	}
-	if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize {
-		return in, ErrWindowSizeExceeded
-	}
-
-	switch litType {
-	case literalsBlockRaw:
-		if len(in) < litRegenSize {
-			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
-			return in, ErrBlockTooSmall
-		}
-		literals = in[:litRegenSize]
-		in = in[litRegenSize:]
-		//printf("Found %d uncompressed literals\n", litRegenSize)
-	case literalsBlockRLE:
-		if len(in) < 1 {
-			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
-			return in, ErrBlockTooSmall
-		}
-		if cap(b.literalBuf) < litRegenSize {
-			if b.lowMem {
-				b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
-			} else {
-				b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
-			}
-		}
-		literals = b.literalBuf[:litRegenSize]
-		v := in[0]
-		for i := range literals {
-			literals[i] = v
-		}
-		in = in[1:]
-		if debugDecoder {
-			printf("Found %d RLE compressed literals\n", litRegenSize)
-		}
-	case literalsBlockTreeless:
-		if len(in) < litCompSize {
-			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return in, ErrBlockTooSmall
-		}
-		// Store compressed literals, so we defer decoding until we get history.
-		literals = in[:litCompSize]
-		in = in[litCompSize:]
-		if debugDecoder {
-			printf("Found %d compressed literals\n", litCompSize)
-		}
-		huff := hist.huffTree
-		if huff == nil {
-			return in, errors.New("literal block was treeless, but no history was defined")
-		}
-		// Ensure we have space to store it.
-		if cap(b.literalBuf) < litRegenSize {
-			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
-			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
-			}
-		}
-		var err error
-		// Use our out buffer.
-		huff.MaxDecodedSize = litRegenSize
-		if fourStreams {
-			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
-		} else {
-			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
-		}
-		// Make sure we don't leak our literals buffer
-		if err != nil {
-			println("decompressing literals:", err)
-			return in, err
-		}
-		if len(literals) != litRegenSize {
-			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
-		}
-
-	case literalsBlockCompressed:
-		if len(in) < litCompSize {
-			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return in, ErrBlockTooSmall
-		}
-		literals = in[:litCompSize]
-		in = in[litCompSize:]
-		// Ensure we have space to store it.
-		if cap(b.literalBuf) < litRegenSize {
-			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
-			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
-			}
-		}
-		huff := hist.huffTree
-		if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) {
-			huff = huffDecoderPool.Get().(*huff0.Scratch)
-			if huff == nil {
-				huff = &huff0.Scratch{}
-			}
-		}
-		var err error
-		if debugDecoder {
-			println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
-		}
-		huff, literals, err = huff0.ReadTable(literals, huff)
-		if err != nil {
-			println("reading huffman table:", err)
-			return in, err
-		}
-		hist.huffTree = huff
-		huff.MaxDecodedSize = litRegenSize
-		// Use our out buffer.
-		if fourStreams {
-			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
-		} else {
-			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
-		}
-		if err != nil {
-			println("decoding compressed literals:", err)
-			return in, err
-		}
-		// Make sure we don't leak our literals buffer
-		if len(literals) != litRegenSize {
-			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
-		}
-		// Re-cap to get extra size.
-		literals = b.literalBuf[:len(literals)]
-		if debugDecoder {
-			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
-		}
-	}
-	hist.decoders.literals = literals
-	return in, nil
-}
-
-// decodeCompressed will start decompressing a block.
-func (b *blockDec) decodeCompressed(hist *history) error {
-	in := b.data
-	in, err := b.decodeLiterals(in, hist)
-	if err != nil {
-		return err
-	}
-	err = b.prepareSequences(in, hist)
-	if err != nil {
-		return err
-	}
-	if hist.decoders.nSeqs == 0 {
-		b.dst = append(b.dst, hist.decoders.literals...)
-		return nil
-	}
-	before := len(hist.decoders.out)
-	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
-	if err != nil {
-		return err
-	}
-	if hist.decoders.maxSyncLen > 0 {
-		hist.decoders.maxSyncLen += uint64(before)
-		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
-	}
-	b.dst = hist.decoders.out
-	hist.recentOffsets = hist.decoders.prevOffset
-	return nil
-}
-
-func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
-	if debugDecoder {
-		printf("prepareSequences: %d byte(s) input\n", len(in))
-	}
-	// Decode Sequences
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
-	if len(in) < 1 {
-		return ErrBlockTooSmall
-	}
-	var nSeqs int
-	seqHeader := in[0]
-	switch {
-	case seqHeader < 128:
-		nSeqs = int(seqHeader)
-		in = in[1:]
-	case seqHeader < 255:
-		if len(in) < 2 {
-			return ErrBlockTooSmall
-		}
-		nSeqs = int(seqHeader-128)<<8 | int(in[1])
-		in = in[2:]
-	case seqHeader == 255:
-		if len(in) < 3 {
-			return ErrBlockTooSmall
-		}
-		nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
-		in = in[3:]
-	}
-	if nSeqs == 0 && len(in) != 0 {
-		// When no sequences, there should not be any more data...
-		if debugDecoder {
-			printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in))
-		}
-		return ErrUnexpectedBlockSize
-	}
-
-	var seqs = &hist.decoders
-	seqs.nSeqs = nSeqs
-	if nSeqs > 0 {
-		if len(in) < 1 {
-			return ErrBlockTooSmall
-		}
-		br := byteReader{b: in, off: 0}
-		compMode := br.Uint8()
-		br.advance(1)
-		if debugDecoder {
-			printf("Compression modes: 0b%b", compMode)
-		}
-		if compMode&3 != 0 {
-			return errors.New("corrupt block: reserved bits not zero")
-		}
-		for i := uint(0); i < 3; i++ {
-			mode := seqCompMode((compMode >> (6 - i*2)) & 3)
-			if debugDecoder {
-				println("Table", tableIndex(i), "is", mode)
-			}
-			var seq *sequenceDec
-			switch tableIndex(i) {
-			case tableLiteralLengths:
-				seq = &seqs.litLengths
-			case tableOffsets:
-				seq = &seqs.offsets
-			case tableMatchLengths:
-				seq = &seqs.matchLengths
-			default:
-				panic("unknown table")
-			}
-			switch mode {
-			case compModePredefined:
-				if seq.fse != nil && !seq.fse.preDefined {
-					fseDecoderPool.Put(seq.fse)
-				}
-				seq.fse = &fsePredef[i]
-			case compModeRLE:
-				if br.remain() < 1 {
-					return ErrBlockTooSmall
-				}
-				v := br.Uint8()
-				br.advance(1)
-				if seq.fse == nil || seq.fse.preDefined {
-					seq.fse = fseDecoderPool.Get().(*fseDecoder)
-				}
-				symb, err := decSymbolValue(v, symbolTableX[i])
-				if err != nil {
-					printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
-					return err
-				}
-				seq.fse.setRLE(symb)
-				if debugDecoder {
-					printf("RLE set to 0x%x, code: %v", symb, v)
-				}
-			case compModeFSE:
-				if debugDecoder {
-					println("Reading table for", tableIndex(i))
-				}
-				if seq.fse == nil || seq.fse.preDefined {
-					seq.fse = fseDecoderPool.Get().(*fseDecoder)
-				}
-				err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i]))
-				if err != nil {
-					println("Read table error:", err)
-					return err
-				}
-				err = seq.fse.transform(symbolTableX[i])
-				if err != nil {
-					println("Transform table error:", err)
-					return err
-				}
-				if debugDecoder {
-					println("Read table ok", "symbolLen:", seq.fse.symbolLen)
-				}
-			case compModeRepeat:
-				seq.repeat = true
-			}
-			if br.overread() {
-				return io.ErrUnexpectedEOF
-			}
-		}
-		in = br.unread()
-	}
-	if debugDecoder {
-		println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.")
-	}
-
-	if nSeqs == 0 {
-		if len(b.sequence) > 0 {
-			b.sequence = b.sequence[:0]
-		}
-		return nil
-	}
-	br := seqs.br
-	if br == nil {
-		br = &bitReader{}
-	}
-	if err := br.init(in); err != nil {
-		return err
-	}
-
-	if err := seqs.initialize(br, hist, b.dst); err != nil {
-		println("initializing sequences:", err)
-		return err
-	}
-
-	return nil
-}
-
-func (b *blockDec) decodeSequences(hist *history) error {
-	if cap(b.sequence) < hist.decoders.nSeqs {
-		if b.lowMem {
-			b.sequence = make([]seqVals, 0, hist.decoders.nSeqs)
-		} else {
-			b.sequence = make([]seqVals, 0, 0x7F00+0xffff)
-		}
-	}
-	b.sequence = b.sequence[:hist.decoders.nSeqs]
-	if hist.decoders.nSeqs == 0 {
-		hist.decoders.seqSize = len(hist.decoders.literals)
-		return nil
-	}
-	hist.decoders.windowSize = hist.windowSize
-	hist.decoders.prevOffset = hist.recentOffsets
-
-	err := hist.decoders.decode(b.sequence)
-	hist.recentOffsets = hist.decoders.prevOffset
-	return err
-}
-
-func (b *blockDec) executeSequences(hist *history) error {
-	hbytes := hist.b
-	if len(hbytes) > hist.windowSize {
-		hbytes = hbytes[len(hbytes)-hist.windowSize:]
-		// We do not need history anymore.
-		if hist.dict != nil {
-			hist.dict.content = nil
-		}
-	}
-	hist.decoders.windowSize = hist.windowSize
-	hist.decoders.out = b.dst[:0]
-	err := hist.decoders.execute(b.sequence, hbytes)
-	if err != nil {
-		return err
-	}
-	return b.updateHistory(hist)
-}
-
-func (b *blockDec) updateHistory(hist *history) error {
-	if len(b.data) > maxCompressedBlockSize {
-		return fmt.Errorf("compressed block size too large (%d)", len(b.data))
-	}
-	// Set output and release references.
-	b.dst = hist.decoders.out
-	hist.recentOffsets = hist.decoders.prevOffset
-
-	if b.Last {
-		// if last block we don't care about history.
-		println("Last block, no history returned")
-		hist.b = hist.b[:0]
-		return nil
-	} else {
-		hist.append(b.dst)
-		if debugDecoder {
-			println("Finished block with ", len(b.sequence), "sequences. Added", len(b.dst), "to history, now length", len(hist.b))
-		}
-	}
-	hist.decoders.out, hist.decoders.literals = nil, nil
-
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
deleted file mode 100644
index fd35ea148..000000000
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ /dev/null
@@ -1,892 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"math"
-	"math/bits"
-	"slices"
-
-	"github.com/klauspost/compress/huff0"
-)
-
-type blockEnc struct {
-	size       int
-	literals   []byte
-	sequences  []seq
-	coders     seqCoders
-	litEnc     *huff0.Scratch
-	dictLitEnc *huff0.Scratch
-	wr         bitWriter
-
-	extraLits         int
-	output            []byte
-	recentOffsets     [3]uint32
-	prevRecentOffsets [3]uint32
-
-	last   bool
-	lowMem bool
-}
-
-// init should be used once the block has been created.
-// If called more than once, the effect is the same as calling reset.
-func (b *blockEnc) init() {
-	if b.lowMem {
-		// 1K literals
-		if cap(b.literals) < 1<<10 {
-			b.literals = make([]byte, 0, 1<<10)
-		}
-		const defSeqs = 20
-		if cap(b.sequences) < defSeqs {
-			b.sequences = make([]seq, 0, defSeqs)
-		}
-		// 1K
-		if cap(b.output) < 1<<10 {
-			b.output = make([]byte, 0, 1<<10)
-		}
-	} else {
-		if cap(b.literals) < maxCompressedBlockSize {
-			b.literals = make([]byte, 0, maxCompressedBlockSize)
-		}
-		const defSeqs = 2000
-		if cap(b.sequences) < defSeqs {
-			b.sequences = make([]seq, 0, defSeqs)
-		}
-		if cap(b.output) < maxCompressedBlockSize {
-			b.output = make([]byte, 0, maxCompressedBlockSize)
-		}
-	}
-
-	if b.coders.mlEnc == nil {
-		b.coders.mlEnc = &fseEncoder{}
-		b.coders.mlPrev = &fseEncoder{}
-		b.coders.ofEnc = &fseEncoder{}
-		b.coders.ofPrev = &fseEncoder{}
-		b.coders.llEnc = &fseEncoder{}
-		b.coders.llPrev = &fseEncoder{}
-	}
-	b.litEnc = &huff0.Scratch{WantLogLess: 4}
-	b.reset(nil)
-}
-
-// initNewEncode can be used to reset offsets and encoders to the initial state.
-func (b *blockEnc) initNewEncode() {
-	b.recentOffsets = [3]uint32{1, 4, 8}
-	b.litEnc.Reuse = huff0.ReusePolicyNone
-	b.coders.setPrev(nil, nil, nil)
-}
-
-// reset will reset the block for a new encode, but in the same stream,
-// meaning that state will be carried over, but the block content is reset.
-// If a previous block is provided, the recent offsets are carried over.
-func (b *blockEnc) reset(prev *blockEnc) {
-	b.extraLits = 0
-	b.literals = b.literals[:0]
-	b.size = 0
-	b.sequences = b.sequences[:0]
-	b.output = b.output[:0]
-	b.last = false
-	if prev != nil {
-		b.recentOffsets = prev.prevRecentOffsets
-	}
-	b.dictLitEnc = nil
-}
-
-// reset will reset the block for a new encode, but in the same stream,
-// meaning that state will be carried over, but the block content is reset.
-// If a previous block is provided, the recent offsets are carried over.
-func (b *blockEnc) swapEncoders(prev *blockEnc) {
-	b.coders.swap(&prev.coders)
-	b.litEnc, prev.litEnc = prev.litEnc, b.litEnc
-}
-
-// blockHeader contains the information for a block header.
-type blockHeader uint32
-
-// setLast sets the 'last' indicator on a block.
-func (h *blockHeader) setLast(b bool) {
-	if b {
-		*h = *h | 1
-	} else {
-		const mask = (1 << 24) - 2
-		*h = *h & mask
-	}
-}
-
-// setSize will store the compressed size of a block.
-func (h *blockHeader) setSize(v uint32) {
-	const mask = 7
-	*h = (*h)&mask | blockHeader(v<<3)
-}
-
-// setType sets the block type.
-func (h *blockHeader) setType(t blockType) {
-	const mask = 1 | (((1 << 24) - 1) ^ 7)
-	*h = (*h & mask) | blockHeader(t<<1)
-}
-
-// appendTo will append the block header to a slice.
-func (h blockHeader) appendTo(b []byte) []byte {
-	return append(b, uint8(h), uint8(h>>8), uint8(h>>16))
-}
-
-// String returns a string representation of the block.
-func (h blockHeader) String() string {
-	return fmt.Sprintf("Type: %d, Size: %d, Last:%t", (h>>1)&3, h>>3, h&1 == 1)
-}
-
-// literalsHeader contains literals header information.
-type literalsHeader uint64
-
-// setType can be used to set the type of literal block.
-func (h *literalsHeader) setType(t literalsBlockType) {
-	const mask = math.MaxUint64 - 3
-	*h = (*h & mask) | literalsHeader(t)
-}
-
-// setSize can be used to set a single size, for uncompressed and RLE content.
-func (h *literalsHeader) setSize(regenLen int) {
-	inBits := bits.Len32(uint32(regenLen))
-	// Only retain 2 bits
-	const mask = 3
-	lh := uint64(*h & mask)
-	switch {
-	case inBits < 5:
-		lh |= (uint64(regenLen) << 3) | (1 << 60)
-		if debugEncoder {
-			got := int(lh>>3) & 0xff
-			if got != regenLen {
-				panic(fmt.Sprint("litRegenSize = ", regenLen, "(want) != ", got, "(got)"))
-			}
-		}
-	case inBits < 12:
-		lh |= (1 << 2) | (uint64(regenLen) << 4) | (2 << 60)
-	case inBits < 20:
-		lh |= (3 << 2) | (uint64(regenLen) << 4) | (3 << 60)
-	default:
-		panic(fmt.Errorf("internal error: block too big (%d)", regenLen))
-	}
-	*h = literalsHeader(lh)
-}
-
-// setSizes will set the size of a compressed literals section and the input length.
-func (h *literalsHeader) setSizes(compLen, inLen int, single bool) {
-	compBits, inBits := bits.Len32(uint32(compLen)), bits.Len32(uint32(inLen))
-	// Only retain 2 bits
-	const mask = 3
-	lh := uint64(*h & mask)
-	switch {
-	case compBits <= 10 && inBits <= 10:
-		if !single {
-			lh |= 1 << 2
-		}
-		lh |= (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60)
-		if debugEncoder {
-			const mmask = (1 << 24) - 1
-			n := (lh >> 4) & mmask
-			if int(n&1023) != inLen {
-				panic(fmt.Sprint("regensize:", int(n&1023), "!=", inLen, inBits))
-			}
-			if int(n>>10) != compLen {
-				panic(fmt.Sprint("compsize:", int(n>>10), "!=", compLen, compBits))
-			}
-		}
-	case compBits <= 14 && inBits <= 14:
-		lh |= (2 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (14 + 4)) | (4 << 60)
-		if single {
-			panic("single stream used with more than 10 bits length.")
-		}
-	case compBits <= 18 && inBits <= 18:
-		lh |= (3 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (18 + 4)) | (5 << 60)
-		if single {
-			panic("single stream used with more than 10 bits length.")
-		}
-	default:
-		panic("internal error: block too big")
-	}
-	*h = literalsHeader(lh)
-}
-
-// appendTo will append the literals header to a byte slice.
-func (h literalsHeader) appendTo(b []byte) []byte {
-	size := uint8(h >> 60)
-	switch size {
-	case 1:
-		b = append(b, uint8(h))
-	case 2:
-		b = append(b, uint8(h), uint8(h>>8))
-	case 3:
-		b = append(b, uint8(h), uint8(h>>8), uint8(h>>16))
-	case 4:
-		b = append(b, uint8(h), uint8(h>>8), uint8(h>>16), uint8(h>>24))
-	case 5:
-		b = append(b, uint8(h), uint8(h>>8), uint8(h>>16), uint8(h>>24), uint8(h>>32))
-	default:
-		panic(fmt.Errorf("internal error: literalsHeader has invalid size (%d)", size))
-	}
-	return b
-}
-
-// size returns the output size with currently set values.
-func (h literalsHeader) size() int {
-	return int(h >> 60)
-}
-
-func (h literalsHeader) String() string {
-	return fmt.Sprintf("Type: %d, SizeFormat: %d, Size: 0x%d, Bytes:%d", literalsBlockType(h&3), (h>>2)&3, h&((1<<60)-1)>>4, h>>60)
-}
-
-// pushOffsets will push the recent offsets to the backup store.
-func (b *blockEnc) pushOffsets() {
-	b.prevRecentOffsets = b.recentOffsets
-}
-
-// pushOffsets will push the recent offsets to the backup store.
-func (b *blockEnc) popOffsets() {
-	b.recentOffsets = b.prevRecentOffsets
-}
-
-// matchOffset will adjust recent offsets and return the adjusted one,
-// if it matches a previous offset.
-func (b *blockEnc) matchOffset(offset, lits uint32) uint32 {
-	// Check if offset is one of the recent offsets.
-	// Adjusts the output offset accordingly.
-	// Gives a tiny bit of compression, typically around 1%.
-	if true {
-		if lits > 0 {
-			switch offset {
-			case b.recentOffsets[0]:
-				offset = 1
-			case b.recentOffsets[1]:
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset = 2
-			case b.recentOffsets[2]:
-				b.recentOffsets[2] = b.recentOffsets[1]
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset = 3
-			default:
-				b.recentOffsets[2] = b.recentOffsets[1]
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset += 3
-			}
-		} else {
-			switch offset {
-			case b.recentOffsets[1]:
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset = 1
-			case b.recentOffsets[2]:
-				b.recentOffsets[2] = b.recentOffsets[1]
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset = 2
-			case b.recentOffsets[0] - 1:
-				b.recentOffsets[2] = b.recentOffsets[1]
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset = 3
-			default:
-				b.recentOffsets[2] = b.recentOffsets[1]
-				b.recentOffsets[1] = b.recentOffsets[0]
-				b.recentOffsets[0] = offset
-				offset += 3
-			}
-		}
-	} else {
-		offset += 3
-	}
-	return offset
-}
-
-// encodeRaw can be used to set the output to a raw representation of supplied bytes.
-func (b *blockEnc) encodeRaw(a []byte) {
-	var bh blockHeader
-	bh.setLast(b.last)
-	bh.setSize(uint32(len(a)))
-	bh.setType(blockTypeRaw)
-	b.output = bh.appendTo(b.output[:0])
-	b.output = append(b.output, a...)
-	if debugEncoder {
-		println("Adding RAW block, length", len(a), "last:", b.last)
-	}
-}
-
-// encodeRaw can be used to set the output to a raw representation of supplied bytes.
-func (b *blockEnc) encodeRawTo(dst, src []byte) []byte {
-	var bh blockHeader
-	bh.setLast(b.last)
-	bh.setSize(uint32(len(src)))
-	bh.setType(blockTypeRaw)
-	dst = bh.appendTo(dst)
-	dst = append(dst, src...)
-	if debugEncoder {
-		println("Adding RAW block, length", len(src), "last:", b.last)
-	}
-	return dst
-}
-
-// encodeLits can be used if the block is only litLen.
-func (b *blockEnc) encodeLits(lits []byte, raw bool) error {
-	var bh blockHeader
-	bh.setLast(b.last)
-	bh.setSize(uint32(len(lits)))
-
-	// Don't compress extremely small blocks
-	if len(lits) < 8 || (len(lits) < 32 && b.dictLitEnc == nil) || raw {
-		if debugEncoder {
-			println("Adding RAW block, length", len(lits), "last:", b.last)
-		}
-		bh.setType(blockTypeRaw)
-		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, lits...)
-		return nil
-	}
-
-	var (
-		out            []byte
-		reUsed, single bool
-		err            error
-	)
-	if b.dictLitEnc != nil {
-		b.litEnc.TransferCTable(b.dictLitEnc)
-		b.litEnc.Reuse = huff0.ReusePolicyAllow
-		b.dictLitEnc = nil
-	}
-	if len(lits) >= 1024 {
-		// Use 4 Streams.
-		out, reUsed, err = huff0.Compress4X(lits, b.litEnc)
-	} else if len(lits) > 16 {
-		// Use 1 stream
-		single = true
-		out, reUsed, err = huff0.Compress1X(lits, b.litEnc)
-	} else {
-		err = huff0.ErrIncompressible
-	}
-	if err == nil && len(out)+5 > len(lits) {
-		// If we are close, we may still be worse or equal to raw.
-		var lh literalsHeader
-		lh.setSizes(len(out), len(lits), single)
-		if len(out)+lh.size() >= len(lits) {
-			err = huff0.ErrIncompressible
-		}
-	}
-	switch err {
-	case huff0.ErrIncompressible:
-		if debugEncoder {
-			println("Adding RAW block, length", len(lits), "last:", b.last)
-		}
-		bh.setType(blockTypeRaw)
-		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, lits...)
-		return nil
-	case huff0.ErrUseRLE:
-		if debugEncoder {
-			println("Adding RLE block, length", len(lits))
-		}
-		bh.setType(blockTypeRLE)
-		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, lits[0])
-		return nil
-	case nil:
-	default:
-		return err
-	}
-	// Compressed...
-	// Now, allow reuse
-	b.litEnc.Reuse = huff0.ReusePolicyAllow
-	bh.setType(blockTypeCompressed)
-	var lh literalsHeader
-	if reUsed {
-		if debugEncoder {
-			println("Reused tree, compressed to", len(out))
-		}
-		lh.setType(literalsBlockTreeless)
-	} else {
-		if debugEncoder {
-			println("New tree, compressed to", len(out), "tree size:", len(b.litEnc.OutTable))
-		}
-		lh.setType(literalsBlockCompressed)
-	}
-	// Set sizes
-	lh.setSizes(len(out), len(lits), single)
-	bh.setSize(uint32(len(out) + lh.size() + 1))
-
-	// Write block headers.
-	b.output = bh.appendTo(b.output)
-	b.output = lh.appendTo(b.output)
-	// Add compressed data.
-	b.output = append(b.output, out...)
-	// No sequences.
-	b.output = append(b.output, 0)
-	return nil
-}
-
-// encodeRLE will encode an RLE block.
-func (b *blockEnc) encodeRLE(val byte, length uint32) {
-	var bh blockHeader
-	bh.setLast(b.last)
-	bh.setSize(length)
-	bh.setType(blockTypeRLE)
-	b.output = bh.appendTo(b.output)
-	b.output = append(b.output, val)
-}
-
-// fuzzFseEncoder can be used to fuzz the FSE encoder.
-func fuzzFseEncoder(data []byte) int {
-	if len(data) > maxSequences || len(data) < 2 {
-		return 0
-	}
-	enc := fseEncoder{}
-	hist := enc.Histogram()
-	maxSym := uint8(0)
-	for i, v := range data {
-		v = v & 63
-		data[i] = v
-		hist[v]++
-		if v > maxSym {
-			maxSym = v
-		}
-	}
-	if maxSym == 0 {
-		// All 0
-		return 0
-	}
-	cnt := int(slices.Max(hist[:maxSym]))
-	if cnt == len(data) {
-		// RLE
-		return 0
-	}
-	enc.HistogramFinished(maxSym, cnt)
-	err := enc.normalizeCount(len(data))
-	if err != nil {
-		return 0
-	}
-	_, err = enc.writeCount(nil)
-	if err != nil {
-		panic(err)
-	}
-	return 1
-}
-
-// encode will encode the block and append the output in b.output.
-// Previous offset codes must be pushed if more blocks are expected.
-func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
-	if len(b.sequences) == 0 {
-		return b.encodeLits(b.literals, rawAllLits)
-	}
-	if len(b.sequences) == 1 && len(org) > 0 && len(b.literals) <= 1 {
-		// Check common RLE cases.
-		seq := b.sequences[0]
-		if seq.litLen == uint32(len(b.literals)) && seq.offset-3 == 1 {
-			// Offset == 1 and 0 or 1 literals.
-			b.encodeRLE(org[0], b.sequences[0].matchLen+zstdMinMatch+seq.litLen)
-			return nil
-		}
-	}
-
-	// We want some difference to at least account for the headers.
-	saved := b.size - len(b.literals) - (b.size >> 6)
-	if saved < 16 {
-		if org == nil {
-			return errIncompressible
-		}
-		b.popOffsets()
-		return b.encodeLits(org, rawAllLits)
-	}
-
-	var bh blockHeader
-	var lh literalsHeader
-	bh.setLast(b.last)
-	bh.setType(blockTypeCompressed)
-	// Store offset of the block header. Needed when we know the size.
-	bhOffset := len(b.output)
-	b.output = bh.appendTo(b.output)
-
-	var (
-		out            []byte
-		reUsed, single bool
-		err            error
-	)
-	if b.dictLitEnc != nil {
-		b.litEnc.TransferCTable(b.dictLitEnc)
-		b.litEnc.Reuse = huff0.ReusePolicyAllow
-		b.dictLitEnc = nil
-	}
-	if len(b.literals) >= 1024 && !raw {
-		// Use 4 Streams.
-		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
-	} else if len(b.literals) > 16 && !raw {
-		// Use 1 stream
-		single = true
-		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
-	} else {
-		err = huff0.ErrIncompressible
-	}
-
-	if err == nil && len(out)+5 > len(b.literals) {
-		// If we are close, we may still be worse or equal to raw.
-		var lh literalsHeader
-		lh.setSize(len(b.literals))
-		szRaw := lh.size()
-		lh.setSizes(len(out), len(b.literals), single)
-		szComp := lh.size()
-		if len(out)+szComp >= len(b.literals)+szRaw {
-			err = huff0.ErrIncompressible
-		}
-	}
-	switch err {
-	case huff0.ErrIncompressible:
-		lh.setType(literalsBlockRaw)
-		lh.setSize(len(b.literals))
-		b.output = lh.appendTo(b.output)
-		b.output = append(b.output, b.literals...)
-		if debugEncoder {
-			println("Adding literals RAW, length", len(b.literals))
-		}
-	case huff0.ErrUseRLE:
-		lh.setType(literalsBlockRLE)
-		lh.setSize(len(b.literals))
-		b.output = lh.appendTo(b.output)
-		b.output = append(b.output, b.literals[0])
-		if debugEncoder {
-			println("Adding literals RLE")
-		}
-	case nil:
-		// Compressed litLen...
-		if reUsed {
-			if debugEncoder {
-				println("reused tree")
-			}
-			lh.setType(literalsBlockTreeless)
-		} else {
-			if debugEncoder {
-				println("new tree, size:", len(b.litEnc.OutTable))
-			}
-			lh.setType(literalsBlockCompressed)
-			if debugEncoder {
-				_, _, err := huff0.ReadTable(out, nil)
-				if err != nil {
-					panic(err)
-				}
-			}
-		}
-		lh.setSizes(len(out), len(b.literals), single)
-		if debugEncoder {
-			printf("Compressed %d literals to %d bytes", len(b.literals), len(out))
-			println("Adding literal header:", lh)
-		}
-		b.output = lh.appendTo(b.output)
-		b.output = append(b.output, out...)
-		b.litEnc.Reuse = huff0.ReusePolicyAllow
-		if debugEncoder {
-			println("Adding literals compressed")
-		}
-	default:
-		if debugEncoder {
-			println("Adding literals ERROR:", err)
-		}
-		return err
-	}
-	// Sequence compression
-
-	// Write the number of sequences
-	switch {
-	case len(b.sequences) < 128:
-		b.output = append(b.output, uint8(len(b.sequences)))
-	case len(b.sequences) < 0x7f00: // TODO: this could be wrong
-		n := len(b.sequences)
-		b.output = append(b.output, 128+uint8(n>>8), uint8(n))
-	default:
-		n := len(b.sequences) - 0x7f00
-		b.output = append(b.output, 255, uint8(n), uint8(n>>8))
-	}
-	if debugEncoder {
-		println("Encoding", len(b.sequences), "sequences")
-	}
-	b.genCodes()
-	llEnc := b.coders.llEnc
-	ofEnc := b.coders.ofEnc
-	mlEnc := b.coders.mlEnc
-	err = llEnc.normalizeCount(len(b.sequences))
-	if err != nil {
-		return err
-	}
-	err = ofEnc.normalizeCount(len(b.sequences))
-	if err != nil {
-		return err
-	}
-	err = mlEnc.normalizeCount(len(b.sequences))
-	if err != nil {
-		return err
-	}
-
-	// Choose the best compression mode for each type.
-	// Will evaluate the new vs predefined and previous.
-	chooseComp := func(cur, prev, preDef *fseEncoder) (*fseEncoder, seqCompMode) {
-		// See if predefined/previous is better
-		hist := cur.count[:cur.symbolLen]
-		nSize := cur.approxSize(hist) + cur.maxHeaderSize()
-		predefSize := preDef.approxSize(hist)
-		prevSize := prev.approxSize(hist)
-
-		// Add a small penalty for new encoders.
-		// Don't bother with extremely small (<2 byte gains).
-		nSize = nSize + (nSize+2*8*16)>>4
-		switch {
-		case predefSize <= prevSize && predefSize <= nSize || forcePreDef:
-			if debugEncoder {
-				println("Using predefined", predefSize>>3, "<=", nSize>>3)
-			}
-			return preDef, compModePredefined
-		case prevSize <= nSize:
-			if debugEncoder {
-				println("Using previous", prevSize>>3, "<=", nSize>>3)
-			}
-			return prev, compModeRepeat
-		default:
-			if debugEncoder {
-				println("Using new, predef", predefSize>>3, ". previous:", prevSize>>3, ">", nSize>>3, "header max:", cur.maxHeaderSize()>>3, "bytes")
-				println("tl:", cur.actualTableLog, "symbolLen:", cur.symbolLen, "norm:", cur.norm[:cur.symbolLen], "hist", cur.count[:cur.symbolLen])
-			}
-			return cur, compModeFSE
-		}
-	}
-
-	// Write compression mode
-	var mode uint8
-	if llEnc.useRLE {
-		mode |= uint8(compModeRLE) << 6
-		llEnc.setRLE(b.sequences[0].llCode)
-		if debugEncoder {
-			println("llEnc.useRLE")
-		}
-	} else {
-		var m seqCompMode
-		llEnc, m = chooseComp(llEnc, b.coders.llPrev, &fsePredefEnc[tableLiteralLengths])
-		mode |= uint8(m) << 6
-	}
-	if ofEnc.useRLE {
-		mode |= uint8(compModeRLE) << 4
-		ofEnc.setRLE(b.sequences[0].ofCode)
-		if debugEncoder {
-			println("ofEnc.useRLE")
-		}
-	} else {
-		var m seqCompMode
-		ofEnc, m = chooseComp(ofEnc, b.coders.ofPrev, &fsePredefEnc[tableOffsets])
-		mode |= uint8(m) << 4
-	}
-
-	if mlEnc.useRLE {
-		mode |= uint8(compModeRLE) << 2
-		mlEnc.setRLE(b.sequences[0].mlCode)
-		if debugEncoder {
-			println("mlEnc.useRLE, code: ", b.sequences[0].mlCode, "value", b.sequences[0].matchLen)
-		}
-	} else {
-		var m seqCompMode
-		mlEnc, m = chooseComp(mlEnc, b.coders.mlPrev, &fsePredefEnc[tableMatchLengths])
-		mode |= uint8(m) << 2
-	}
-	b.output = append(b.output, mode)
-	if debugEncoder {
-		printf("Compression modes: 0b%b", mode)
-	}
-	b.output, err = llEnc.writeCount(b.output)
-	if err != nil {
-		return err
-	}
-	start := len(b.output)
-	b.output, err = ofEnc.writeCount(b.output)
-	if err != nil {
-		return err
-	}
-	if false {
-		println("block:", b.output[start:], "tablelog", ofEnc.actualTableLog, "maxcount:", ofEnc.maxCount)
-		fmt.Printf("selected TableLog: %d, Symbol length: %d\n", ofEnc.actualTableLog, ofEnc.symbolLen)
-		for i, v := range ofEnc.norm[:ofEnc.symbolLen] {
-			fmt.Printf("%3d: %5d -> %4d \n", i, ofEnc.count[i], v)
-		}
-	}
-	b.output, err = mlEnc.writeCount(b.output)
-	if err != nil {
-		return err
-	}
-
-	// Maybe in block?
-	wr := &b.wr
-	wr.reset(b.output)
-
-	var ll, of, ml cState
-
-	// Current sequence
-	seq := len(b.sequences) - 1
-	s := b.sequences[seq]
-	llEnc.setBits(llBitsTable[:])
-	mlEnc.setBits(mlBitsTable[:])
-	ofEnc.setBits(nil)
-
-	llTT, ofTT, mlTT := llEnc.ct.symbolTT[:256], ofEnc.ct.symbolTT[:256], mlEnc.ct.symbolTT[:256]
-
-	// We have 3 bounds checks here (and in the loop).
-	// Since we are iterating backwards it is kinda hard to avoid.
-	llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
-	ll.init(wr, &llEnc.ct, llB)
-	of.init(wr, &ofEnc.ct, ofB)
-	wr.flush32()
-	ml.init(wr, &mlEnc.ct, mlB)
-
-	// Each of these lookups also generates a bounds check.
-	wr.addBits32NC(s.litLen, llB.outBits)
-	wr.addBits32NC(s.matchLen, mlB.outBits)
-	wr.flush32()
-	wr.addBits32NC(s.offset, ofB.outBits)
-	if debugSequences {
-		println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB)
-	}
-	seq--
-	// Store sequences in reverse...
-	for seq >= 0 {
-		s = b.sequences[seq]
-
-		ofB := ofTT[s.ofCode]
-		wr.flush32() // tablelog max is below 8 for each, so it will fill max 24 bits.
-		//of.encode(ofB)
-		nbBitsOut := (uint32(of.state) + ofB.deltaNbBits) >> 16
-		dstState := int32(of.state>>(nbBitsOut&15)) + int32(ofB.deltaFindState)
-		wr.addBits16NC(of.state, uint8(nbBitsOut))
-		of.state = of.stateTable[dstState]
-
-		// Accumulate extra bits.
-		outBits := ofB.outBits & 31
-		extraBits := uint64(s.offset & bitMask32[outBits])
-		extraBitsN := outBits
-
-		mlB := mlTT[s.mlCode]
-		//ml.encode(mlB)
-		nbBitsOut = (uint32(ml.state) + mlB.deltaNbBits) >> 16
-		dstState = int32(ml.state>>(nbBitsOut&15)) + int32(mlB.deltaFindState)
-		wr.addBits16NC(ml.state, uint8(nbBitsOut))
-		ml.state = ml.stateTable[dstState]
-
-		outBits = mlB.outBits & 31
-		extraBits = extraBits<<outBits | uint64(s.matchLen&bitMask32[outBits])
-		extraBitsN += outBits
-
-		llB := llTT[s.llCode]
-		//ll.encode(llB)
-		nbBitsOut = (uint32(ll.state) + llB.deltaNbBits) >> 16
-		dstState = int32(ll.state>>(nbBitsOut&15)) + int32(llB.deltaFindState)
-		wr.addBits16NC(ll.state, uint8(nbBitsOut))
-		ll.state = ll.stateTable[dstState]
-
-		outBits = llB.outBits & 31
-		extraBits = extraBits<<outBits | uint64(s.litLen&bitMask32[outBits])
-		extraBitsN += outBits
-
-		wr.flush32()
-		wr.addBits64NC(extraBits, extraBitsN)
-
-		if debugSequences {
-			println("Encoded seq", seq, s)
-		}
-
-		seq--
-	}
-	ml.flush(mlEnc.actualTableLog)
-	of.flush(ofEnc.actualTableLog)
-	ll.flush(llEnc.actualTableLog)
-	wr.close()
-	b.output = wr.out
-
-	// Maybe even add a bigger margin.
-	if len(b.output)-3-bhOffset >= b.size {
-		// Discard and encode as raw block.
-		b.output = b.encodeRawTo(b.output[:bhOffset], org)
-		b.popOffsets()
-		b.litEnc.Reuse = huff0.ReusePolicyNone
-		return nil
-	}
-
-	// Size is output minus block header.
-	bh.setSize(uint32(len(b.output)-bhOffset) - 3)
-	if debugEncoder {
-		println("Rewriting block header", bh)
-	}
-	_ = bh.appendTo(b.output[bhOffset:bhOffset])
-	b.coders.setPrev(llEnc, mlEnc, ofEnc)
-	return nil
-}
-
-var errIncompressible = errors.New("incompressible")
-
-func (b *blockEnc) genCodes() {
-	if len(b.sequences) == 0 {
-		// nothing to do
-		return
-	}
-	if len(b.sequences) > math.MaxUint16 {
-		panic("can only encode up to 64K sequences")
-	}
-	// No bounds checks after here:
-	llH := b.coders.llEnc.Histogram()
-	ofH := b.coders.ofEnc.Histogram()
-	mlH := b.coders.mlEnc.Histogram()
-	for i := range llH {
-		llH[i] = 0
-	}
-	for i := range ofH {
-		ofH[i] = 0
-	}
-	for i := range mlH {
-		mlH[i] = 0
-	}
-
-	var llMax, ofMax, mlMax uint8
-	for i := range b.sequences {
-		seq := &b.sequences[i]
-		v := llCode(seq.litLen)
-		seq.llCode = v
-		llH[v]++
-		if v > llMax {
-			llMax = v
-		}
-
-		v = ofCode(seq.offset)
-		seq.ofCode = v
-		ofH[v]++
-		if v > ofMax {
-			ofMax = v
-		}
-
-		v = mlCode(seq.matchLen)
-		seq.mlCode = v
-		mlH[v]++
-		if v > mlMax {
-			mlMax = v
-			if debugAsserts && mlMax > maxMatchLengthSymbol {
-				panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen))
-			}
-		}
-	}
-	if debugAsserts && mlMax > maxMatchLengthSymbol {
-		panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d)", mlMax))
-	}
-	if debugAsserts && ofMax > maxOffsetBits {
-		panic(fmt.Errorf("ofMax > maxOffsetBits (%d)", ofMax))
-	}
-	if debugAsserts && llMax > maxLiteralLengthSymbol {
-		panic(fmt.Errorf("llMax > maxLiteralLengthSymbol (%d)", llMax))
-	}
-
-	b.coders.mlEnc.HistogramFinished(mlMax, int(slices.Max(mlH[:mlMax+1])))
-	b.coders.ofEnc.HistogramFinished(ofMax, int(slices.Max(ofH[:ofMax+1])))
-	b.coders.llEnc.HistogramFinished(llMax, int(slices.Max(llH[:llMax+1])))
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/blocktype_string.go b/vendor/github.com/klauspost/compress/zstd/blocktype_string.go
deleted file mode 100644
index 01a01e486..000000000
--- a/vendor/github.com/klauspost/compress/zstd/blocktype_string.go
+++ /dev/null
@@ -1,85 +0,0 @@
-// Code generated by "stringer -type=blockType,literalsBlockType,seqCompMode,tableIndex"; DO NOT EDIT.
-
-package zstd
-
-import "strconv"
-
-func _() {
-	// An "invalid array index" compiler error signifies that the constant values have changed.
-	// Re-run the stringer command to generate them again.
-	var x [1]struct{}
-	_ = x[blockTypeRaw-0]
-	_ = x[blockTypeRLE-1]
-	_ = x[blockTypeCompressed-2]
-	_ = x[blockTypeReserved-3]
-}
-
-const _blockType_name = "blockTypeRawblockTypeRLEblockTypeCompressedblockTypeReserved"
-
-var _blockType_index = [...]uint8{0, 12, 24, 43, 60}
-
-func (i blockType) String() string {
-	if i >= blockType(len(_blockType_index)-1) {
-		return "blockType(" + strconv.FormatInt(int64(i), 10) + ")"
-	}
-	return _blockType_name[_blockType_index[i]:_blockType_index[i+1]]
-}
-func _() {
-	// An "invalid array index" compiler error signifies that the constant values have changed.
-	// Re-run the stringer command to generate them again.
-	var x [1]struct{}
-	_ = x[literalsBlockRaw-0]
-	_ = x[literalsBlockRLE-1]
-	_ = x[literalsBlockCompressed-2]
-	_ = x[literalsBlockTreeless-3]
-}
-
-const _literalsBlockType_name = "literalsBlockRawliteralsBlockRLEliteralsBlockCompressedliteralsBlockTreeless"
-
-var _literalsBlockType_index = [...]uint8{0, 16, 32, 55, 76}
-
-func (i literalsBlockType) String() string {
-	if i >= literalsBlockType(len(_literalsBlockType_index)-1) {
-		return "literalsBlockType(" + strconv.FormatInt(int64(i), 10) + ")"
-	}
-	return _literalsBlockType_name[_literalsBlockType_index[i]:_literalsBlockType_index[i+1]]
-}
-func _() {
-	// An "invalid array index" compiler error signifies that the constant values have changed.
-	// Re-run the stringer command to generate them again.
-	var x [1]struct{}
-	_ = x[compModePredefined-0]
-	_ = x[compModeRLE-1]
-	_ = x[compModeFSE-2]
-	_ = x[compModeRepeat-3]
-}
-
-const _seqCompMode_name = "compModePredefinedcompModeRLEcompModeFSEcompModeRepeat"
-
-var _seqCompMode_index = [...]uint8{0, 18, 29, 40, 54}
-
-func (i seqCompMode) String() string {
-	if i >= seqCompMode(len(_seqCompMode_index)-1) {
-		return "seqCompMode(" + strconv.FormatInt(int64(i), 10) + ")"
-	}
-	return _seqCompMode_name[_seqCompMode_index[i]:_seqCompMode_index[i+1]]
-}
-func _() {
-	// An "invalid array index" compiler error signifies that the constant values have changed.
-	// Re-run the stringer command to generate them again.
-	var x [1]struct{}
-	_ = x[tableLiteralLengths-0]
-	_ = x[tableOffsets-1]
-	_ = x[tableMatchLengths-2]
-}
-
-const _tableIndex_name = "tableLiteralLengthstableOffsetstableMatchLengths"
-
-var _tableIndex_index = [...]uint8{0, 19, 31, 48}
-
-func (i tableIndex) String() string {
-	if i >= tableIndex(len(_tableIndex_index)-1) {
-		return "tableIndex(" + strconv.FormatInt(int64(i), 10) + ")"
-	}
-	return _tableIndex_name[_tableIndex_index[i]:_tableIndex_index[i+1]]
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
deleted file mode 100644
index 55a388553..000000000
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"fmt"
-	"io"
-)
-
-type byteBuffer interface {
-	// Read up to 8 bytes.
-	// Returns io.ErrUnexpectedEOF if this cannot be satisfied.
-	readSmall(n int) ([]byte, error)
-
-	// Read >8 bytes.
-	// MAY use the destination slice.
-	readBig(n int, dst []byte) ([]byte, error)
-
-	// Read a single byte.
-	readByte() (byte, error)
-
-	// Skip n bytes.
-	skipN(n int64) error
-}
-
-// in-memory buffer
-type byteBuf []byte
-
-func (b *byteBuf) readSmall(n int) ([]byte, error) {
-	if debugAsserts && n > 8 {
-		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
-	}
-	bb := *b
-	if len(bb) < n {
-		return nil, io.ErrUnexpectedEOF
-	}
-	r := bb[:n]
-	*b = bb[n:]
-	return r, nil
-}
-
-func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
-	bb := *b
-	if len(bb) < n {
-		return nil, io.ErrUnexpectedEOF
-	}
-	r := bb[:n]
-	*b = bb[n:]
-	return r, nil
-}
-
-func (b *byteBuf) readByte() (byte, error) {
-	bb := *b
-	if len(bb) < 1 {
-		return 0, io.ErrUnexpectedEOF
-	}
-	r := bb[0]
-	*b = bb[1:]
-	return r, nil
-}
-
-func (b *byteBuf) skipN(n int64) error {
-	bb := *b
-	if n < 0 {
-		return fmt.Errorf("negative skip (%d) requested", n)
-	}
-	if int64(len(bb)) < n {
-		return io.ErrUnexpectedEOF
-	}
-	*b = bb[n:]
-	return nil
-}
-
-// wrapper around a reader.
-type readerWrapper struct {
-	r   io.Reader
-	tmp [8]byte
-}
-
-func (r *readerWrapper) readSmall(n int) ([]byte, error) {
-	if debugAsserts && n > 8 {
-		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
-	}
-	n2, err := io.ReadFull(r.r, r.tmp[:n])
-	// We only really care about the actual bytes read.
-	if err != nil {
-		if err == io.EOF {
-			return nil, io.ErrUnexpectedEOF
-		}
-		if debugDecoder {
-			println("readSmall: got", n2, "want", n, "err", err)
-		}
-		return nil, err
-	}
-	return r.tmp[:n], nil
-}
-
-func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
-	if cap(dst) < n {
-		dst = make([]byte, n)
-	}
-	n2, err := io.ReadFull(r.r, dst[:n])
-	if err == io.EOF && n > 0 {
-		err = io.ErrUnexpectedEOF
-	}
-	return dst[:n2], err
-}
-
-func (r *readerWrapper) readByte() (byte, error) {
-	n2, err := io.ReadFull(r.r, r.tmp[:1])
-	if err != nil {
-		if err == io.EOF {
-			err = io.ErrUnexpectedEOF
-		}
-		return 0, err
-	}
-	if n2 != 1 {
-		return 0, io.ErrUnexpectedEOF
-	}
-	return r.tmp[0], nil
-}
-
-func (r *readerWrapper) skipN(n int64) error {
-	n2, err := io.CopyN(io.Discard, r.r, n)
-	if n2 != n {
-		err = io.ErrUnexpectedEOF
-	}
-	return err
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
deleted file mode 100644
index 0e59a242d..000000000
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// byteReader provides a byte reader that reads
-// little endian values from a byte stream.
-// The input stream is manually advanced.
-// The reader performs no bounds checks.
-type byteReader struct {
-	b   []byte
-	off int
-}
-
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
-	b.off += int(n)
-}
-
-// overread returns whether we have advanced too far.
-func (b *byteReader) overread() bool {
-	return b.off > len(b.b)
-}
-
-// Int32 returns a little endian int32 starting at current offset.
-func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off:]
-	b2 = b2[:4]
-	v3 := int32(b2[3])
-	v2 := int32(b2[2])
-	v1 := int32(b2[1])
-	v0 := int32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
-}
-
-// Uint8 returns the next byte
-func (b *byteReader) Uint8() uint8 {
-	v := b.b[b.off]
-	return v
-}
-
-// Uint32 returns a little endian uint32 starting at current offset.
-func (b byteReader) Uint32() uint32 {
-	if r := b.remain(); r < 4 {
-		// Very rare
-		v := uint32(0)
-		for i := 1; i <= r; i++ {
-			v = (v << 8) | uint32(b.b[len(b.b)-i])
-		}
-		return v
-	}
-	b2 := b.b[b.off:]
-	b2 = b2[:4]
-	v3 := uint32(b2[3])
-	v2 := uint32(b2[2])
-	v1 := uint32(b2[1])
-	v0 := uint32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
-}
-
-// Uint32NC returns a little endian uint32 starting at current offset.
-// The caller must be sure if there are at least 4 bytes left.
-func (b byteReader) Uint32NC() uint32 {
-	b2 := b.b[b.off:]
-	b2 = b2[:4]
-	v3 := uint32(b2[3])
-	v2 := uint32(b2[2])
-	v1 := uint32(b2[1])
-	v0 := uint32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
-}
-
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
-	return b.b[b.off:]
-}
-
-// remain will return the number of bytes remaining.
-func (b byteReader) remain() int {
-	return len(b.b) - b.off
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
deleted file mode 100644
index 6a5a2988b..000000000
--- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go
+++ /dev/null
@@ -1,261 +0,0 @@
-// Copyright 2020+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-
-package zstd
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-)
-
-// HeaderMaxSize is the maximum size of a Frame and Block Header.
-// If less is sent to Header.Decode it *may* still contain enough information.
-const HeaderMaxSize = 14 + 3
-
-// Header contains information about the first frame and block within that.
-type Header struct {
-	// SingleSegment specifies whether the data is to be decompressed into a
-	// single contiguous memory segment.
-	// It implies that WindowSize is invalid and that FrameContentSize is valid.
-	SingleSegment bool
-
-	// WindowSize is the window of data to keep while decoding.
-	// Will only be set if SingleSegment is false.
-	WindowSize uint64
-
-	// Dictionary ID.
-	// If 0, no dictionary.
-	DictionaryID uint32
-
-	// HasFCS specifies whether FrameContentSize has a valid value.
-	HasFCS bool
-
-	// FrameContentSize is the expected uncompressed size of the entire frame.
-	FrameContentSize uint64
-
-	// Skippable will be true if the frame is meant to be skipped.
-	// This implies that FirstBlock.OK is false.
-	Skippable bool
-
-	// SkippableID is the user-specific ID for the skippable frame.
-	// Valid values are between 0 to 15, inclusive.
-	SkippableID int
-
-	// SkippableSize is the length of the user data to skip following
-	// the header.
-	SkippableSize uint32
-
-	// HeaderSize is the raw size of the frame header.
-	//
-	// For normal frames, it includes the size of the magic number and
-	// the size of the header (per section 3.1.1.1).
-	// It does not include the size for any data blocks (section 3.1.1.2) nor
-	// the size for the trailing content checksum.
-	//
-	// For skippable frames, this counts the size of the magic number
-	// along with the size of the size field of the payload.
-	// It does not include the size of the skippable payload itself.
-	// The total frame size is the HeaderSize plus the SkippableSize.
-	HeaderSize int
-
-	// First block information.
-	FirstBlock struct {
-		// OK will be set if first block could be decoded.
-		OK bool
-
-		// Is this the last block of a frame?
-		Last bool
-
-		// Is the data compressed?
-		// If true CompressedSize will be populated.
-		// Unfortunately DecompressedSize cannot be determined
-		// without decoding the blocks.
-		Compressed bool
-
-		// DecompressedSize is the expected decompressed size of the block.
-		// Will be 0 if it cannot be determined.
-		DecompressedSize int
-
-		// CompressedSize of the data in the block.
-		// Does not include the block header.
-		// Will be equal to DecompressedSize if not Compressed.
-		CompressedSize int
-	}
-
-	// If set there is a checksum present for the block content.
-	// The checksum field at the end is always 4 bytes long.
-	HasCheckSum bool
-}
-
-// Decode the header from the beginning of the stream.
-// This will decode the frame header and the first block header if enough bytes are provided.
-// It is recommended to provide at least HeaderMaxSize bytes.
-// If the frame header cannot be read an error will be returned.
-// If there isn't enough input, io.ErrUnexpectedEOF is returned.
-// The FirstBlock.OK will indicate if enough information was available to decode the first block header.
-func (h *Header) Decode(in []byte) error {
-	_, err := h.DecodeAndStrip(in)
-	return err
-}
-
-// DecodeAndStrip will decode the header from the beginning of the stream
-// and on success return the remaining bytes.
-// This will decode the frame header and the first block header if enough bytes are provided.
-// It is recommended to provide at least HeaderMaxSize bytes.
-// If the frame header cannot be read an error will be returned.
-// If there isn't enough input, io.ErrUnexpectedEOF is returned.
-// The FirstBlock.OK will indicate if enough information was available to decode the first block header.
-func (h *Header) DecodeAndStrip(in []byte) (remain []byte, err error) {
-	*h = Header{}
-	if len(in) < 4 {
-		return nil, io.ErrUnexpectedEOF
-	}
-	h.HeaderSize += 4
-	b, in := in[:4], in[4:]
-	if string(b) != frameMagic {
-		if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
-			return nil, ErrMagicMismatch
-		}
-		if len(in) < 4 {
-			return nil, io.ErrUnexpectedEOF
-		}
-		h.HeaderSize += 4
-		h.Skippable = true
-		h.SkippableID = int(b[0] & 0xf)
-		h.SkippableSize = binary.LittleEndian.Uint32(in)
-		return in[4:], nil
-	}
-
-	// Read Window_Descriptor
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
-	if len(in) < 1 {
-		return nil, io.ErrUnexpectedEOF
-	}
-	fhd, in := in[0], in[1:]
-	h.HeaderSize++
-	h.SingleSegment = fhd&(1<<5) != 0
-	h.HasCheckSum = fhd&(1<<2) != 0
-	if fhd&(1<<3) != 0 {
-		return nil, errors.New("reserved bit set on frame header")
-	}
-
-	if !h.SingleSegment {
-		if len(in) < 1 {
-			return nil, io.ErrUnexpectedEOF
-		}
-		var wd byte
-		wd, in = in[0], in[1:]
-		h.HeaderSize++
-		windowLog := 10 + (wd >> 3)
-		windowBase := uint64(1) << windowLog
-		windowAdd := (windowBase / 8) * uint64(wd&0x7)
-		h.WindowSize = windowBase + windowAdd
-	}
-
-	// Read Dictionary_ID
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	if size := fhd & 3; size != 0 {
-		if size == 3 {
-			size = 4
-		}
-		if len(in) < int(size) {
-			return nil, io.ErrUnexpectedEOF
-		}
-		b, in = in[:size], in[size:]
-		h.HeaderSize += int(size)
-		switch len(b) {
-		case 1:
-			h.DictionaryID = uint32(b[0])
-		case 2:
-			h.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
-		case 4:
-			h.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
-		}
-	}
-
-	// Read Frame_Content_Size
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size
-	var fcsSize int
-	v := fhd >> 6
-	switch v {
-	case 0:
-		if h.SingleSegment {
-			fcsSize = 1
-		}
-	default:
-		fcsSize = 1 << v
-	}
-
-	if fcsSize > 0 {
-		h.HasFCS = true
-		if len(in) < fcsSize {
-			return nil, io.ErrUnexpectedEOF
-		}
-		b, in = in[:fcsSize], in[fcsSize:]
-		h.HeaderSize += int(fcsSize)
-		switch len(b) {
-		case 1:
-			h.FrameContentSize = uint64(b[0])
-		case 2:
-			// When FCS_Field_Size is 2, the offset of 256 is added.
-			h.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) + 256
-		case 4:
-			h.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24)
-		case 8:
-			d1 := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
-			d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
-			h.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
-		}
-	}
-
-	// Frame Header done, we will not fail from now on.
-	if len(in) < 3 {
-		return in, nil
-	}
-	tmp := in[:3]
-	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
-	h.FirstBlock.Last = bh&1 != 0
-	blockType := blockType((bh >> 1) & 3)
-	// find size.
-	cSize := int(bh >> 3)
-	switch blockType {
-	case blockTypeReserved:
-		return in, nil
-	case blockTypeRLE:
-		h.FirstBlock.Compressed = true
-		h.FirstBlock.DecompressedSize = cSize
-		h.FirstBlock.CompressedSize = 1
-	case blockTypeCompressed:
-		h.FirstBlock.Compressed = true
-		h.FirstBlock.CompressedSize = cSize
-	case blockTypeRaw:
-		h.FirstBlock.DecompressedSize = cSize
-		h.FirstBlock.CompressedSize = cSize
-	default:
-		panic("Invalid block type")
-	}
-
-	h.FirstBlock.OK = true
-	return in, nil
-}
-
-// AppendTo will append the encoded header to the dst slice.
-// There is no error checking performed on the header values.
-func (h *Header) AppendTo(dst []byte) ([]byte, error) {
-	if h.Skippable {
-		magic := [4]byte{0x50, 0x2a, 0x4d, 0x18}
-		magic[0] |= byte(h.SkippableID & 0xf)
-		dst = append(dst, magic[:]...)
-		f := h.SkippableSize
-		return append(dst, uint8(f), uint8(f>>8), uint8(f>>16), uint8(f>>24)), nil
-	}
-	f := frameHeader{
-		ContentSize:   h.FrameContentSize,
-		WindowSize:    uint32(h.WindowSize),
-		SingleSegment: h.SingleSegment,
-		Checksum:      h.HasCheckSum,
-		DictID:        h.DictionaryID,
-	}
-	return f.appendTo(dst), nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
deleted file mode 100644
index ea2a19376..000000000
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ /dev/null
@@ -1,949 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"context"
-	"encoding/binary"
-	"io"
-	"sync"
-
-	"github.com/klauspost/compress/zstd/internal/xxhash"
-)
-
-// Decoder provides decoding of zstandard streams.
-// The decoder has been designed to operate without allocations after a warmup.
-// This means that you should store the decoder for best performance.
-// To re-use a stream decoder, use the Reset(r io.Reader) error to switch to another stream.
-// A decoder can safely be re-used even if the previous stream failed.
-// To release the resources, you must call the Close() function on a decoder.
-type Decoder struct {
-	o decoderOptions
-
-	// Unreferenced decoders, ready for use.
-	decoders chan *blockDec
-
-	// Current read position used for Reader functionality.
-	current decoderState
-
-	// sync stream decoding
-	syncStream struct {
-		decodedFrame uint64
-		br           readerWrapper
-		enabled      bool
-		inFrame      bool
-		dstBuf       []byte
-	}
-
-	frame *frameDec
-
-	// Custom dictionaries.
-	dicts map[uint32]*dict
-
-	// streamWg is the waitgroup for all streams
-	streamWg sync.WaitGroup
-}
-
-// decoderState is used for maintaining state when the decoder
-// is used for streaming.
-type decoderState struct {
-	// current block being written to stream.
-	decodeOutput
-
-	// output in order to be written to stream.
-	output chan decodeOutput
-
-	// cancel remaining output.
-	cancel context.CancelFunc
-
-	// crc of current frame
-	crc *xxhash.Digest
-
-	flushed bool
-}
-
-var (
-	// Check the interfaces we want to support.
-	_ = io.WriterTo(&Decoder{})
-	_ = io.Reader(&Decoder{})
-)
-
-// NewReader creates a new decoder.
-// A nil Reader can be provided in which case Reset can be used to start a decode.
-//
-// A Decoder can be used in two modes:
-//
-// 1) As a stream, or
-// 2) For stateless decoding using DecodeAll.
-//
-// Only a single stream can be decoded concurrently, but the same decoder
-// can run multiple concurrent stateless decodes. It is even possible to
-// use stateless decodes while a stream is being decoded.
-//
-// The Reset function can be used to initiate a new stream, which will considerably
-// reduce the allocations normally caused by NewReader.
-func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
-	initPredefined()
-	var d Decoder
-	d.o.setDefault()
-	for _, o := range opts {
-		err := o(&d.o)
-		if err != nil {
-			return nil, err
-		}
-	}
-	d.current.crc = xxhash.New()
-	d.current.flushed = true
-
-	if r == nil {
-		d.current.err = ErrDecoderNilInput
-	}
-
-	// Transfer option dicts.
-	d.dicts = make(map[uint32]*dict, len(d.o.dicts))
-	for _, dc := range d.o.dicts {
-		d.dicts[dc.id] = dc
-	}
-	d.o.dicts = nil
-
-	// Create decoders
-	d.decoders = make(chan *blockDec, d.o.concurrent)
-	for i := 0; i < d.o.concurrent; i++ {
-		dec := newBlockDec(d.o.lowMem)
-		dec.localFrame = newFrameDec(d.o)
-		d.decoders <- dec
-	}
-
-	if r == nil {
-		return &d, nil
-	}
-	return &d, d.Reset(r)
-}
-
-// Read bytes from the decompressed stream into p.
-// Returns the number of bytes read and any error that occurred.
-// When the stream is done, io.EOF will be returned.
-func (d *Decoder) Read(p []byte) (int, error) {
-	var n int
-	for {
-		if len(d.current.b) > 0 {
-			filled := copy(p, d.current.b)
-			p = p[filled:]
-			d.current.b = d.current.b[filled:]
-			n += filled
-		}
-		if len(p) == 0 {
-			break
-		}
-		if len(d.current.b) == 0 {
-			// We have an error and no more data
-			if d.current.err != nil {
-				break
-			}
-			if !d.nextBlock(n == 0) {
-				return n, d.current.err
-			}
-		}
-	}
-	if len(d.current.b) > 0 {
-		if debugDecoder {
-			println("returning", n, "still bytes left:", len(d.current.b))
-		}
-		// Only return error at end of block
-		return n, nil
-	}
-	if d.current.err != nil {
-		d.drainOutput()
-	}
-	if debugDecoder {
-		println("returning", n, d.current.err, len(d.decoders))
-	}
-	return n, d.current.err
-}
-
-// Reset will reset the decoder the supplied stream after the current has finished processing.
-// Note that this functionality cannot be used after Close has been called.
-// Reset can be called with a nil reader to release references to the previous reader.
-// After being called with a nil reader, no other operations than Reset or DecodeAll or Close
-// should be used.
-func (d *Decoder) Reset(r io.Reader) error {
-	if d.current.err == ErrDecoderClosed {
-		return d.current.err
-	}
-
-	d.drainOutput()
-
-	d.syncStream.br.r = nil
-	if r == nil {
-		d.current.err = ErrDecoderNilInput
-		if len(d.current.b) > 0 {
-			d.current.b = d.current.b[:0]
-		}
-		d.current.flushed = true
-		return nil
-	}
-
-	// If bytes buffer and < 5MB, do sync decoding anyway.
-	if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap {
-		bb2 := bb
-		if debugDecoder {
-			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
-		}
-		b := bb2.Bytes()
-		var dst []byte
-		if cap(d.syncStream.dstBuf) > 0 {
-			dst = d.syncStream.dstBuf[:0]
-		}
-
-		dst, err := d.DecodeAll(b, dst)
-		if err == nil {
-			err = io.EOF
-		}
-		// Save output buffer
-		d.syncStream.dstBuf = dst
-		d.current.b = dst
-		d.current.err = err
-		d.current.flushed = true
-		if debugDecoder {
-			println("sync decode to", len(dst), "bytes, err:", err)
-		}
-		return nil
-	}
-	// Remove current block.
-	d.stashDecoder()
-	d.current.decodeOutput = decodeOutput{}
-	d.current.err = nil
-	d.current.flushed = false
-	d.current.d = nil
-	d.syncStream.dstBuf = nil
-
-	// Ensure no-one else is still running...
-	d.streamWg.Wait()
-	if d.frame == nil {
-		d.frame = newFrameDec(d.o)
-	}
-
-	if d.o.concurrent == 1 {
-		return d.startSyncDecoder(r)
-	}
-
-	d.current.output = make(chan decodeOutput, d.o.concurrent)
-	ctx, cancel := context.WithCancel(context.Background())
-	d.current.cancel = cancel
-	d.streamWg.Add(1)
-	go d.startStreamDecoder(ctx, r, d.current.output)
-
-	return nil
-}
-
-// drainOutput will drain the output until errEndOfStream is sent.
-func (d *Decoder) drainOutput() {
-	if d.current.cancel != nil {
-		if debugDecoder {
-			println("cancelling current")
-		}
-		d.current.cancel()
-		d.current.cancel = nil
-	}
-	if d.current.d != nil {
-		if debugDecoder {
-			printf("re-adding current decoder %p, decoders: %d", d.current.d, len(d.decoders))
-		}
-		d.decoders <- d.current.d
-		d.current.d = nil
-		d.current.b = nil
-	}
-	if d.current.output == nil || d.current.flushed {
-		println("current already flushed")
-		return
-	}
-	for v := range d.current.output {
-		if v.d != nil {
-			if debugDecoder {
-				printf("re-adding decoder %p", v.d)
-			}
-			d.decoders <- v.d
-		}
-	}
-	d.current.output = nil
-	d.current.flushed = true
-}
-
-// WriteTo writes data to w until there's no more data to write or when an error occurs.
-// The return value n is the number of bytes written.
-// Any error encountered during the write is also returned.
-func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
-	var n int64
-	for {
-		if len(d.current.b) > 0 {
-			n2, err2 := w.Write(d.current.b)
-			n += int64(n2)
-			if err2 != nil && (d.current.err == nil || d.current.err == io.EOF) {
-				d.current.err = err2
-			} else if n2 != len(d.current.b) {
-				d.current.err = io.ErrShortWrite
-			}
-		}
-		if d.current.err != nil {
-			break
-		}
-		d.nextBlock(true)
-	}
-	err := d.current.err
-	if err != nil {
-		d.drainOutput()
-	}
-	if err == io.EOF {
-		err = nil
-	}
-	return n, err
-}
-
-// DecodeAll allows stateless decoding of a blob of bytes.
-// Output will be appended to dst, so if the destination size is known
-// you can pre-allocate the destination slice to avoid allocations.
-// DecodeAll can be used concurrently.
-// The Decoder concurrency limits will be respected.
-func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
-	if d.decoders == nil {
-		return dst, ErrDecoderClosed
-	}
-
-	// Grab a block decoder and frame decoder.
-	block := <-d.decoders
-	frame := block.localFrame
-	initialSize := len(dst)
-	defer func() {
-		if debugDecoder {
-			printf("re-adding decoder: %p", block)
-		}
-		frame.rawInput = nil
-		frame.bBuf = nil
-		if frame.history.decoders.br != nil {
-			frame.history.decoders.br.in = nil
-			frame.history.decoders.br.cursor = 0
-		}
-		d.decoders <- block
-	}()
-	frame.bBuf = input
-
-	for {
-		frame.history.reset()
-		err := frame.reset(&frame.bBuf)
-		if err != nil {
-			if err == io.EOF {
-				if debugDecoder {
-					println("frame reset return EOF")
-				}
-				return dst, nil
-			}
-			return dst, err
-		}
-		if err = d.setDict(frame); err != nil {
-			return nil, err
-		}
-		if frame.WindowSize > d.o.maxWindowSize {
-			if debugDecoder {
-				println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
-			}
-			return dst, ErrWindowSizeExceeded
-		}
-		if frame.FrameContentSize != fcsUnknown {
-			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) {
-				if debugDecoder {
-					println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst))
-				}
-				return dst, ErrDecoderSizeExceeded
-			}
-			if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) {
-				if debugDecoder {
-					println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst))
-				}
-				return dst, ErrDecoderSizeExceeded
-			}
-			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
-				copy(dst2, dst)
-				dst = dst2
-			}
-		}
-
-		if cap(dst) == 0 && !d.o.limitToCap {
-			// Allocate len(input) * 2 by default if nothing is provided
-			// and we didn't get frame content size.
-			size := len(input) * 2
-			// Cap to 1 MB.
-			if size > 1<<20 {
-				size = 1 << 20
-			}
-			if uint64(size) > d.o.maxDecodedSize {
-				size = int(d.o.maxDecodedSize)
-			}
-			dst = make([]byte, 0, size)
-		}
-
-		dst, err = frame.runDecoder(dst, block)
-		if err != nil {
-			return dst, err
-		}
-		if uint64(len(dst)-initialSize) > d.o.maxDecodedSize {
-			return dst, ErrDecoderSizeExceeded
-		}
-		if len(frame.bBuf) == 0 {
-			if debugDecoder {
-				println("frame dbuf empty")
-			}
-			break
-		}
-	}
-	return dst, nil
-}
-
-// nextBlock returns the next block.
-// If an error occurs d.err will be set.
-// Optionally the function can block for new output.
-// If non-blocking mode is used the returned boolean will be false
-// if no data was available without blocking.
-func (d *Decoder) nextBlock(blocking bool) (ok bool) {
-	if d.current.err != nil {
-		// Keep error state.
-		return false
-	}
-	d.current.b = d.current.b[:0]
-
-	// SYNC:
-	if d.syncStream.enabled {
-		if !blocking {
-			return false
-		}
-		ok = d.nextBlockSync()
-		if !ok {
-			d.stashDecoder()
-		}
-		return ok
-	}
-
-	//ASYNC:
-	d.stashDecoder()
-	if blocking {
-		d.current.decodeOutput, ok = <-d.current.output
-	} else {
-		select {
-		case d.current.decodeOutput, ok = <-d.current.output:
-		default:
-			return false
-		}
-	}
-	if !ok {
-		// This should not happen, so signal error state...
-		d.current.err = io.ErrUnexpectedEOF
-		return false
-	}
-	next := d.current.decodeOutput
-	if next.d != nil && next.d.async.newHist != nil {
-		d.current.crc.Reset()
-	}
-	if debugDecoder {
-		var tmp [4]byte
-		binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b)))
-		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
-	}
-
-	if d.o.ignoreChecksum {
-		return true
-	}
-
-	if len(next.b) > 0 {
-		d.current.crc.Write(next.b)
-	}
-	if next.err == nil && next.d != nil && next.d.hasCRC {
-		got := uint32(d.current.crc.Sum64())
-		if got != next.d.checkCRC {
-			if debugDecoder {
-				printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
-			}
-			d.current.err = ErrCRCMismatch
-		} else {
-			if debugDecoder {
-				printf("CRC ok %08x\n", got)
-			}
-		}
-	}
-
-	return true
-}
-
-func (d *Decoder) nextBlockSync() (ok bool) {
-	if d.current.d == nil {
-		d.current.d = <-d.decoders
-	}
-	for len(d.current.b) == 0 {
-		if !d.syncStream.inFrame {
-			d.frame.history.reset()
-			d.current.err = d.frame.reset(&d.syncStream.br)
-			if d.current.err == nil {
-				d.current.err = d.setDict(d.frame)
-			}
-			if d.current.err != nil {
-				return false
-			}
-			if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
-				d.current.err = ErrDecoderSizeExceeded
-				return false
-			}
-
-			d.syncStream.decodedFrame = 0
-			d.syncStream.inFrame = true
-		}
-		d.current.err = d.frame.next(d.current.d)
-		if d.current.err != nil {
-			return false
-		}
-		d.frame.history.ensureBlock()
-		if debugDecoder {
-			println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame)
-		}
-		histBefore := len(d.frame.history.b)
-		d.current.err = d.current.d.decodeBuf(&d.frame.history)
-
-		if d.current.err != nil {
-			println("error after:", d.current.err)
-			return false
-		}
-		d.current.b = d.frame.history.b[histBefore:]
-		if debugDecoder {
-			println("history after:", len(d.frame.history.b))
-		}
-
-		// Check frame size (before CRC)
-		d.syncStream.decodedFrame += uint64(len(d.current.b))
-		if d.syncStream.decodedFrame > d.frame.FrameContentSize {
-			if debugDecoder {
-				printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
-			}
-			d.current.err = ErrFrameSizeExceeded
-			return false
-		}
-
-		// Check FCS
-		if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize {
-			if debugDecoder {
-				printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
-			}
-			d.current.err = ErrFrameSizeMismatch
-			return false
-		}
-
-		// Update/Check CRC
-		if d.frame.HasCheckSum {
-			if !d.o.ignoreChecksum {
-				d.frame.crc.Write(d.current.b)
-			}
-			if d.current.d.Last {
-				if !d.o.ignoreChecksum {
-					d.current.err = d.frame.checkCRC()
-				} else {
-					d.current.err = d.frame.consumeCRC()
-				}
-				if d.current.err != nil {
-					println("CRC error:", d.current.err)
-					return false
-				}
-			}
-		}
-		d.syncStream.inFrame = !d.current.d.Last
-	}
-	return true
-}
-
-func (d *Decoder) stashDecoder() {
-	if d.current.d != nil {
-		if debugDecoder {
-			printf("re-adding current decoder %p", d.current.d)
-		}
-		d.decoders <- d.current.d
-		d.current.d = nil
-	}
-}
-
-// Close will release all resources.
-// It is NOT possible to reuse the decoder after this.
-func (d *Decoder) Close() {
-	if d.current.err == ErrDecoderClosed {
-		return
-	}
-	d.drainOutput()
-	if d.current.cancel != nil {
-		d.current.cancel()
-		d.streamWg.Wait()
-		d.current.cancel = nil
-	}
-	if d.decoders != nil {
-		close(d.decoders)
-		for dec := range d.decoders {
-			dec.Close()
-		}
-		d.decoders = nil
-	}
-	if d.current.d != nil {
-		d.current.d.Close()
-		d.current.d = nil
-	}
-	d.current.err = ErrDecoderClosed
-}
-
-// IOReadCloser returns the decoder as an io.ReadCloser for convenience.
-// Any changes to the decoder will be reflected, so the returned ReadCloser
-// can be reused along with the decoder.
-// io.WriterTo is also supported by the returned ReadCloser.
-func (d *Decoder) IOReadCloser() io.ReadCloser {
-	return closeWrapper{d: d}
-}
-
-// closeWrapper wraps a function call as a closer.
-type closeWrapper struct {
-	d *Decoder
-}
-
-// WriteTo forwards WriteTo calls to the decoder.
-func (c closeWrapper) WriteTo(w io.Writer) (n int64, err error) {
-	return c.d.WriteTo(w)
-}
-
-// Read forwards read calls to the decoder.
-func (c closeWrapper) Read(p []byte) (n int, err error) {
-	return c.d.Read(p)
-}
-
-// Close closes the decoder.
-func (c closeWrapper) Close() error {
-	c.d.Close()
-	return nil
-}
-
-type decodeOutput struct {
-	d   *blockDec
-	b   []byte
-	err error
-}
-
-func (d *Decoder) startSyncDecoder(r io.Reader) error {
-	d.frame.history.reset()
-	d.syncStream.br = readerWrapper{r: r}
-	d.syncStream.inFrame = false
-	d.syncStream.enabled = true
-	d.syncStream.decodedFrame = 0
-	return nil
-}
-
-// Create Decoder:
-// ASYNC:
-// Spawn 3 go routines.
-// 0: Read frames and decode block literals.
-// 1: Decode sequences.
-// 2: Execute sequences, send to output.
-func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
-	defer d.streamWg.Done()
-	br := readerWrapper{r: r}
-
-	var seqDecode = make(chan *blockDec, d.o.concurrent)
-	var seqExecute = make(chan *blockDec, d.o.concurrent)
-
-	// Async 1: Decode sequences...
-	go func() {
-		var hist history
-		var hasErr bool
-
-		for block := range seqDecode {
-			if hasErr {
-				if block != nil {
-					seqExecute <- block
-				}
-				continue
-			}
-			if block.async.newHist != nil {
-				if debugDecoder {
-					println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
-				}
-				hist.reset()
-				hist.decoders = block.async.newHist.decoders
-				hist.recentOffsets = block.async.newHist.recentOffsets
-				hist.windowSize = block.async.newHist.windowSize
-				if block.async.newHist.dict != nil {
-					hist.setDict(block.async.newHist.dict)
-				}
-			}
-			if block.err != nil || block.Type != blockTypeCompressed {
-				hasErr = block.err != nil
-				seqExecute <- block
-				continue
-			}
-
-			hist.decoders.literals = block.async.literals
-			block.err = block.prepareSequences(block.async.seqData, &hist)
-			if debugDecoder && block.err != nil {
-				println("prepareSequences returned:", block.err)
-			}
-			hasErr = block.err != nil
-			if block.err == nil {
-				block.err = block.decodeSequences(&hist)
-				if debugDecoder && block.err != nil {
-					println("decodeSequences returned:", block.err)
-				}
-				hasErr = block.err != nil
-				//				block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs]
-				block.async.seqSize = hist.decoders.seqSize
-			}
-			seqExecute <- block
-		}
-		close(seqExecute)
-		hist.reset()
-	}()
-
-	var wg sync.WaitGroup
-	wg.Add(1)
-
-	// Async 3: Execute sequences...
-	frameHistCache := d.frame.history.b
-	go func() {
-		var hist history
-		var decodedFrame uint64
-		var fcs uint64
-		var hasErr bool
-		for block := range seqExecute {
-			out := decodeOutput{err: block.err, d: block}
-			if block.err != nil || hasErr {
-				hasErr = true
-				output <- out
-				continue
-			}
-			if block.async.newHist != nil {
-				if debugDecoder {
-					println("Async 2: new history")
-				}
-				hist.reset()
-				hist.windowSize = block.async.newHist.windowSize
-				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
-				if block.async.newHist.dict != nil {
-					hist.setDict(block.async.newHist.dict)
-				}
-
-				if cap(hist.b) < hist.allocFrameBuffer {
-					if cap(frameHistCache) >= hist.allocFrameBuffer {
-						hist.b = frameHistCache
-					} else {
-						hist.b = make([]byte, 0, hist.allocFrameBuffer)
-						println("Alloc history sized", hist.allocFrameBuffer)
-					}
-				}
-				hist.b = hist.b[:0]
-				fcs = block.async.fcs
-				decodedFrame = 0
-			}
-			do := decodeOutput{err: block.err, d: block}
-			switch block.Type {
-			case blockTypeRLE:
-				if debugDecoder {
-					println("add rle block length:", block.RLESize)
-				}
-
-				if cap(block.dst) < int(block.RLESize) {
-					if block.lowMem {
-						block.dst = make([]byte, block.RLESize)
-					} else {
-						block.dst = make([]byte, maxCompressedBlockSize)
-					}
-				}
-				block.dst = block.dst[:block.RLESize]
-				v := block.data[0]
-				for i := range block.dst {
-					block.dst[i] = v
-				}
-				hist.append(block.dst)
-				do.b = block.dst
-			case blockTypeRaw:
-				if debugDecoder {
-					println("add raw block length:", len(block.data))
-				}
-				hist.append(block.data)
-				do.b = block.data
-			case blockTypeCompressed:
-				if debugDecoder {
-					println("execute with history length:", len(hist.b), "window:", hist.windowSize)
-				}
-				hist.decoders.seqSize = block.async.seqSize
-				hist.decoders.literals = block.async.literals
-				do.err = block.executeSequences(&hist)
-				hasErr = do.err != nil
-				if debugDecoder && hasErr {
-					println("executeSequences returned:", do.err)
-				}
-				do.b = block.dst
-			}
-			if !hasErr {
-				decodedFrame += uint64(len(do.b))
-				if decodedFrame > fcs {
-					println("fcs exceeded", block.Last, fcs, decodedFrame)
-					do.err = ErrFrameSizeExceeded
-					hasErr = true
-				} else if block.Last && fcs != fcsUnknown && decodedFrame != fcs {
-					do.err = ErrFrameSizeMismatch
-					hasErr = true
-				} else {
-					if debugDecoder {
-						println("fcs ok", block.Last, fcs, decodedFrame)
-					}
-				}
-			}
-			output <- do
-		}
-		close(output)
-		frameHistCache = hist.b
-		wg.Done()
-		if debugDecoder {
-			println("decoder goroutines finished")
-		}
-		hist.reset()
-	}()
-
-	var hist history
-decodeStream:
-	for {
-		var hasErr bool
-		hist.reset()
-		decodeBlock := func(block *blockDec) {
-			if hasErr {
-				if block != nil {
-					seqDecode <- block
-				}
-				return
-			}
-			if block.err != nil || block.Type != blockTypeCompressed {
-				hasErr = block.err != nil
-				seqDecode <- block
-				return
-			}
-
-			remain, err := block.decodeLiterals(block.data, &hist)
-			block.err = err
-			hasErr = block.err != nil
-			if err == nil {
-				block.async.literals = hist.decoders.literals
-				block.async.seqData = remain
-			} else if debugDecoder {
-				println("decodeLiterals error:", err)
-			}
-			seqDecode <- block
-		}
-		frame := d.frame
-		if debugDecoder {
-			println("New frame...")
-		}
-		var historySent bool
-		frame.history.reset()
-		err := frame.reset(&br)
-		if debugDecoder && err != nil {
-			println("Frame decoder returned", err)
-		}
-		if err == nil {
-			err = d.setDict(frame)
-		}
-		if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
-			if debugDecoder {
-				println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize)
-			}
-
-			err = ErrDecoderSizeExceeded
-		}
-		if err != nil {
-			select {
-			case <-ctx.Done():
-			case dec := <-d.decoders:
-				dec.sendErr(err)
-				decodeBlock(dec)
-			}
-			break decodeStream
-		}
-
-		// Go through all blocks of the frame.
-		for {
-			var dec *blockDec
-			select {
-			case <-ctx.Done():
-				break decodeStream
-			case dec = <-d.decoders:
-				// Once we have a decoder, we MUST return it.
-			}
-			err := frame.next(dec)
-			if !historySent {
-				h := frame.history
-				if debugDecoder {
-					println("Alloc History:", h.allocFrameBuffer)
-				}
-				hist.reset()
-				if h.dict != nil {
-					hist.setDict(h.dict)
-				}
-				dec.async.newHist = &h
-				dec.async.fcs = frame.FrameContentSize
-				historySent = true
-			} else {
-				dec.async.newHist = nil
-			}
-			if debugDecoder && err != nil {
-				println("next block returned error:", err)
-			}
-			dec.err = err
-			dec.hasCRC = false
-			if dec.Last && frame.HasCheckSum && err == nil {
-				crc, err := frame.rawInput.readSmall(4)
-				if len(crc) < 4 {
-					if err == nil {
-						err = io.ErrUnexpectedEOF
-
-					}
-					println("CRC missing?", err)
-					dec.err = err
-				} else {
-					dec.checkCRC = binary.LittleEndian.Uint32(crc)
-					dec.hasCRC = true
-					if debugDecoder {
-						printf("found crc to check: %08x\n", dec.checkCRC)
-					}
-				}
-			}
-			err = dec.err
-			last := dec.Last
-			decodeBlock(dec)
-			if err != nil {
-				break decodeStream
-			}
-			if last {
-				break
-			}
-		}
-	}
-	close(seqDecode)
-	wg.Wait()
-	hist.reset()
-	d.frame.history.b = frameHistCache
-}
-
-func (d *Decoder) setDict(frame *frameDec) (err error) {
-	dict, ok := d.dicts[frame.DictionaryID]
-	if ok {
-		if debugDecoder {
-			println("setting dict", frame.DictionaryID)
-		}
-		frame.history.setDict(dict)
-	} else if frame.DictionaryID != 0 {
-		// A zero or missing dictionary id is ambiguous:
-		// either dictionary zero, or no dictionary. In particular,
-		// zstd --patch-from uses this id for the source file,
-		// so only return an error if the dictionary id is not zero.
-		err = ErrUnknownDictionary
-	}
-	return err
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
deleted file mode 100644
index 774c5f00f..000000000
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"math/bits"
-	"runtime"
-)
-
-// DOption is an option for creating a decoder.
-type DOption func(*decoderOptions) error
-
-// options retains accumulated state of multiple options.
-type decoderOptions struct {
-	lowMem          bool
-	concurrent      int
-	maxDecodedSize  uint64
-	maxWindowSize   uint64
-	dicts           []*dict
-	ignoreChecksum  bool
-	limitToCap      bool
-	decodeBufsBelow int
-}
-
-func (o *decoderOptions) setDefault() {
-	*o = decoderOptions{
-		// use less ram: true for now, but may change.
-		lowMem:          true,
-		concurrent:      runtime.GOMAXPROCS(0),
-		maxWindowSize:   MaxWindowSize,
-		decodeBufsBelow: 128 << 10,
-	}
-	if o.concurrent > 4 {
-		o.concurrent = 4
-	}
-	o.maxDecodedSize = 64 << 30
-}
-
-// WithDecoderLowmem will set whether to use a lower amount of memory,
-// but possibly have to allocate more while running.
-func WithDecoderLowmem(b bool) DOption {
-	return func(o *decoderOptions) error { o.lowMem = b; return nil }
-}
-
-// WithDecoderConcurrency sets the number of created decoders.
-// When decoding block with DecodeAll, this will limit the number
-// of possible concurrently running decodes.
-// When decoding streams, this will limit the number of
-// inflight blocks.
-// When decoding streams and setting maximum to 1,
-// no async decoding will be done.
-// When a value of 0 is provided GOMAXPROCS will be used.
-// By default this will be set to 4 or GOMAXPROCS, whatever is lower.
-func WithDecoderConcurrency(n int) DOption {
-	return func(o *decoderOptions) error {
-		if n < 0 {
-			return errors.New("concurrency must be at least 1")
-		}
-		if n == 0 {
-			o.concurrent = runtime.GOMAXPROCS(0)
-		} else {
-			o.concurrent = n
-		}
-		return nil
-	}
-}
-
-// WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
-// non-streaming operations or maximum window size for streaming operations.
-// This can be used to control memory usage of potentially hostile content.
-// Maximum is 1 << 63 bytes. Default is 64GiB.
-func WithDecoderMaxMemory(n uint64) DOption {
-	return func(o *decoderOptions) error {
-		if n == 0 {
-			return errors.New("WithDecoderMaxMemory must be at least 1")
-		}
-		if n > 1<<63 {
-			return errors.New("WithDecoderMaxmemory must be less than 1 << 63")
-		}
-		o.maxDecodedSize = n
-		return nil
-	}
-}
-
-// WithDecoderDicts allows to register one or more dictionaries for the decoder.
-//
-// Each slice in dict must be in the [dictionary format] produced by
-// "zstd --train" from the Zstandard reference implementation.
-//
-// If several dictionaries with the same ID are provided, the last one will be used.
-//
-// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
-func WithDecoderDicts(dicts ...[]byte) DOption {
-	return func(o *decoderOptions) error {
-		for _, b := range dicts {
-			d, err := loadDict(b)
-			if err != nil {
-				return err
-			}
-			o.dicts = append(o.dicts, d)
-		}
-		return nil
-	}
-}
-
-// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
-// The slice content can be arbitrary data.
-func WithDecoderDictRaw(id uint32, content []byte) DOption {
-	return func(o *decoderOptions) error {
-		if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
-			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
-		}
-		o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
-		return nil
-	}
-}
-
-// WithDecoderMaxWindow allows to set a maximum window size for decodes.
-// This allows rejecting packets that will cause big memory usage.
-// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
-// If WithDecoderMaxMemory is set to a lower value, that will be used.
-// Default is 512MB, Maximum is ~3.75 TB as per zstandard spec.
-func WithDecoderMaxWindow(size uint64) DOption {
-	return func(o *decoderOptions) error {
-		if size < MinWindowSize {
-			return errors.New("WithMaxWindowSize must be at least 1KB, 1024 bytes")
-		}
-		if size > (1<<41)+7*(1<<38) {
-			return errors.New("WithMaxWindowSize must be less than (1<<41) + 7*(1<<38) ~ 3.75TB")
-		}
-		o.maxWindowSize = size
-		return nil
-	}
-}
-
-// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes,
-// or any size set in WithDecoderMaxMemory.
-// This can be used to limit decoding to a specific maximum output size.
-// Disabled by default.
-func WithDecodeAllCapLimit(b bool) DOption {
-	return func(o *decoderOptions) error {
-		o.limitToCap = b
-		return nil
-	}
-}
-
-// WithDecodeBuffersBelow will fully decode readers that have a
-// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer.
-// This typically uses less allocations but will have the full decompressed object in memory.
-// Note that DecodeAllCapLimit will disable this, as well as giving a size of 0 or less.
-// Default is 128KiB.
-func WithDecodeBuffersBelow(size int) DOption {
-	return func(o *decoderOptions) error {
-		o.decodeBufsBelow = size
-		return nil
-	}
-}
-
-// IgnoreChecksum allows to forcibly ignore checksum checking.
-func IgnoreChecksum(b bool) DOption {
-	return func(o *decoderOptions) error {
-		o.ignoreChecksum = b
-		return nil
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
deleted file mode 100644
index b7b83164b..000000000
--- a/vendor/github.com/klauspost/compress/zstd/dict.go
+++ /dev/null
@@ -1,565 +0,0 @@
-package zstd
-
-import (
-	"bytes"
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-	"math"
-	"sort"
-
-	"github.com/klauspost/compress/huff0"
-)
-
-type dict struct {
-	id uint32
-
-	litEnc              *huff0.Scratch
-	llDec, ofDec, mlDec sequenceDec
-	offsets             [3]int
-	content             []byte
-}
-
-const dictMagic = "\x37\xa4\x30\xec"
-
-// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
-const dictMaxLength = 1 << 31
-
-// ID returns the dictionary id or 0 if d is nil.
-func (d *dict) ID() uint32 {
-	if d == nil {
-		return 0
-	}
-	return d.id
-}
-
-// ContentSize returns the dictionary content size or 0 if d is nil.
-func (d *dict) ContentSize() int {
-	if d == nil {
-		return 0
-	}
-	return len(d.content)
-}
-
-// Content returns the dictionary content.
-func (d *dict) Content() []byte {
-	if d == nil {
-		return nil
-	}
-	return d.content
-}
-
-// Offsets returns the initial offsets.
-func (d *dict) Offsets() [3]int {
-	if d == nil {
-		return [3]int{}
-	}
-	return d.offsets
-}
-
-// LitEncoder returns the literal encoder.
-func (d *dict) LitEncoder() *huff0.Scratch {
-	if d == nil {
-		return nil
-	}
-	return d.litEnc
-}
-
-// Load a dictionary as described in
-// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
-func loadDict(b []byte) (*dict, error) {
-	// Check static field size.
-	if len(b) <= 8+(3*4) {
-		return nil, io.ErrUnexpectedEOF
-	}
-	d := dict{
-		llDec: sequenceDec{fse: &fseDecoder{}},
-		ofDec: sequenceDec{fse: &fseDecoder{}},
-		mlDec: sequenceDec{fse: &fseDecoder{}},
-	}
-	if string(b[:4]) != dictMagic {
-		return nil, ErrMagicMismatch
-	}
-	d.id = binary.LittleEndian.Uint32(b[4:8])
-	if d.id == 0 {
-		return nil, errors.New("dictionaries cannot have ID 0")
-	}
-
-	// Read literal table
-	var err error
-	d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
-	if err != nil {
-		return nil, fmt.Errorf("loading literal table: %w", err)
-	}
-	d.litEnc.Reuse = huff0.ReusePolicyMust
-
-	br := byteReader{
-		b:   b,
-		off: 0,
-	}
-	readDec := func(i tableIndex, dec *fseDecoder) error {
-		if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
-			return err
-		}
-		if br.overread() {
-			return io.ErrUnexpectedEOF
-		}
-		err = dec.transform(symbolTableX[i])
-		if err != nil {
-			println("Transform table error:", err)
-			return err
-		}
-		if debugDecoder || debugEncoder {
-			println("Read table ok", "symbolLen:", dec.symbolLen)
-		}
-		// Set decoders as predefined so they aren't reused.
-		dec.preDefined = true
-		return nil
-	}
-
-	if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
-		return nil, err
-	}
-	if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
-		return nil, err
-	}
-	if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
-		return nil, err
-	}
-	if br.remain() < 12 {
-		return nil, io.ErrUnexpectedEOF
-	}
-
-	d.offsets[0] = int(br.Uint32())
-	br.advance(4)
-	d.offsets[1] = int(br.Uint32())
-	br.advance(4)
-	d.offsets[2] = int(br.Uint32())
-	br.advance(4)
-	if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
-		return nil, errors.New("invalid offset in dictionary")
-	}
-	d.content = make([]byte, br.remain())
-	copy(d.content, br.unread())
-	if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
-		return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
-	}
-
-	return &d, nil
-}
-
-// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
-func InspectDictionary(b []byte) (interface {
-	ID() uint32
-	ContentSize() int
-	Content() []byte
-	Offsets() [3]int
-	LitEncoder() *huff0.Scratch
-}, error) {
-	initPredefined()
-	d, err := loadDict(b)
-	return d, err
-}
-
-type BuildDictOptions struct {
-	// Dictionary ID.
-	ID uint32
-
-	// Content to use to create dictionary tables.
-	Contents [][]byte
-
-	// History to use for all blocks.
-	History []byte
-
-	// Offsets to use.
-	Offsets [3]int
-
-	// CompatV155 will make the dictionary compatible with Zstd v1.5.5 and earlier.
-	// See https://github.com/facebook/zstd/issues/3724
-	CompatV155 bool
-
-	// Use the specified encoder level.
-	// The dictionary will be built using the specified encoder level,
-	// which will reflect speed and make the dictionary tailored for that level.
-	// If not set SpeedBestCompression will be used.
-	Level EncoderLevel
-
-	// DebugOut will write stats and other details here if set.
-	DebugOut io.Writer
-}
-
-func BuildDict(o BuildDictOptions) ([]byte, error) {
-	initPredefined()
-	hist := o.History
-	contents := o.Contents
-	debug := o.DebugOut != nil
-	println := func(args ...interface{}) {
-		if o.DebugOut != nil {
-			fmt.Fprintln(o.DebugOut, args...)
-		}
-	}
-	printf := func(s string, args ...interface{}) {
-		if o.DebugOut != nil {
-			fmt.Fprintf(o.DebugOut, s, args...)
-		}
-	}
-	print := func(args ...interface{}) {
-		if o.DebugOut != nil {
-			fmt.Fprint(o.DebugOut, args...)
-		}
-	}
-
-	if int64(len(hist)) > dictMaxLength {
-		return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength))
-	}
-	if len(hist) < 8 {
-		return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8)
-	}
-	if len(contents) == 0 {
-		return nil, errors.New("no content provided")
-	}
-	d := dict{
-		id:      o.ID,
-		litEnc:  nil,
-		llDec:   sequenceDec{},
-		ofDec:   sequenceDec{},
-		mlDec:   sequenceDec{},
-		offsets: o.Offsets,
-		content: hist,
-	}
-	block := blockEnc{lowMem: false}
-	block.init()
-	enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}})
-	if o.Level != 0 {
-		eOpts := encoderOptions{
-			level:      o.Level,
-			blockSize:  maxMatchLen,
-			windowSize: maxMatchLen,
-			dict:       &d,
-			lowMem:     false,
-		}
-		enc = eOpts.encoder()
-	} else {
-		o.Level = SpeedBestCompression
-	}
-	var (
-		remain [256]int
-		ll     [256]int
-		ml     [256]int
-		of     [256]int
-	)
-	addValues := func(dst *[256]int, src []byte) {
-		for _, v := range src {
-			dst[v]++
-		}
-	}
-	addHist := func(dst *[256]int, src *[256]uint32) {
-		for i, v := range src {
-			dst[i] += int(v)
-		}
-	}
-	seqs := 0
-	nUsed := 0
-	litTotal := 0
-	newOffsets := make(map[uint32]int, 1000)
-	for _, b := range contents {
-		block.reset(nil)
-		if len(b) < 8 {
-			continue
-		}
-		nUsed++
-		enc.Reset(&d, true)
-		enc.Encode(&block, b)
-		addValues(&remain, block.literals)
-		litTotal += len(block.literals)
-		if len(block.sequences) == 0 {
-			continue
-		}
-		seqs += len(block.sequences)
-		block.genCodes()
-		addHist(&ll, block.coders.llEnc.Histogram())
-		addHist(&ml, block.coders.mlEnc.Histogram())
-		addHist(&of, block.coders.ofEnc.Histogram())
-		for i, seq := range block.sequences {
-			if i > 3 {
-				break
-			}
-			offset := seq.offset
-			if offset == 0 {
-				continue
-			}
-			if int(offset) >= len(o.History) {
-				continue
-			}
-			if offset > 3 {
-				newOffsets[offset-3]++
-			} else {
-				newOffsets[uint32(o.Offsets[offset-1])]++
-			}
-		}
-	}
-	// Find most used offsets.
-	var sortedOffsets []uint32
-	for k := range newOffsets {
-		sortedOffsets = append(sortedOffsets, k)
-	}
-	sort.Slice(sortedOffsets, func(i, j int) bool {
-		a, b := sortedOffsets[i], sortedOffsets[j]
-		if a == b {
-			// Prefer the longer offset
-			return sortedOffsets[i] > sortedOffsets[j]
-		}
-		return newOffsets[sortedOffsets[i]] > newOffsets[sortedOffsets[j]]
-	})
-	if len(sortedOffsets) > 3 {
-		if debug {
-			print("Offsets:")
-			for i, v := range sortedOffsets {
-				if i > 20 {
-					break
-				}
-				printf("[%d: %d],", v, newOffsets[v])
-			}
-			println("")
-		}
-
-		sortedOffsets = sortedOffsets[:3]
-	}
-	for i, v := range sortedOffsets {
-		o.Offsets[i] = int(v)
-	}
-	if debug {
-		println("New repeat offsets", o.Offsets)
-	}
-
-	if nUsed == 0 || seqs == 0 {
-		return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs)
-	}
-	if debug {
-		println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal)
-	}
-	if seqs/nUsed < 512 {
-		// Use 512 as minimum.
-		nUsed = seqs / 512
-		if nUsed == 0 {
-			nUsed = 1
-		}
-	}
-	copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) {
-		hist := dst.Histogram()
-		var maxSym uint8
-		var maxCount int
-		var fakeLength int
-		for i, v := range src {
-			if v > 0 {
-				v = v / nUsed
-				if v == 0 {
-					v = 1
-				}
-			}
-			if v > maxCount {
-				maxCount = v
-			}
-			if v != 0 {
-				maxSym = uint8(i)
-			}
-			fakeLength += v
-			hist[i] = uint32(v)
-		}
-
-		// Ensure we aren't trying to represent RLE.
-		if maxCount == fakeLength {
-			for i := range hist {
-				if uint8(i) == maxSym {
-					fakeLength++
-					maxSym++
-					hist[i+1] = 1
-					if maxSym > 1 {
-						break
-					}
-				}
-				if hist[0] == 0 {
-					fakeLength++
-					hist[i] = 1
-					if maxSym > 1 {
-						break
-					}
-				}
-			}
-		}
-
-		dst.HistogramFinished(maxSym, maxCount)
-		dst.reUsed = false
-		dst.useRLE = false
-		err := dst.normalizeCount(fakeLength)
-		if err != nil {
-			return nil, err
-		}
-		if debug {
-			println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength)
-		}
-		return dst.writeCount(nil)
-	}
-	if debug {
-		print("Literal lengths: ")
-	}
-	llTable, err := copyHist(block.coders.llEnc, &ll)
-	if err != nil {
-		return nil, err
-	}
-	if debug {
-		print("Match lengths: ")
-	}
-	mlTable, err := copyHist(block.coders.mlEnc, &ml)
-	if err != nil {
-		return nil, err
-	}
-	if debug {
-		print("Offsets: ")
-	}
-	ofTable, err := copyHist(block.coders.ofEnc, &of)
-	if err != nil {
-		return nil, err
-	}
-
-	// Literal table
-	avgSize := litTotal
-	if avgSize > huff0.BlockSizeMax/2 {
-		avgSize = huff0.BlockSizeMax / 2
-	}
-	huffBuff := make([]byte, 0, avgSize)
-	// Target size
-	div := litTotal / avgSize
-	if div < 1 {
-		div = 1
-	}
-	if debug {
-		println("Huffman weights:")
-	}
-	for i, n := range remain[:] {
-		if n > 0 {
-			n = n / div
-			// Allow all entries to be represented.
-			if n == 0 {
-				n = 1
-			}
-			huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
-			if debug {
-				printf("[%d: %d], ", i, n)
-			}
-		}
-	}
-	if o.CompatV155 && remain[255]/div == 0 {
-		huffBuff = append(huffBuff, 255)
-	}
-	scratch := &huff0.Scratch{TableLog: 11}
-	for tries := 0; tries < 255; tries++ {
-		scratch = &huff0.Scratch{TableLog: 11}
-		_, _, err = huff0.Compress1X(huffBuff, scratch)
-		if err == nil {
-			break
-		}
-		if debug {
-			printf("Try %d: Huffman error: %v\n", tries+1, err)
-		}
-		huffBuff = huffBuff[:0]
-		if tries == 250 {
-			if debug {
-				println("Huffman: Bailing out with predefined table")
-			}
-
-			// Bail out.... Just generate something
-			huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...)
-			for i := 0; i < 128; i++ {
-				huffBuff = append(huffBuff, byte(i))
-			}
-			continue
-		}
-		if errors.Is(err, huff0.ErrIncompressible) {
-			// Try truncating least common.
-			for i, n := range remain[:] {
-				if n > 0 {
-					n = n / (div * (i + 1))
-					if n > 0 {
-						huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
-					}
-				}
-			}
-			if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 {
-				huffBuff = append(huffBuff, 255)
-			}
-			if len(huffBuff) == 0 {
-				huffBuff = append(huffBuff, 0, 255)
-			}
-		}
-		if errors.Is(err, huff0.ErrUseRLE) {
-			for i, n := range remain[:] {
-				n = n / (div * (i + 1))
-				// Allow all entries to be represented.
-				if n == 0 {
-					n = 1
-				}
-				huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
-			}
-		}
-	}
-
-	var out bytes.Buffer
-	out.Write([]byte(dictMagic))
-	out.Write(binary.LittleEndian.AppendUint32(nil, o.ID))
-	out.Write(scratch.OutTable)
-	if debug {
-		println("huff table:", len(scratch.OutTable), "bytes")
-		println("of table:", len(ofTable), "bytes")
-		println("ml table:", len(mlTable), "bytes")
-		println("ll table:", len(llTable), "bytes")
-	}
-	out.Write(ofTable)
-	out.Write(mlTable)
-	out.Write(llTable)
-	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0])))
-	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1])))
-	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2])))
-	out.Write(hist)
-	if debug {
-		_, err := loadDict(out.Bytes())
-		if err != nil {
-			panic(err)
-		}
-		i, err := InspectDictionary(out.Bytes())
-		if err != nil {
-			panic(err)
-		}
-		println("ID:", i.ID())
-		println("Content size:", i.ContentSize())
-		println("Encoder:", i.LitEncoder() != nil)
-		println("Offsets:", i.Offsets())
-		var totalSize int
-		for _, b := range contents {
-			totalSize += len(b)
-		}
-
-		encWith := func(opts ...EOption) int {
-			enc, err := NewWriter(nil, opts...)
-			if err != nil {
-				panic(err)
-			}
-			defer enc.Close()
-			var dst []byte
-			var totalSize int
-			for _, b := range contents {
-				dst = enc.EncodeAll(b, dst[:0])
-				totalSize += len(dst)
-			}
-			return totalSize
-		}
-		plain := encWith(WithEncoderLevel(o.Level))
-		withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes()))
-		println("Input size:", totalSize)
-		println("Plain Compressed:", plain)
-		println("Dict Compressed:", withDict)
-		println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)")
-	}
-	return out.Bytes(), nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
deleted file mode 100644
index 7d250c67f..000000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ /dev/null
@@ -1,173 +0,0 @@
-package zstd
-
-import (
-	"fmt"
-	"math/bits"
-
-	"github.com/klauspost/compress/zstd/internal/xxhash"
-)
-
-const (
-	dictShardBits = 6
-)
-
-type fastBase struct {
-	// cur is the offset at the start of hist
-	cur int32
-	// maximum offset. Should be at least 2x block size.
-	maxMatchOff int32
-	bufferReset int32
-	hist        []byte
-	crc         *xxhash.Digest
-	tmp         [8]byte
-	blk         *blockEnc
-	lastDictID  uint32
-	lowMem      bool
-}
-
-// CRC returns the underlying CRC writer.
-func (e *fastBase) CRC() *xxhash.Digest {
-	return e.crc
-}
-
-// AppendCRC will append the CRC to the destination slice and return it.
-func (e *fastBase) AppendCRC(dst []byte) []byte {
-	crc := e.crc.Sum(e.tmp[:0])
-	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
-	return dst
-}
-
-// WindowSize returns the window size of the encoder,
-// or a window size small enough to contain the input size, if > 0.
-func (e *fastBase) WindowSize(size int64) int32 {
-	if size > 0 && size < int64(e.maxMatchOff) {
-		b := int32(1) << uint(bits.Len(uint(size)))
-		// Keep minimum window.
-		if b < 1024 {
-			b = 1024
-		}
-		return b
-	}
-	return e.maxMatchOff
-}
-
-// Block returns the current block.
-func (e *fastBase) Block() *blockEnc {
-	return e.blk
-}
-
-func (e *fastBase) addBlock(src []byte) int32 {
-	if debugAsserts && e.cur > e.bufferReset {
-		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
-	}
-	// check if we have space already
-	if len(e.hist)+len(src) > cap(e.hist) {
-		if cap(e.hist) == 0 {
-			e.ensureHist(len(src))
-		} else {
-			if cap(e.hist) < int(e.maxMatchOff+maxCompressedBlockSize) {
-				panic(fmt.Errorf("unexpected buffer cap %d, want at least %d with window %d", cap(e.hist), e.maxMatchOff+maxCompressedBlockSize, e.maxMatchOff))
-			}
-			// Move down
-			offset := int32(len(e.hist)) - e.maxMatchOff
-			copy(e.hist[0:e.maxMatchOff], e.hist[offset:])
-			e.cur += offset
-			e.hist = e.hist[:e.maxMatchOff]
-		}
-	}
-	s := int32(len(e.hist))
-	e.hist = append(e.hist, src...)
-	return s
-}
-
-// ensureHist will ensure that history can keep at least this many bytes.
-func (e *fastBase) ensureHist(n int) {
-	if cap(e.hist) >= n {
-		return
-	}
-	l := e.maxMatchOff
-	if (e.lowMem && e.maxMatchOff > maxCompressedBlockSize) || e.maxMatchOff <= maxCompressedBlockSize {
-		l += maxCompressedBlockSize
-	} else {
-		l += e.maxMatchOff
-	}
-	// Make it at least 1MB.
-	if l < 1<<20 && !e.lowMem {
-		l = 1 << 20
-	}
-	// Make it at least the requested size.
-	if l < int32(n) {
-		l = int32(n)
-	}
-	e.hist = make([]byte, 0, l)
-}
-
-// useBlock will replace the block with the provided one,
-// but transfer recent offsets from the previous.
-func (e *fastBase) UseBlock(enc *blockEnc) {
-	enc.reset(e.blk)
-	e.blk = enc
-}
-
-func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
-	if debugAsserts {
-		if s < 0 {
-			err := fmt.Sprintf("s (%d) < 0", s)
-			panic(err)
-		}
-		if t < 0 {
-			err := fmt.Sprintf("t (%d) < 0", t)
-			panic(err)
-		}
-		if s-t > e.maxMatchOff {
-			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
-			panic(err)
-		}
-		if len(src)-int(s) > maxCompressedBlockSize {
-			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
-		}
-	}
-	return int32(matchLen(src[s:], src[t:]))
-}
-
-// Reset the encoding table.
-func (e *fastBase) resetBase(d *dict, singleBlock bool) {
-	if e.blk == nil {
-		e.blk = &blockEnc{lowMem: e.lowMem}
-		e.blk.init()
-	} else {
-		e.blk.reset(nil)
-	}
-	e.blk.initNewEncode()
-	if e.crc == nil {
-		e.crc = xxhash.New()
-	} else {
-		e.crc.Reset()
-	}
-	e.blk.dictLitEnc = nil
-	if d != nil {
-		low := e.lowMem
-		if singleBlock {
-			e.lowMem = true
-		}
-		e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
-		e.lowMem = low
-	}
-
-	// We offset current position so everything will be out of reach.
-	// If above reset line, history will be purged.
-	if e.cur < e.bufferReset {
-		e.cur += e.maxMatchOff + int32(len(e.hist))
-	}
-	e.hist = e.hist[:0]
-	if d != nil {
-		// Set offsets (currently not used)
-		for i, off := range d.offsets {
-			e.blk.recentOffsets[i] = uint32(off)
-			e.blk.prevRecentOffsets[i] = e.blk.recentOffsets[i]
-		}
-		// Transfer litenc.
-		e.blk.dictLitEnc = d.litEnc
-		e.hist = append(e.hist, d.content...)
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
deleted file mode 100644
index 4613724e9..000000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_best.go
+++ /dev/null
@@ -1,560 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"bytes"
-	"fmt"
-
-	"github.com/klauspost/compress"
-)
-
-const (
-	bestLongTableBits = 22                     // Bits used in the long match table
-	bestLongTableSize = 1 << bestLongTableBits // Size of the table
-	bestLongLen       = 8                      // Bytes used for table hash
-
-	// Note: Increasing the short table bits or making the hash shorter
-	// can actually lead to compression degradation since it will 'steal' more from the
-	// long match table and match offsets are quite big.
-	// This greatly depends on the type of input.
-	bestShortTableBits = 18                      // Bits used in the short match table
-	bestShortTableSize = 1 << bestShortTableBits // Size of the table
-	bestShortLen       = 4                       // Bytes used for table hash
-
-)
-
-type match struct {
-	offset int32
-	s      int32
-	length int32
-	rep    int32
-	est    int32
-}
-
-const highScore = maxMatchLen * 8
-
-// estBits will estimate output bits from predefined tables.
-func (m *match) estBits(bitsPerByte int32) {
-	mlc := mlCode(uint32(m.length - zstdMinMatch))
-	var ofc uint8
-	if m.rep < 0 {
-		ofc = ofCode(uint32(m.s-m.offset) + 3)
-	} else {
-		ofc = ofCode(uint32(m.rep) & 3)
-	}
-	// Cost, excluding
-	ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
-
-	// Add cost of match encoding...
-	m.est = int32(ofTT.outBits + mlTT.outBits)
-	m.est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16)
-	// Subtract savings compared to literal encoding...
-	m.est -= (m.length * bitsPerByte) >> 10
-	if m.est > 0 {
-		// Unlikely gain..
-		m.length = 0
-		m.est = highScore
-	}
-}
-
-// bestFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
-// The long match table contains the previous entry with the same hash,
-// effectively making it a "chain" of length 2.
-// When we find a long match we choose between the two values and select the longest.
-// When we find a short match, after checking the long, we check if we can find a long at n+1
-// and that it is longer (lazy matching).
-type bestFastEncoder struct {
-	fastBase
-	table         [bestShortTableSize]prevEntry
-	longTable     [bestLongTableSize]prevEntry
-	dictTable     []prevEntry
-	dictLongTable []prevEntry
-}
-
-// Encode improves compression...
-func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
-	const (
-		// Input margin is the number of bytes we read (8)
-		// and the maximum we will read ahead (2)
-		inputMargin            = 8 + 4
-		minNonLiteralBlockSize = 16
-	)
-
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			e.table = [bestShortTableSize]prevEntry{}
-			e.longTable = [bestLongTableSize]prevEntry{}
-			e.cur = e.maxMatchOff
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			v2 := e.table[i].prev
-			if v < minOff {
-				v = 0
-				v2 = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-				if v2 < minOff {
-					v2 = 0
-				} else {
-					v2 = v2 - e.cur + e.maxMatchOff
-				}
-			}
-			e.table[i] = prevEntry{
-				offset: v,
-				prev:   v2,
-			}
-		}
-		for i := range e.longTable[:] {
-			v := e.longTable[i].offset
-			v2 := e.longTable[i].prev
-			if v < minOff {
-				v = 0
-				v2 = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-				if v2 < minOff {
-					v2 = 0
-				} else {
-					v2 = v2 - e.cur + e.maxMatchOff
-				}
-			}
-			e.longTable[i] = prevEntry{
-				offset: v,
-				prev:   v2,
-			}
-		}
-		e.cur = e.maxMatchOff
-		break
-	}
-
-	// Add block to history
-	s := e.addBlock(src)
-	blk.size = len(src)
-
-	// Check RLE first
-	if len(src) > zstdMinMatch {
-		ml := matchLen(src[1:], src)
-		if ml == len(src)-1 {
-			blk.literals = append(blk.literals, src[0])
-			blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3})
-			return
-		}
-	}
-
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Use this to estimate literal cost.
-	// Scaled by 10 bits.
-	bitsPerByte := int32((compress.ShannonEntropyBits(src) * 1024) / len(src))
-	// Huffman can never go < 1 bit/byte
-	if bitsPerByte < 1024 {
-		bitsPerByte = 1024
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	const kSearchStrength = 10
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-	offset3 := int32(blk.recentOffsets[2])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		// We allow the encoder to optionally turn off repeat offsets across blocks
-		canRepeat := len(blk.sequences) > 2
-
-		if debugAsserts && canRepeat && offset1 == 0 {
-			panic("offset0 was 0")
-		}
-
-		const goodEnough = 250
-
-		cv := load6432(src, s)
-
-		nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
-		nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
-		candidateL := e.longTable[nextHashL]
-		candidateS := e.table[nextHashS]
-
-		// Set m to a match at offset if it looks like that will improve compression.
-		improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
-			delta := s - offset
-			if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first {
-				return
-			}
-			// Try to quick reject if we already have a long match.
-			if m.length > 16 {
-				left := len(src) - int(m.s+m.length)
-				// If we are too close to the end, keep as is.
-				if left <= 0 {
-					return
-				}
-				checkLen := m.length - (s - m.s) - 8
-				if left > 2 && checkLen > 4 {
-					// Check 4 bytes, 4 bytes from the end of the current match.
-					a := load3232(src, offset+checkLen)
-					b := load3232(src, s+checkLen)
-					if a != b {
-						return
-					}
-				}
-			}
-			l := 4 + e.matchlen(s+4, offset+4, src)
-			if m.rep <= 0 {
-				// Extend candidate match backwards as far as possible.
-				// Do not extend repeats as we can assume they are optimal
-				// and offsets change if s == nextEmit.
-				tMin := s - e.maxMatchOff
-				if tMin < 0 {
-					tMin = 0
-				}
-				for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxMatchLength {
-					s--
-					offset--
-					l++
-				}
-			}
-			if debugAsserts {
-				if offset >= s {
-					panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff))
-				}
-				if !bytes.Equal(src[s:s+l], src[offset:offset+l]) {
-					panic(fmt.Sprintf("second match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
-				}
-			}
-			cand := match{offset: offset, s: s, length: l, rep: rep}
-			cand.estBits(bitsPerByte)
-			if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
-				*m = cand
-			}
-		}
-
-		best := match{s: s, est: highScore}
-		improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
-		improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
-		improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
-		improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
-
-		if canRepeat && best.length < goodEnough {
-			if s == nextEmit {
-				// Check repeats straight after a match.
-				improve(&best, s-offset2, s, uint32(cv), 1|4)
-				improve(&best, s-offset3, s, uint32(cv), 2|4)
-				if offset1 > 1 {
-					improve(&best, s-(offset1-1), s, uint32(cv), 3|4)
-				}
-			}
-
-			// If either no match or a non-repeat match, check at + 1
-			if best.rep <= 0 {
-				cv32 := uint32(cv >> 8)
-				spp := s + 1
-				improve(&best, spp-offset1, spp, cv32, 1)
-				improve(&best, spp-offset2, spp, cv32, 2)
-				improve(&best, spp-offset3, spp, cv32, 3)
-				if best.rep < 0 {
-					cv32 = uint32(cv >> 24)
-					spp += 2
-					improve(&best, spp-offset1, spp, cv32, 1)
-					improve(&best, spp-offset2, spp, cv32, 2)
-					improve(&best, spp-offset3, spp, cv32, 3)
-				}
-			}
-		}
-		// Load next and check...
-		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
-		e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
-		index0 := s + 1
-
-		// Look far ahead, unless we have a really long match already...
-		if best.length < goodEnough {
-			// No match found, move forward on input, no need to check forward...
-			if best.length < 4 {
-				s += 1 + (s-nextEmit)>>(kSearchStrength-1)
-				if s >= sLimit {
-					break encodeLoop
-				}
-				continue
-			}
-
-			candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
-			cv = load6432(src, s+1)
-			cv2 := load6432(src, s+2)
-			candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)]
-			candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
-
-			// Short at s+1
-			improve(&best, candidateS.offset-e.cur, s+1, uint32(cv), -1)
-			// Long at s+1, s+2
-			improve(&best, candidateL.offset-e.cur, s+1, uint32(cv), -1)
-			improve(&best, candidateL.prev-e.cur, s+1, uint32(cv), -1)
-			improve(&best, candidateL2.offset-e.cur, s+2, uint32(cv2), -1)
-			improve(&best, candidateL2.prev-e.cur, s+2, uint32(cv2), -1)
-			if false {
-				// Short at s+3.
-				// Too often worse...
-				improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+3, uint32(cv2>>8), -1)
-			}
-
-			// Start check at a fixed offset to allow for a few mismatches.
-			// For this compression level 2 yields the best results.
-			// We cannot do this if we have already indexed this position.
-			const skipBeginning = 2
-			if best.s > s-skipBeginning {
-				// See if we can find a better match by checking where the current best ends.
-				// Use that offset to see if we can find a better full match.
-				if sAt := best.s + best.length; sAt < sLimit {
-					nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
-					candidateEnd := e.longTable[nextHashL]
-
-					if off := candidateEnd.offset - e.cur - best.length + skipBeginning; off >= 0 {
-						improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-						if off := candidateEnd.prev - e.cur - best.length + skipBeginning; off >= 0 {
-							improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-						}
-					}
-				}
-			}
-		}
-
-		if debugAsserts {
-			if best.offset >= best.s {
-				panic(fmt.Sprintf("best.offset > s: %d >= %d", best.offset, best.s))
-			}
-			if best.s < nextEmit {
-				panic(fmt.Sprintf("s %d < nextEmit %d", best.s, nextEmit))
-			}
-			if best.offset < s-e.maxMatchOff {
-				panic(fmt.Sprintf("best.offset < s-e.maxMatchOff: %d < %d", best.offset, s-e.maxMatchOff))
-			}
-			if !bytes.Equal(src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]) {
-				panic(fmt.Sprintf("match mismatch: %v != %v", src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]))
-			}
-		}
-
-		// We have a match, we can store the forward value
-		s = best.s
-		if best.rep > 0 {
-			var seq seq
-			seq.matchLen = uint32(best.length - zstdMinMatch)
-			addLiterals(&seq, best.s)
-
-			// Repeat. If bit 4 is set, this is a non-lit repeat.
-			seq.offset = uint32(best.rep & 3)
-			if debugSequences {
-				println("repeat sequence", seq, "next s:", best.s, "off:", best.s-best.offset)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Index old s + 1 -> s - 1
-			s = best.s + best.length
-			nextEmit = s
-
-			// Index skipped...
-			end := s
-			if s > sLimit+4 {
-				end = sLimit + 4
-			}
-			off := index0 + e.cur
-			for index0 < end {
-				cv0 := load6432(src, index0)
-				h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
-				h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
-				e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-				e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
-				off++
-				index0++
-			}
-
-			switch best.rep {
-			case 2, 4 | 1:
-				offset1, offset2 = offset2, offset1
-			case 3, 4 | 2:
-				offset1, offset2, offset3 = offset3, offset1, offset2
-			case 4 | 3:
-				offset1, offset2, offset3 = offset1-1, offset1, offset2
-			}
-			if s >= sLimit {
-				if debugEncoder {
-					println("repeat ended", s, best.length)
-				}
-				break encodeLoop
-			}
-			continue
-		}
-
-		// A 4-byte match has been found. Update recent offsets.
-		// We'll later see if more than 4 bytes.
-		t := best.offset
-		offset1, offset2, offset3 = s-t, offset1, offset2
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Write our sequence
-		var seq seq
-		l := best.length
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-
-		// Index old s + 1 -> s - 1 or sLimit
-		end := s
-		if s > sLimit-4 {
-			end = sLimit - 4
-		}
-
-		off := index0 + e.cur
-		for index0 < end {
-			cv0 := load6432(src, index0)
-			h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
-			h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
-			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-			e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
-			index0++
-			off++
-		}
-		if s >= sLimit {
-			break encodeLoop
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	blk.recentOffsets[2] = uint32(offset3)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-}
-
-// EncodeNoHist will encode a block with no history and no following blocks.
-// Most notable difference is that src will not be copied for history and
-// we do not need to check for max match length.
-func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
-	e.ensureHist(len(src))
-	e.Encode(blk, src)
-}
-
-// Reset will reset and set a dictionary if not nil
-func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) {
-	e.resetBase(d, singleBlock)
-	if d == nil {
-		return
-	}
-	// Init or copy dict table
-	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
-		if len(e.dictTable) != len(e.table) {
-			e.dictTable = make([]prevEntry, len(e.table))
-		}
-		end := int32(len(d.content)) - 8 + e.maxMatchOff
-		for i := e.maxMatchOff; i < end; i += 4 {
-			const hashLog = bestShortTableBits
-
-			cv := load6432(d.content, i-e.maxMatchOff)
-			nextHash := hashLen(cv, hashLog, bestShortLen)      // 0 -> 4
-			nextHash1 := hashLen(cv>>8, hashLog, bestShortLen)  // 1 -> 5
-			nextHash2 := hashLen(cv>>16, hashLog, bestShortLen) // 2 -> 6
-			nextHash3 := hashLen(cv>>24, hashLog, bestShortLen) // 3 -> 7
-			e.dictTable[nextHash] = prevEntry{
-				prev:   e.dictTable[nextHash].offset,
-				offset: i,
-			}
-			e.dictTable[nextHash1] = prevEntry{
-				prev:   e.dictTable[nextHash1].offset,
-				offset: i + 1,
-			}
-			e.dictTable[nextHash2] = prevEntry{
-				prev:   e.dictTable[nextHash2].offset,
-				offset: i + 2,
-			}
-			e.dictTable[nextHash3] = prevEntry{
-				prev:   e.dictTable[nextHash3].offset,
-				offset: i + 3,
-			}
-		}
-		e.lastDictID = d.id
-	}
-
-	// Init or copy dict table
-	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
-		if len(e.dictLongTable) != len(e.longTable) {
-			e.dictLongTable = make([]prevEntry, len(e.longTable))
-		}
-		if len(d.content) >= 8 {
-			cv := load6432(d.content, 0)
-			h := hashLen(cv, bestLongTableBits, bestLongLen)
-			e.dictLongTable[h] = prevEntry{
-				offset: e.maxMatchOff,
-				prev:   e.dictLongTable[h].offset,
-			}
-
-			end := int32(len(d.content)) - 8 + e.maxMatchOff
-			off := 8 // First to read
-			for i := e.maxMatchOff + 1; i < end; i++ {
-				cv = cv>>8 | (uint64(d.content[off]) << 56)
-				h := hashLen(cv, bestLongTableBits, bestLongLen)
-				e.dictLongTable[h] = prevEntry{
-					offset: i,
-					prev:   e.dictLongTable[h].offset,
-				}
-				off++
-			}
-		}
-		e.lastDictID = d.id
-	}
-	// Reset table to initial state
-	copy(e.longTable[:], e.dictLongTable)
-
-	e.cur = e.maxMatchOff
-	// Reset table to initial state
-	copy(e.table[:], e.dictTable)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
deleted file mode 100644
index 84a79fde7..000000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ /dev/null
@@ -1,1252 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import "fmt"
-
-const (
-	betterLongTableBits = 19                       // Bits used in the long match table
-	betterLongTableSize = 1 << betterLongTableBits // Size of the table
-	betterLongLen       = 8                        // Bytes used for table hash
-
-	// Note: Increasing the short table bits or making the hash shorter
-	// can actually lead to compression degradation since it will 'steal' more from the
-	// long match table and match offsets are quite big.
-	// This greatly depends on the type of input.
-	betterShortTableBits = 13                        // Bits used in the short match table
-	betterShortTableSize = 1 << betterShortTableBits // Size of the table
-	betterShortLen       = 5                         // Bytes used for table hash
-
-	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
-	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
-
-	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
-	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
-)
-
-type prevEntry struct {
-	offset int32
-	prev   int32
-}
-
-// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
-// The long match table contains the previous entry with the same hash,
-// effectively making it a "chain" of length 2.
-// When we find a long match we choose between the two values and select the longest.
-// When we find a short match, after checking the long, we check if we can find a long at n+1
-// and that it is longer (lazy matching).
-type betterFastEncoder struct {
-	fastBase
-	table     [betterShortTableSize]tableEntry
-	longTable [betterLongTableSize]prevEntry
-}
-
-type betterFastEncoderDict struct {
-	betterFastEncoder
-	dictTable            []tableEntry
-	dictLongTable        []prevEntry
-	shortTableShardDirty [betterShortTableShardCnt]bool
-	longTableShardDirty  [betterLongTableShardCnt]bool
-	allDirty             bool
-}
-
-// Encode improves compression...
-func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
-	const (
-		// Input margin is the number of bytes we read (8)
-		// and the maximum we will read ahead (2)
-		inputMargin            = 8 + 2
-		minNonLiteralBlockSize = 16
-	)
-
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			e.table = [betterShortTableSize]tableEntry{}
-			e.longTable = [betterLongTableSize]prevEntry{}
-			e.cur = e.maxMatchOff
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.table[i].offset = v
-		}
-		for i := range e.longTable[:] {
-			v := e.longTable[i].offset
-			v2 := e.longTable[i].prev
-			if v < minOff {
-				v = 0
-				v2 = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-				if v2 < minOff {
-					v2 = 0
-				} else {
-					v2 = v2 - e.cur + e.maxMatchOff
-				}
-			}
-			e.longTable[i] = prevEntry{
-				offset: v,
-				prev:   v2,
-			}
-		}
-		e.cur = e.maxMatchOff
-		break
-	}
-	// Add block to history
-	s := e.addBlock(src)
-	blk.size = len(src)
-
-	// Check RLE first
-	if len(src) > zstdMinMatch {
-		ml := matchLen(src[1:], src)
-		if ml == len(src)-1 {
-			blk.literals = append(blk.literals, src[0])
-			blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3})
-			return
-		}
-	}
-
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 1.
-	const stepSize = 1
-
-	const kSearchStrength = 9
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		var t int32
-		// We allow the encoder to optionally turn off repeat offsets across blocks
-		canRepeat := len(blk.sequences) > 2
-		var matched, index0 int32
-
-		for {
-			if debugAsserts && canRepeat && offset1 == 0 {
-				panic("offset0 was 0")
-			}
-
-			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
-			candidateL := e.longTable[nextHashL]
-			candidateS := e.table[nextHashS]
-
-			const repOff = 1
-			repIndex := s - offset1 + repOff
-			off := s + e.cur
-			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
-			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
-			index0 = s + 1
-
-			if canRepeat {
-				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
-					// Consider history as well.
-					var seq seq
-					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 0
-					seq.offset = 1
-					if debugSequences {
-						println("repeat sequence", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-
-					// Index match start+1 (long) -> s - 1
-					index0 := s + repOff
-					s += length + repOff
-
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-					// Index skipped...
-					for index0 < s-1 {
-						cv0 := load6432(src, index0)
-						cv1 := cv0 >> 8
-						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-						off := index0 + e.cur
-						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
-						index0 += 2
-					}
-					cv = load6432(src, s)
-					continue
-				}
-				const repOff2 = 1
-
-				// We deviate from the reference encoder and also check offset 2.
-				// Still slower and not much better, so disabled.
-				// repIndex = s - offset2 + repOff2
-				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
-					// Consider history as well.
-					var seq seq
-					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff2
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 2
-					seq.offset = 2
-					if debugSequences {
-						println("repeat sequence 2", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-
-					s += length + repOff2
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-
-					// Index skipped...
-					for index0 < s-1 {
-						cv0 := load6432(src, index0)
-						cv1 := cv0 >> 8
-						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-						off := index0 + e.cur
-						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
-						index0 += 2
-					}
-					cv = load6432(src, s)
-					// Swap offsets
-					offset1, offset2 = offset2, offset1
-					continue
-				}
-			}
-			// Find the offsets of our two matches.
-			coffsetL := candidateL.offset - e.cur
-			coffsetLP := candidateL.prev - e.cur
-
-			// Check if we have a long match.
-			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
-				// Found a long match, at least 8 bytes.
-				matched = e.matchlen(s+8, coffsetL+8, src) + 8
-				t = coffsetL
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-
-				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
-					// Found a long match, at least 8 bytes.
-					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
-					if prevMatch > matched {
-						matched = prevMatch
-						t = coffsetLP
-					}
-					if debugAsserts && s <= t {
-						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-					}
-					if debugAsserts && s-t > e.maxMatchOff {
-						panic("s - t >e.maxMatchOff")
-					}
-					if debugMatches {
-						println("long match")
-					}
-				}
-				break
-			}
-
-			// Check if we have a long match on prev.
-			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
-				// Found a long match, at least 8 bytes.
-				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
-				t = coffsetLP
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-				break
-			}
-
-			coffsetS := candidateS.offset - e.cur
-
-			// Check if we have a short match.
-			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
-				// found a regular match
-				matched = e.matchlen(s+4, coffsetS+4, src) + 4
-
-				// See if we can find a long match at s+1
-				const checkAt = 1
-				cv := load6432(src, s+checkAt)
-				nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
-				candidateL = e.longTable[nextHashL]
-				coffsetL = candidateL.offset - e.cur
-
-				// We can store it, since we have at least a 4 byte match.
-				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
-				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
-					// Found a long match, at least 8 bytes.
-					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
-					if matchedNext > matched {
-						t = coffsetL
-						s += checkAt
-						matched = matchedNext
-						if debugMatches {
-							println("long match (after short)")
-						}
-						break
-					}
-				}
-
-				// Check prev long...
-				coffsetL = candidateL.prev - e.cur
-				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
-					// Found a long match, at least 8 bytes.
-					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
-					if matchedNext > matched {
-						t = coffsetL
-						s += checkAt
-						matched = matchedNext
-						if debugMatches {
-							println("prev long match (after short)")
-						}
-						break
-					}
-				}
-				t = coffsetS
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				if debugMatches {
-					println("short match")
-				}
-				break
-			}
-
-			// No match found, move forward in input.
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-
-		// Try to find a better match by searching for a long match at the end of the current best match
-		if s+matched < sLimit {
-			// Allow some bytes at the beginning to mismatch.
-			// Sweet spot is around 3 bytes, but depends on input.
-			// The skipped bytes are tested in Extend backwards,
-			// and still picked up as part of the match if they do.
-			const skipBeginning = 3
-
-			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
-			s2 := s + skipBeginning
-			cv := load3232(src, s2)
-			candidateL := e.longTable[nextHashL]
-			coffsetL := candidateL.offset - e.cur - matched + skipBeginning
-			if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
-				// Found a long match, at least 4 bytes.
-				matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
-				if matchedNext > matched {
-					t = coffsetL
-					s = s2
-					matched = matchedNext
-					if debugMatches {
-						println("long match at end-of-match")
-					}
-				}
-			}
-
-			// Check prev long...
-			if true {
-				coffsetL = candidateL.prev - e.cur - matched + skipBeginning
-				if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
-					// Found a long match, at least 4 bytes.
-					matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
-					if matchedNext > matched {
-						t = coffsetL
-						s = s2
-						matched = matchedNext
-						if debugMatches {
-							println("prev long match at end-of-match")
-						}
-					}
-				}
-			}
-		}
-		// A match has been found. Update recent offsets.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && canRepeat && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Extend the n-byte match as long as possible.
-		l := matched
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-
-		// Index match start+1 (long) -> s - 1
-		off := index0 + e.cur
-		for index0 < s-1 {
-			cv0 := load6432(src, index0)
-			cv1 := cv0 >> 8
-			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-			e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
-			index0 += 2
-			off += 2
-		}
-
-		cv = load6432(src, s)
-		if !canRepeat {
-			continue
-		}
-
-		// Check offset 2
-		for {
-			o2 := s - offset2
-			if load3232(src, o2) != uint32(cv) {
-				// Do regular search
-				break
-			}
-
-			// Store this, since we have it.
-			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
-
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
-			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				// Finished
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-}
-
-// EncodeNoHist will encode a block with no history and no following blocks.
-// Most notable difference is that src will not be copied for history and
-// we do not need to check for max match length.
-func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
-	e.ensureHist(len(src))
-	e.Encode(blk, src)
-}
-
-// Encode improves compression...
-func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
-	const (
-		// Input margin is the number of bytes we read (8)
-		// and the maximum we will read ahead (2)
-		inputMargin            = 8 + 2
-		minNonLiteralBlockSize = 16
-	)
-
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = tableEntry{}
-			}
-			for i := range e.longTable[:] {
-				e.longTable[i] = prevEntry{}
-			}
-			e.cur = e.maxMatchOff
-			e.allDirty = true
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.table[i].offset = v
-		}
-		for i := range e.longTable[:] {
-			v := e.longTable[i].offset
-			v2 := e.longTable[i].prev
-			if v < minOff {
-				v = 0
-				v2 = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-				if v2 < minOff {
-					v2 = 0
-				} else {
-					v2 = v2 - e.cur + e.maxMatchOff
-				}
-			}
-			e.longTable[i] = prevEntry{
-				offset: v,
-				prev:   v2,
-			}
-		}
-		e.allDirty = true
-		e.cur = e.maxMatchOff
-		break
-	}
-
-	s := e.addBlock(src)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 1.
-	const stepSize = 1
-
-	const kSearchStrength = 9
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		var t int32
-		// We allow the encoder to optionally turn off repeat offsets across blocks
-		canRepeat := len(blk.sequences) > 2
-		var matched, index0 int32
-
-		for {
-			if debugAsserts && canRepeat && offset1 == 0 {
-				panic("offset0 was 0")
-			}
-
-			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
-			candidateL := e.longTable[nextHashL]
-			candidateS := e.table[nextHashS]
-
-			const repOff = 1
-			repIndex := s - offset1 + repOff
-			off := s + e.cur
-			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
-			e.markLongShardDirty(nextHashL)
-			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
-			e.markShortShardDirty(nextHashS)
-			index0 = s + 1
-
-			if canRepeat {
-				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
-					// Consider history as well.
-					var seq seq
-					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 0
-					seq.offset = 1
-					if debugSequences {
-						println("repeat sequence", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-
-					// Index match start+1 (long) -> s - 1
-					s += length + repOff
-
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-					// Index skipped...
-					for index0 < s-1 {
-						cv0 := load6432(src, index0)
-						cv1 := cv0 >> 8
-						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-						off := index0 + e.cur
-						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.markLongShardDirty(h0)
-						h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
-						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
-						e.markShortShardDirty(h1)
-						index0 += 2
-					}
-					cv = load6432(src, s)
-					continue
-				}
-				const repOff2 = 1
-
-				// We deviate from the reference encoder and also check offset 2.
-				// Still slower and not much better, so disabled.
-				// repIndex = s - offset2 + repOff2
-				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
-					// Consider history as well.
-					var seq seq
-					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff2
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 2
-					seq.offset = 2
-					if debugSequences {
-						println("repeat sequence 2", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-
-					s += length + repOff2
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-
-					// Index skipped...
-					for index0 < s-1 {
-						cv0 := load6432(src, index0)
-						cv1 := cv0 >> 8
-						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-						off := index0 + e.cur
-						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.markLongShardDirty(h0)
-						h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
-						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
-						e.markShortShardDirty(h1)
-						index0 += 2
-					}
-					cv = load6432(src, s)
-					// Swap offsets
-					offset1, offset2 = offset2, offset1
-					continue
-				}
-			}
-			// Find the offsets of our two matches.
-			coffsetL := candidateL.offset - e.cur
-			coffsetLP := candidateL.prev - e.cur
-
-			// Check if we have a long match.
-			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
-				// Found a long match, at least 8 bytes.
-				matched = e.matchlen(s+8, coffsetL+8, src) + 8
-				t = coffsetL
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-
-				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
-					// Found a long match, at least 8 bytes.
-					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
-					if prevMatch > matched {
-						matched = prevMatch
-						t = coffsetLP
-					}
-					if debugAsserts && s <= t {
-						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-					}
-					if debugAsserts && s-t > e.maxMatchOff {
-						panic("s - t >e.maxMatchOff")
-					}
-					if debugMatches {
-						println("long match")
-					}
-				}
-				break
-			}
-
-			// Check if we have a long match on prev.
-			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
-				// Found a long match, at least 8 bytes.
-				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
-				t = coffsetLP
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-				break
-			}
-
-			coffsetS := candidateS.offset - e.cur
-
-			// Check if we have a short match.
-			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
-				// found a regular match
-				matched = e.matchlen(s+4, coffsetS+4, src) + 4
-
-				// See if we can find a long match at s+1
-				const checkAt = 1
-				cv := load6432(src, s+checkAt)
-				nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
-				candidateL = e.longTable[nextHashL]
-				coffsetL = candidateL.offset - e.cur
-
-				// We can store it, since we have at least a 4 byte match.
-				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
-				e.markLongShardDirty(nextHashL)
-				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
-					// Found a long match, at least 8 bytes.
-					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
-					if matchedNext > matched {
-						t = coffsetL
-						s += checkAt
-						matched = matchedNext
-						if debugMatches {
-							println("long match (after short)")
-						}
-						break
-					}
-				}
-
-				// Check prev long...
-				coffsetL = candidateL.prev - e.cur
-				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
-					// Found a long match, at least 8 bytes.
-					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
-					if matchedNext > matched {
-						t = coffsetL
-						s += checkAt
-						matched = matchedNext
-						if debugMatches {
-							println("prev long match (after short)")
-						}
-						break
-					}
-				}
-				t = coffsetS
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				if debugMatches {
-					println("short match")
-				}
-				break
-			}
-
-			// No match found, move forward in input.
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-		// Try to find a better match by searching for a long match at the end of the current best match
-		if s+matched < sLimit {
-			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
-			cv := load3232(src, s)
-			candidateL := e.longTable[nextHashL]
-			coffsetL := candidateL.offset - e.cur - matched
-			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
-				// Found a long match, at least 4 bytes.
-				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
-				if matchedNext > matched {
-					t = coffsetL
-					matched = matchedNext
-					if debugMatches {
-						println("long match at end-of-match")
-					}
-				}
-			}
-
-			// Check prev long...
-			if true {
-				coffsetL = candidateL.prev - e.cur - matched
-				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
-					// Found a long match, at least 4 bytes.
-					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
-					if matchedNext > matched {
-						t = coffsetL
-						matched = matchedNext
-						if debugMatches {
-							println("prev long match at end-of-match")
-						}
-					}
-				}
-			}
-		}
-		// A match has been found. Update recent offsets.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && canRepeat && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Extend the n-byte match as long as possible.
-		l := matched
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-
-		// Index match start+1 (long) -> s - 1
-		off := index0 + e.cur
-		for index0 < s-1 {
-			cv0 := load6432(src, index0)
-			cv1 := cv0 >> 8
-			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-			e.markLongShardDirty(h0)
-			h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
-			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
-			e.markShortShardDirty(h1)
-			index0 += 2
-			off += 2
-		}
-
-		cv = load6432(src, s)
-		if !canRepeat {
-			continue
-		}
-
-		// Check offset 2
-		for {
-			o2 := s - offset2
-			if load3232(src, o2) != uint32(cv) {
-				// Do regular search
-				break
-			}
-
-			// Store this, since we have it.
-			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
-
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
-			e.markLongShardDirty(nextHashL)
-			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.markShortShardDirty(nextHashS)
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				// Finished
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-}
-
-// ResetDict will reset and set a dictionary if not nil
-func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
-	e.resetBase(d, singleBlock)
-	if d != nil {
-		panic("betterFastEncoder: Reset with dict")
-	}
-}
-
-// ResetDict will reset and set a dictionary if not nil
-func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
-	e.resetBase(d, singleBlock)
-	if d == nil {
-		return
-	}
-	// Init or copy dict table
-	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
-		if len(e.dictTable) != len(e.table) {
-			e.dictTable = make([]tableEntry, len(e.table))
-		}
-		end := int32(len(d.content)) - 8 + e.maxMatchOff
-		for i := e.maxMatchOff; i < end; i += 4 {
-			const hashLog = betterShortTableBits
-
-			cv := load6432(d.content, i-e.maxMatchOff)
-			nextHash := hashLen(cv, hashLog, betterShortLen)      // 0 -> 4
-			nextHash1 := hashLen(cv>>8, hashLog, betterShortLen)  // 1 -> 5
-			nextHash2 := hashLen(cv>>16, hashLog, betterShortLen) // 2 -> 6
-			nextHash3 := hashLen(cv>>24, hashLog, betterShortLen) // 3 -> 7
-			e.dictTable[nextHash] = tableEntry{
-				val:    uint32(cv),
-				offset: i,
-			}
-			e.dictTable[nextHash1] = tableEntry{
-				val:    uint32(cv >> 8),
-				offset: i + 1,
-			}
-			e.dictTable[nextHash2] = tableEntry{
-				val:    uint32(cv >> 16),
-				offset: i + 2,
-			}
-			e.dictTable[nextHash3] = tableEntry{
-				val:    uint32(cv >> 24),
-				offset: i + 3,
-			}
-		}
-		e.lastDictID = d.id
-		e.allDirty = true
-	}
-
-	// Init or copy dict table
-	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
-		if len(e.dictLongTable) != len(e.longTable) {
-			e.dictLongTable = make([]prevEntry, len(e.longTable))
-		}
-		if len(d.content) >= 8 {
-			cv := load6432(d.content, 0)
-			h := hashLen(cv, betterLongTableBits, betterLongLen)
-			e.dictLongTable[h] = prevEntry{
-				offset: e.maxMatchOff,
-				prev:   e.dictLongTable[h].offset,
-			}
-
-			end := int32(len(d.content)) - 8 + e.maxMatchOff
-			off := 8 // First to read
-			for i := e.maxMatchOff + 1; i < end; i++ {
-				cv = cv>>8 | (uint64(d.content[off]) << 56)
-				h := hashLen(cv, betterLongTableBits, betterLongLen)
-				e.dictLongTable[h] = prevEntry{
-					offset: i,
-					prev:   e.dictLongTable[h].offset,
-				}
-				off++
-			}
-		}
-		e.lastDictID = d.id
-		e.allDirty = true
-	}
-
-	// Reset table to initial state
-	{
-		dirtyShardCnt := 0
-		if !e.allDirty {
-			for i := range e.shortTableShardDirty {
-				if e.shortTableShardDirty[i] {
-					dirtyShardCnt++
-				}
-			}
-		}
-		const shardCnt = betterShortTableShardCnt
-		const shardSize = betterShortTableShardSize
-		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
-			copy(e.table[:], e.dictTable)
-			for i := range e.shortTableShardDirty {
-				e.shortTableShardDirty[i] = false
-			}
-		} else {
-			for i := range e.shortTableShardDirty {
-				if !e.shortTableShardDirty[i] {
-					continue
-				}
-
-				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
-				e.shortTableShardDirty[i] = false
-			}
-		}
-	}
-	{
-		dirtyShardCnt := 0
-		if !e.allDirty {
-			for i := range e.shortTableShardDirty {
-				if e.shortTableShardDirty[i] {
-					dirtyShardCnt++
-				}
-			}
-		}
-		const shardCnt = betterLongTableShardCnt
-		const shardSize = betterLongTableShardSize
-		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
-			copy(e.longTable[:], e.dictLongTable)
-			for i := range e.longTableShardDirty {
-				e.longTableShardDirty[i] = false
-			}
-		} else {
-			for i := range e.longTableShardDirty {
-				if !e.longTableShardDirty[i] {
-					continue
-				}
-
-				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
-				e.longTableShardDirty[i] = false
-			}
-		}
-	}
-	e.cur = e.maxMatchOff
-	e.allDirty = false
-}
-
-func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
-	e.longTableShardDirty[entryNum/betterLongTableShardSize] = true
-}
-
-func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
-	e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
deleted file mode 100644
index d36be7bd8..000000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ /dev/null
@@ -1,1123 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import "fmt"
-
-const (
-	dFastLongTableBits = 17                      // Bits used in the long match table
-	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
-	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
-	dFastLongLen       = 8                       // Bytes used for table hash
-
-	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
-	dLongTableShardSize = dFastLongTableSize / tableShardCnt        // Size of an individual shard
-
-	dFastShortTableBits = tableBits                // Bits used in the short match table
-	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
-	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
-	dFastShortLen       = 5                        // Bytes used for table hash
-
-)
-
-type doubleFastEncoder struct {
-	fastEncoder
-	longTable [dFastLongTableSize]tableEntry
-}
-
-type doubleFastEncoderDict struct {
-	fastEncoderDict
-	longTable           [dFastLongTableSize]tableEntry
-	dictLongTable       []tableEntry
-	longTableShardDirty [dLongTableShardCnt]bool
-}
-
-// Encode mimmics functionality in zstd_dfast.c
-func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
-	const (
-		// Input margin is the number of bytes we read (8)
-		// and the maximum we will read ahead (2)
-		inputMargin            = 8 + 2
-		minNonLiteralBlockSize = 16
-	)
-
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			e.table = [dFastShortTableSize]tableEntry{}
-			e.longTable = [dFastLongTableSize]tableEntry{}
-			e.cur = e.maxMatchOff
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.table[i].offset = v
-		}
-		for i := range e.longTable[:] {
-			v := e.longTable[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.longTable[i].offset = v
-		}
-		e.cur = e.maxMatchOff
-		break
-	}
-
-	s := e.addBlock(src)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 1.
-	const stepSize = 1
-
-	const kSearchStrength = 8
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		var t int32
-		// We allow the encoder to optionally turn off repeat offsets across blocks
-		canRepeat := len(blk.sequences) > 2
-
-		for {
-			if debugAsserts && canRepeat && offset1 == 0 {
-				panic("offset0 was 0")
-			}
-
-			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
-			candidateL := e.longTable[nextHashL]
-			candidateS := e.table[nextHashS]
-
-			const repOff = 1
-			repIndex := s - offset1 + repOff
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL] = entry
-			e.table[nextHashS] = entry
-
-			if canRepeat {
-				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
-					// Consider history as well.
-					var seq seq
-					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 0
-					seq.offset = 1
-					if debugSequences {
-						println("repeat sequence", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-					s += length + repOff
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-					cv = load6432(src, s)
-					continue
-				}
-			}
-			// Find the offsets of our two matches.
-			coffsetL := s - (candidateL.offset - e.cur)
-			coffsetS := s - (candidateS.offset - e.cur)
-
-			// Check if we have a long match.
-			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
-				// Found a long match, likely at least 8 bytes.
-				// Reference encoder checks all 8 bytes, we only check 4,
-				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
-				t = candidateL.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-				break
-			}
-
-			// Check if we have a short match.
-			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
-				// found a regular match
-				// See if we can find a long match at s+1
-				const checkAt = 1
-				cv := load6432(src, s+checkAt)
-				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
-				candidateL = e.longTable[nextHashL]
-				coffsetL = s - (candidateL.offset - e.cur) + checkAt
-
-				// We can store it, since we have at least a 4 byte match.
-				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
-				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
-					// Found a long match, likely at least 8 bytes.
-					// Reference encoder checks all 8 bytes, we only check 4,
-					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
-					t = candidateL.offset - e.cur
-					s += checkAt
-					if debugMatches {
-						println("long match (after short)")
-					}
-					break
-				}
-
-				t = candidateS.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				if debugMatches {
-					println("short match")
-				}
-				break
-			}
-
-			// No match found, move forward in input.
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-
-		// A 4-byte match has been found. Update recent offsets.
-		// We'll later see if more than 4 bytes.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && canRepeat && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-
-		// Index match start+1 (long) and start+2 (short)
-		index0 := s - l + 1
-		// Index match end-2 (long) and end-1 (short)
-		index1 := s - 2
-
-		cv0 := load6432(src, index0)
-		cv1 := load6432(src, index1)
-		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
-		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
-		e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
-		e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
-		cv0 >>= 8
-		cv1 >>= 8
-		te0.offset++
-		te1.offset++
-		te0.val = uint32(cv0)
-		te1.val = uint32(cv1)
-		e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
-		e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
-
-		cv = load6432(src, s)
-
-		if !canRepeat {
-			continue
-		}
-
-		// Check offset 2
-		for {
-			o2 := s - offset2
-			if load3232(src, o2) != uint32(cv) {
-				// Do regular search
-				break
-			}
-
-			// Store this, since we have it.
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
-			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
-
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL] = entry
-			e.table[nextHashS] = entry
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				// Finished
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-}
-
-// EncodeNoHist will encode a block with no history and no following blocks.
-// Most notable difference is that src will not be copied for history and
-// we do not need to check for max match length.
-func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
-	const (
-		// Input margin is the number of bytes we read (8)
-		// and the maximum we will read ahead (2)
-		inputMargin            = 8 + 2
-		minNonLiteralBlockSize = 16
-	)
-
-	// Protect against e.cur wraparound.
-	if e.cur >= e.bufferReset {
-		for i := range e.table[:] {
-			e.table[i] = tableEntry{}
-		}
-		for i := range e.longTable[:] {
-			e.longTable[i] = tableEntry{}
-		}
-		e.cur = e.maxMatchOff
-	}
-
-	s := int32(0)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 1.
-	const stepSize = 1
-
-	const kSearchStrength = 8
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		var t int32
-		for {
-
-			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
-			candidateL := e.longTable[nextHashL]
-			candidateS := e.table[nextHashS]
-
-			const repOff = 1
-			repIndex := s - offset1 + repOff
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL] = entry
-			e.table[nextHashS] = entry
-
-			if len(blk.sequences) > 2 {
-				if load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
-					// Consider history as well.
-					var seq seq
-					//length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
-					length := 4 + int32(matchLen(src[s+4+repOff:], src[repIndex+4:]))
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 0
-					seq.offset = 1
-					if debugSequences {
-						println("repeat sequence", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-					s += length + repOff
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-					cv = load6432(src, s)
-					continue
-				}
-			}
-			// Find the offsets of our two matches.
-			coffsetL := s - (candidateL.offset - e.cur)
-			coffsetS := s - (candidateS.offset - e.cur)
-
-			// Check if we have a long match.
-			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
-				// Found a long match, likely at least 8 bytes.
-				// Reference encoder checks all 8 bytes, we only check 4,
-				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
-				t = candidateL.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d). cur: %d", s, t, e.cur))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-				break
-			}
-
-			// Check if we have a short match.
-			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
-				// found a regular match
-				// See if we can find a long match at s+1
-				const checkAt = 1
-				cv := load6432(src, s+checkAt)
-				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
-				candidateL = e.longTable[nextHashL]
-				coffsetL = s - (candidateL.offset - e.cur) + checkAt
-
-				// We can store it, since we have at least a 4 byte match.
-				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
-				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
-					// Found a long match, likely at least 8 bytes.
-					// Reference encoder checks all 8 bytes, we only check 4,
-					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
-					t = candidateL.offset - e.cur
-					s += checkAt
-					if debugMatches {
-						println("long match (after short)")
-					}
-					break
-				}
-
-				t = candidateS.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				if debugMatches {
-					println("short match")
-				}
-				break
-			}
-
-			// No match found, move forward in input.
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-
-		// A 4-byte match has been found. Update recent offsets.
-		// We'll later see if more than 4 bytes.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		// Extend the 4-byte match as long as possible.
-		//l := e.matchlen(s+4, t+4, src) + 4
-		l := int32(matchLen(src[s+4:], src[t+4:])) + 4
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-
-		// Index match start+1 (long) and start+2 (short)
-		index0 := s - l + 1
-		// Index match end-2 (long) and end-1 (short)
-		index1 := s - 2
-
-		cv0 := load6432(src, index0)
-		cv1 := load6432(src, index1)
-		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
-		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
-		e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
-		e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
-		cv0 >>= 8
-		cv1 >>= 8
-		te0.offset++
-		te1.offset++
-		te0.val = uint32(cv0)
-		te1.val = uint32(cv1)
-		e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
-		e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
-
-		cv = load6432(src, s)
-
-		if len(blk.sequences) <= 2 {
-			continue
-		}
-
-		// Check offset 2
-		for {
-			o2 := s - offset2
-			if load3232(src, o2) != uint32(cv) {
-				// Do regular search
-				break
-			}
-
-			// Store this, since we have it.
-			nextHashS := hashLen(cv1>>8, dFastShortTableBits, dFastShortLen)
-			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
-
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			//l := 4 + e.matchlen(s+4, o2+4, src)
-			l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
-
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL] = entry
-			e.table[nextHashS] = entry
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				// Finished
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-
-	// We do not store history, so we must offset e.cur to avoid false matches for next user.
-	if e.cur < e.bufferReset {
-		e.cur += int32(len(src))
-	}
-}
-
-// Encode will encode the content, with a dictionary if initialized for it.
-func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
-	const (
-		// Input margin is the number of bytes we read (8)
-		// and the maximum we will read ahead (2)
-		inputMargin            = 8 + 2
-		minNonLiteralBlockSize = 16
-	)
-
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = tableEntry{}
-			}
-			for i := range e.longTable[:] {
-				e.longTable[i] = tableEntry{}
-			}
-			e.markAllShardsDirty()
-			e.cur = e.maxMatchOff
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.table[i].offset = v
-		}
-		for i := range e.longTable[:] {
-			v := e.longTable[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.longTable[i].offset = v
-		}
-		e.markAllShardsDirty()
-		e.cur = e.maxMatchOff
-		break
-	}
-
-	s := e.addBlock(src)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 1.
-	const stepSize = 1
-
-	const kSearchStrength = 8
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		var t int32
-		// We allow the encoder to optionally turn off repeat offsets across blocks
-		canRepeat := len(blk.sequences) > 2
-
-		for {
-			if debugAsserts && canRepeat && offset1 == 0 {
-				panic("offset0 was 0")
-			}
-
-			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
-			candidateL := e.longTable[nextHashL]
-			candidateS := e.table[nextHashS]
-
-			const repOff = 1
-			repIndex := s - offset1 + repOff
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL] = entry
-			e.markLongShardDirty(nextHashL)
-			e.table[nextHashS] = entry
-			e.markShardDirty(nextHashS)
-
-			if canRepeat {
-				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
-					// Consider history as well.
-					var seq seq
-					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
-
-					seq.matchLen = uint32(length - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 0
-					seq.offset = 1
-					if debugSequences {
-						println("repeat sequence", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-					s += length + repOff
-					nextEmit = s
-					if s >= sLimit {
-						if debugEncoder {
-							println("repeat ended", s, length)
-
-						}
-						break encodeLoop
-					}
-					cv = load6432(src, s)
-					continue
-				}
-			}
-			// Find the offsets of our two matches.
-			coffsetL := s - (candidateL.offset - e.cur)
-			coffsetS := s - (candidateS.offset - e.cur)
-
-			// Check if we have a long match.
-			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
-				// Found a long match, likely at least 8 bytes.
-				// Reference encoder checks all 8 bytes, we only check 4,
-				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
-				t = candidateL.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugMatches {
-					println("long match")
-				}
-				break
-			}
-
-			// Check if we have a short match.
-			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
-				// found a regular match
-				// See if we can find a long match at s+1
-				const checkAt = 1
-				cv := load6432(src, s+checkAt)
-				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
-				candidateL = e.longTable[nextHashL]
-				coffsetL = s - (candidateL.offset - e.cur) + checkAt
-
-				// We can store it, since we have at least a 4 byte match.
-				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
-				e.markLongShardDirty(nextHashL)
-				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
-					// Found a long match, likely at least 8 bytes.
-					// Reference encoder checks all 8 bytes, we only check 4,
-					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
-					t = candidateL.offset - e.cur
-					s += checkAt
-					if debugMatches {
-						println("long match (after short)")
-					}
-					break
-				}
-
-				t = candidateS.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				if debugMatches {
-					println("short match")
-				}
-				break
-			}
-
-			// No match found, move forward in input.
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-
-		// A 4-byte match has been found. Update recent offsets.
-		// We'll later see if more than 4 bytes.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && canRepeat && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-
-		// Index match start+1 (long) and start+2 (short)
-		index0 := s - l + 1
-		// Index match end-2 (long) and end-1 (short)
-		index1 := s - 2
-
-		cv0 := load6432(src, index0)
-		cv1 := load6432(src, index1)
-		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
-		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
-		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
-		longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
-		e.longTable[longHash1] = te0
-		e.longTable[longHash2] = te1
-		e.markLongShardDirty(longHash1)
-		e.markLongShardDirty(longHash2)
-		cv0 >>= 8
-		cv1 >>= 8
-		te0.offset++
-		te1.offset++
-		te0.val = uint32(cv0)
-		te1.val = uint32(cv1)
-		hashVal1 := hashLen(cv0, dFastShortTableBits, dFastShortLen)
-		hashVal2 := hashLen(cv1, dFastShortTableBits, dFastShortLen)
-		e.table[hashVal1] = te0
-		e.markShardDirty(hashVal1)
-		e.table[hashVal2] = te1
-		e.markShardDirty(hashVal2)
-
-		cv = load6432(src, s)
-
-		if !canRepeat {
-			continue
-		}
-
-		// Check offset 2
-		for {
-			o2 := s - offset2
-			if load3232(src, o2) != uint32(cv) {
-				// Do regular search
-				break
-			}
-
-			// Store this, since we have it.
-			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
-
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL] = entry
-			e.markLongShardDirty(nextHashL)
-			e.table[nextHashS] = entry
-			e.markShardDirty(nextHashS)
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				// Finished
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-	// If we encoded more than 64K mark all dirty.
-	if len(src) > 64<<10 {
-		e.markAllShardsDirty()
-	}
-}
-
-// ResetDict will reset and set a dictionary if not nil
-func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
-	e.fastEncoder.Reset(d, singleBlock)
-	if d != nil {
-		panic("doubleFastEncoder: Reset with dict not supported")
-	}
-}
-
-// ResetDict will reset and set a dictionary if not nil
-func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
-	allDirty := e.allDirty
-	e.fastEncoderDict.Reset(d, singleBlock)
-	if d == nil {
-		return
-	}
-
-	// Init or copy dict table
-	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
-		if len(e.dictLongTable) != len(e.longTable) {
-			e.dictLongTable = make([]tableEntry, len(e.longTable))
-		}
-		if len(d.content) >= 8 {
-			cv := load6432(d.content, 0)
-			e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
-				val:    uint32(cv),
-				offset: e.maxMatchOff,
-			}
-			end := int32(len(d.content)) - 8 + e.maxMatchOff
-			for i := e.maxMatchOff + 1; i < end; i++ {
-				cv = cv>>8 | (uint64(d.content[i-e.maxMatchOff+7]) << 56)
-				e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
-					val:    uint32(cv),
-					offset: i,
-				}
-			}
-		}
-		e.lastDictID = d.id
-		allDirty = true
-	}
-	// Reset table to initial state
-	e.cur = e.maxMatchOff
-
-	dirtyShardCnt := 0
-	if !allDirty {
-		for i := range e.longTableShardDirty {
-			if e.longTableShardDirty[i] {
-				dirtyShardCnt++
-			}
-		}
-	}
-
-	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
-		//copy(e.longTable[:], e.dictLongTable)
-		e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
-		for i := range e.longTableShardDirty {
-			e.longTableShardDirty[i] = false
-		}
-		return
-	}
-	for i := range e.longTableShardDirty {
-		if !e.longTableShardDirty[i] {
-			continue
-		}
-
-		// copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
-		*(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
-
-		e.longTableShardDirty[i] = false
-	}
-}
-
-func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
-	e.longTableShardDirty[entryNum/dLongTableShardSize] = true
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
deleted file mode 100644
index f45a3da7d..000000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ /dev/null
@@ -1,891 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"fmt"
-)
-
-const (
-	tableBits        = 15                               // Bits used in the table
-	tableSize        = 1 << tableBits                   // Size of the table
-	tableShardCnt    = 1 << (tableBits - dictShardBits) // Number of shards in the table
-	tableShardSize   = tableSize / tableShardCnt        // Size of an individual shard
-	tableFastHashLen = 6
-	tableMask        = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
-	maxMatchLength   = 131074
-)
-
-type tableEntry struct {
-	val    uint32
-	offset int32
-}
-
-type fastEncoder struct {
-	fastBase
-	table [tableSize]tableEntry
-}
-
-type fastEncoderDict struct {
-	fastEncoder
-	dictTable       []tableEntry
-	tableShardDirty [tableShardCnt]bool
-	allDirty        bool
-}
-
-// Encode mimmics functionality in zstd_fast.c
-func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
-	const (
-		inputMargin            = 8
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-	)
-
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = tableEntry{}
-			}
-			e.cur = e.maxMatchOff
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.table[i].offset = v
-		}
-		e.cur = e.maxMatchOff
-		break
-	}
-
-	s := e.addBlock(src)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 2.
-	const stepSize = 2
-
-	// TEMPLATE
-	const hashLog = tableBits
-	// seems global, but would be nice to tweak.
-	const kSearchStrength = 6
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		// t will contain the match offset when we find one.
-		// When existing the search loop, we have already checked 4 bytes.
-		var t int32
-
-		// We will not use repeat offsets across blocks.
-		// By not using them for the first 3 matches
-		canRepeat := len(blk.sequences) > 2
-
-		for {
-			if debugAsserts && canRepeat && offset1 == 0 {
-				panic("offset0 was 0")
-			}
-
-			nextHash := hashLen(cv, hashLog, tableFastHashLen)
-			nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
-			candidate := e.table[nextHash]
-			candidate2 := e.table[nextHash2]
-			repIndex := s - offset1 + 2
-
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
-
-			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
-				// Consider history as well.
-				var seq seq
-				length := 4 + e.matchlen(s+6, repIndex+4, src)
-				seq.matchLen = uint32(length - zstdMinMatch)
-
-				// We might be able to match backwards.
-				// Extend as long as we can.
-				start := s + 2
-				// We end the search early, so we don't risk 0 literals
-				// and have to do special offset treatment.
-				startLimit := nextEmit + 1
-
-				sMin := s - e.maxMatchOff
-				if sMin < 0 {
-					sMin = 0
-				}
-				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
-					repIndex--
-					start--
-					seq.matchLen++
-				}
-				addLiterals(&seq, start)
-
-				// rep 0
-				seq.offset = 1
-				if debugSequences {
-					println("repeat sequence", seq, "next s:", s)
-				}
-				blk.sequences = append(blk.sequences, seq)
-				s += length + 2
-				nextEmit = s
-				if s >= sLimit {
-					if debugEncoder {
-						println("repeat ended", s, length)
-
-					}
-					break encodeLoop
-				}
-				cv = load6432(src, s)
-				continue
-			}
-			coffset0 := s - (candidate.offset - e.cur)
-			coffset1 := s - (candidate2.offset - e.cur) + 1
-			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
-				// found a regular match
-				t = candidate.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				break
-			}
-
-			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
-				// found a regular match
-				t = candidate2.offset - e.cur
-				s++
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				break
-			}
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-		// A 4-byte match has been found. We'll later see if more than 4 bytes.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && canRepeat && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence.
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		// Don't use repeat offsets
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-		cv = load6432(src, s)
-
-		// Check offset 2
-		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			// Store this, since we have it.
-			nextHash := hashLen(cv, hashLog, tableFastHashLen)
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				break encodeLoop
-			}
-			// Prepare next loop.
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-}
-
-// EncodeNoHist will encode a block with no history and no following blocks.
-// Most notable difference is that src will not be copied for history and
-// we do not need to check for max match length.
-func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
-	const (
-		inputMargin            = 8
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-	)
-	if debugEncoder {
-		if len(src) > maxCompressedBlockSize {
-			panic("src too big")
-		}
-	}
-
-	// Protect against e.cur wraparound.
-	if e.cur >= e.bufferReset {
-		for i := range e.table[:] {
-			e.table[i] = tableEntry{}
-		}
-		e.cur = e.maxMatchOff
-	}
-
-	s := int32(0)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 2.
-	const stepSize = 2
-
-	// TEMPLATE
-	const hashLog = tableBits
-	// seems global, but would be nice to tweak.
-	const kSearchStrength = 6
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		// t will contain the match offset when we find one.
-		// When existing the search loop, we have already checked 4 bytes.
-		var t int32
-
-		// We will not use repeat offsets across blocks.
-		// By not using them for the first 3 matches
-
-		for {
-			nextHash := hashLen(cv, hashLog, tableFastHashLen)
-			nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
-			candidate := e.table[nextHash]
-			candidate2 := e.table[nextHash2]
-			repIndex := s - offset1 + 2
-
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
-
-			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
-				// Consider history as well.
-				var seq seq
-				length := 4 + e.matchlen(s+6, repIndex+4, src)
-
-				seq.matchLen = uint32(length - zstdMinMatch)
-
-				// We might be able to match backwards.
-				// Extend as long as we can.
-				start := s + 2
-				// We end the search early, so we don't risk 0 literals
-				// and have to do special offset treatment.
-				startLimit := nextEmit + 1
-
-				sMin := s - e.maxMatchOff
-				if sMin < 0 {
-					sMin = 0
-				}
-				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] {
-					repIndex--
-					start--
-					seq.matchLen++
-				}
-				addLiterals(&seq, start)
-
-				// rep 0
-				seq.offset = 1
-				if debugSequences {
-					println("repeat sequence", seq, "next s:", s)
-				}
-				blk.sequences = append(blk.sequences, seq)
-				s += length + 2
-				nextEmit = s
-				if s >= sLimit {
-					if debugEncoder {
-						println("repeat ended", s, length)
-
-					}
-					break encodeLoop
-				}
-				cv = load6432(src, s)
-				continue
-			}
-			coffset0 := s - (candidate.offset - e.cur)
-			coffset1 := s - (candidate2.offset - e.cur) + 1
-			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
-				// found a regular match
-				t = candidate.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic(fmt.Sprintf("t (%d) < 0, candidate.offset: %d, e.cur: %d, coffset0: %d, e.maxMatchOff: %d", t, candidate.offset, e.cur, coffset0, e.maxMatchOff))
-				}
-				break
-			}
-
-			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
-				// found a regular match
-				t = candidate2.offset - e.cur
-				s++
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				break
-			}
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-		// A 4-byte match has been found. We'll later see if more than 4 bytes.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && t < 0 {
-			panic(fmt.Sprintf("t (%d) < 0 ", t))
-		}
-		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence.
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		// Don't use repeat offsets
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-		cv = load6432(src, s)
-
-		// Check offset 2
-		if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) {
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			// Store this, since we have it.
-			nextHash := hashLen(cv, hashLog, tableFastHashLen)
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				break encodeLoop
-			}
-			// Prepare next loop.
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-	// We do not store history, so we must offset e.cur to avoid false matches for next user.
-	if e.cur < e.bufferReset {
-		e.cur += int32(len(src))
-	}
-}
-
-// Encode will encode the content, with a dictionary if initialized for it.
-func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
-	const (
-		inputMargin            = 8
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-	)
-	if e.allDirty || len(src) > 32<<10 {
-		e.fastEncoder.Encode(blk, src)
-		e.allDirty = true
-		return
-	}
-	// Protect against e.cur wraparound.
-	for e.cur >= e.bufferReset-int32(len(e.hist)) {
-		if len(e.hist) == 0 {
-			e.table = [tableSize]tableEntry{}
-			e.cur = e.maxMatchOff
-			break
-		}
-		// Shift down everything in the table that isn't already too far away.
-		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
-		for i := range e.table[:] {
-			v := e.table[i].offset
-			if v < minOff {
-				v = 0
-			} else {
-				v = v - e.cur + e.maxMatchOff
-			}
-			e.table[i].offset = v
-		}
-		e.cur = e.maxMatchOff
-		break
-	}
-
-	s := e.addBlock(src)
-	blk.size = len(src)
-	if len(src) < minNonLiteralBlockSize {
-		blk.extraLits = len(src)
-		blk.literals = blk.literals[:len(src)]
-		copy(blk.literals, src)
-		return
-	}
-
-	// Override src
-	src = e.hist
-	sLimit := int32(len(src)) - inputMargin
-	// stepSize is the number of bytes to skip on every main loop iteration.
-	// It should be >= 2.
-	const stepSize = 2
-
-	// TEMPLATE
-	const hashLog = tableBits
-	// seems global, but would be nice to tweak.
-	const kSearchStrength = 7
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := s
-	cv := load6432(src, s)
-
-	// Relative offsets
-	offset1 := int32(blk.recentOffsets[0])
-	offset2 := int32(blk.recentOffsets[1])
-
-	addLiterals := func(s *seq, until int32) {
-		if until == nextEmit {
-			return
-		}
-		blk.literals = append(blk.literals, src[nextEmit:until]...)
-		s.litLen = uint32(until - nextEmit)
-	}
-	if debugEncoder {
-		println("recent offsets:", blk.recentOffsets)
-	}
-
-encodeLoop:
-	for {
-		// t will contain the match offset when we find one.
-		// When existing the search loop, we have already checked 4 bytes.
-		var t int32
-
-		// We will not use repeat offsets across blocks.
-		// By not using them for the first 3 matches
-		canRepeat := len(blk.sequences) > 2
-
-		for {
-			if debugAsserts && canRepeat && offset1 == 0 {
-				panic("offset0 was 0")
-			}
-
-			nextHash := hashLen(cv, hashLog, tableFastHashLen)
-			nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
-			candidate := e.table[nextHash]
-			candidate2 := e.table[nextHash2]
-			repIndex := s - offset1 + 2
-
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.markShardDirty(nextHash)
-			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
-			e.markShardDirty(nextHash2)
-
-			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
-				// Consider history as well.
-				var seq seq
-				length := 4 + e.matchlen(s+6, repIndex+4, src)
-
-				seq.matchLen = uint32(length - zstdMinMatch)
-
-				// We might be able to match backwards.
-				// Extend as long as we can.
-				start := s + 2
-				// We end the search early, so we don't risk 0 literals
-				// and have to do special offset treatment.
-				startLimit := nextEmit + 1
-
-				sMin := s - e.maxMatchOff
-				if sMin < 0 {
-					sMin = 0
-				}
-				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
-					repIndex--
-					start--
-					seq.matchLen++
-				}
-				addLiterals(&seq, start)
-
-				// rep 0
-				seq.offset = 1
-				if debugSequences {
-					println("repeat sequence", seq, "next s:", s)
-				}
-				blk.sequences = append(blk.sequences, seq)
-				s += length + 2
-				nextEmit = s
-				if s >= sLimit {
-					if debugEncoder {
-						println("repeat ended", s, length)
-
-					}
-					break encodeLoop
-				}
-				cv = load6432(src, s)
-				continue
-			}
-			coffset0 := s - (candidate.offset - e.cur)
-			coffset1 := s - (candidate2.offset - e.cur) + 1
-			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
-				// found a regular match
-				t = candidate.offset - e.cur
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				break
-			}
-
-			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
-				// found a regular match
-				t = candidate2.offset - e.cur
-				s++
-				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-				}
-				if debugAsserts && s-t > e.maxMatchOff {
-					panic("s - t >e.maxMatchOff")
-				}
-				if debugAsserts && t < 0 {
-					panic("t<0")
-				}
-				break
-			}
-			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
-			if s >= sLimit {
-				break encodeLoop
-			}
-			cv = load6432(src, s)
-		}
-		// A 4-byte match has been found. We'll later see if more than 4 bytes.
-		offset2 = offset1
-		offset1 = s - t
-
-		if debugAsserts && s <= t {
-			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
-		}
-
-		if debugAsserts && canRepeat && int(offset1) > len(src) {
-			panic("invalid offset")
-		}
-
-		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
-		// Write our sequence.
-		var seq seq
-		seq.litLen = uint32(s - nextEmit)
-		seq.matchLen = uint32(l - zstdMinMatch)
-		if seq.litLen > 0 {
-			blk.literals = append(blk.literals, src[nextEmit:s]...)
-		}
-		// Don't use repeat offsets
-		seq.offset = uint32(s-t) + 3
-		s += l
-		if debugSequences {
-			println("sequence", seq, "next s:", s)
-		}
-		blk.sequences = append(blk.sequences, seq)
-		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
-		}
-		cv = load6432(src, s)
-
-		// Check offset 2
-		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			// Store this, since we have it.
-			nextHash := hashLen(cv, hashLog, tableFastHashLen)
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.markShardDirty(nextHash)
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				break encodeLoop
-			}
-			// Prepare next loop.
-			cv = load6432(src, s)
-		}
-	}
-
-	if int(nextEmit) < len(src) {
-		blk.literals = append(blk.literals, src[nextEmit:]...)
-		blk.extraLits = len(src) - int(nextEmit)
-	}
-	blk.recentOffsets[0] = uint32(offset1)
-	blk.recentOffsets[1] = uint32(offset2)
-	if debugEncoder {
-		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
-	}
-}
-
-// ResetDict will reset and set a dictionary if not nil
-func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
-	e.resetBase(d, singleBlock)
-	if d != nil {
-		panic("fastEncoder: Reset with dict")
-	}
-}
-
-// ResetDict will reset and set a dictionary if not nil
-func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
-	e.resetBase(d, singleBlock)
-	if d == nil {
-		return
-	}
-
-	// Init or copy dict table
-	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
-		if len(e.dictTable) != len(e.table) {
-			e.dictTable = make([]tableEntry, len(e.table))
-		}
-		if true {
-			end := e.maxMatchOff + int32(len(d.content)) - 8
-			for i := e.maxMatchOff; i < end; i += 2 {
-				const hashLog = tableBits
-
-				cv := load6432(d.content, i-e.maxMatchOff)
-				nextHash := hashLen(cv, hashLog, tableFastHashLen)     // 0 -> 6
-				nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen) // 1 -> 7
-				e.dictTable[nextHash] = tableEntry{
-					val:    uint32(cv),
-					offset: i,
-				}
-				e.dictTable[nextHash1] = tableEntry{
-					val:    uint32(cv >> 8),
-					offset: i + 1,
-				}
-			}
-		}
-		e.lastDictID = d.id
-		e.allDirty = true
-	}
-
-	e.cur = e.maxMatchOff
-	dirtyShardCnt := 0
-	if !e.allDirty {
-		for i := range e.tableShardDirty {
-			if e.tableShardDirty[i] {
-				dirtyShardCnt++
-			}
-		}
-	}
-
-	const shardCnt = tableShardCnt
-	const shardSize = tableShardSize
-	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
-		//copy(e.table[:], e.dictTable)
-		e.table = *(*[tableSize]tableEntry)(e.dictTable)
-		for i := range e.tableShardDirty {
-			e.tableShardDirty[i] = false
-		}
-		e.allDirty = false
-		return
-	}
-	for i := range e.tableShardDirty {
-		if !e.tableShardDirty[i] {
-			continue
-		}
-
-		//copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
-		*(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:])
-		e.tableShardDirty[i] = false
-	}
-	e.allDirty = false
-}
-
-func (e *fastEncoderDict) markAllShardsDirty() {
-	e.allDirty = true
-}
-
-func (e *fastEncoderDict) markShardDirty(entryNum uint32) {
-	e.tableShardDirty[entryNum/tableShardSize] = true
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
deleted file mode 100644
index 8f8223cd3..000000000
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ /dev/null
@@ -1,642 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"crypto/rand"
-	"errors"
-	"fmt"
-	"io"
-	"math"
-	rdebug "runtime/debug"
-	"sync"
-
-	"github.com/klauspost/compress/zstd/internal/xxhash"
-)
-
-// Encoder provides encoding to Zstandard.
-// An Encoder can be used for either compressing a stream via the
-// io.WriteCloser interface supported by the Encoder or as multiple independent
-// tasks via the EncodeAll function.
-// Smaller encodes are encouraged to use the EncodeAll function.
-// Use NewWriter to create a new instance.
-type Encoder struct {
-	o        encoderOptions
-	encoders chan encoder
-	state    encoderState
-	init     sync.Once
-}
-
-type encoder interface {
-	Encode(blk *blockEnc, src []byte)
-	EncodeNoHist(blk *blockEnc, src []byte)
-	Block() *blockEnc
-	CRC() *xxhash.Digest
-	AppendCRC([]byte) []byte
-	WindowSize(size int64) int32
-	UseBlock(*blockEnc)
-	Reset(d *dict, singleBlock bool)
-}
-
-type encoderState struct {
-	w                io.Writer
-	filling          []byte
-	current          []byte
-	previous         []byte
-	encoder          encoder
-	writing          *blockEnc
-	err              error
-	writeErr         error
-	nWritten         int64
-	nInput           int64
-	frameContentSize int64
-	headerWritten    bool
-	eofWritten       bool
-	fullFrameWritten bool
-
-	// This waitgroup indicates an encode is running.
-	wg sync.WaitGroup
-	// This waitgroup indicates we have a block encoding/writing.
-	wWg sync.WaitGroup
-}
-
-// NewWriter will create a new Zstandard encoder.
-// If the encoder will be used for encoding blocks a nil writer can be used.
-func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) {
-	initPredefined()
-	var e Encoder
-	e.o.setDefault()
-	for _, o := range opts {
-		err := o(&e.o)
-		if err != nil {
-			return nil, err
-		}
-	}
-	if w != nil {
-		e.Reset(w)
-	}
-	return &e, nil
-}
-
-func (e *Encoder) initialize() {
-	if e.o.concurrent == 0 {
-		e.o.setDefault()
-	}
-	e.encoders = make(chan encoder, e.o.concurrent)
-	for i := 0; i < e.o.concurrent; i++ {
-		enc := e.o.encoder()
-		e.encoders <- enc
-	}
-}
-
-// Reset will re-initialize the writer and new writes will encode to the supplied writer
-// as a new, independent stream.
-func (e *Encoder) Reset(w io.Writer) {
-	s := &e.state
-	s.wg.Wait()
-	s.wWg.Wait()
-	if cap(s.filling) == 0 {
-		s.filling = make([]byte, 0, e.o.blockSize)
-	}
-	if e.o.concurrent > 1 {
-		if cap(s.current) == 0 {
-			s.current = make([]byte, 0, e.o.blockSize)
-		}
-		if cap(s.previous) == 0 {
-			s.previous = make([]byte, 0, e.o.blockSize)
-		}
-		s.current = s.current[:0]
-		s.previous = s.previous[:0]
-		if s.writing == nil {
-			s.writing = &blockEnc{lowMem: e.o.lowMem}
-			s.writing.init()
-		}
-		s.writing.initNewEncode()
-	}
-	if s.encoder == nil {
-		s.encoder = e.o.encoder()
-	}
-	s.filling = s.filling[:0]
-	s.encoder.Reset(e.o.dict, false)
-	s.headerWritten = false
-	s.eofWritten = false
-	s.fullFrameWritten = false
-	s.w = w
-	s.err = nil
-	s.nWritten = 0
-	s.nInput = 0
-	s.writeErr = nil
-	s.frameContentSize = 0
-}
-
-// ResetContentSize will reset and set a content size for the next stream.
-// If the bytes written does not match the size given an error will be returned
-// when calling Close().
-// This is removed when Reset is called.
-// Sizes <= 0 results in no content size set.
-func (e *Encoder) ResetContentSize(w io.Writer, size int64) {
-	e.Reset(w)
-	if size >= 0 {
-		e.state.frameContentSize = size
-	}
-}
-
-// Write data to the encoder.
-// Input data will be buffered and as the buffer fills up
-// content will be compressed and written to the output.
-// When done writing, use Close to flush the remaining output
-// and write CRC if requested.
-func (e *Encoder) Write(p []byte) (n int, err error) {
-	s := &e.state
-	if s.eofWritten {
-		return 0, ErrEncoderClosed
-	}
-	for len(p) > 0 {
-		if len(p)+len(s.filling) < e.o.blockSize {
-			if e.o.crc {
-				_, _ = s.encoder.CRC().Write(p)
-			}
-			s.filling = append(s.filling, p...)
-			return n + len(p), nil
-		}
-		add := p
-		if len(p)+len(s.filling) > e.o.blockSize {
-			add = add[:e.o.blockSize-len(s.filling)]
-		}
-		if e.o.crc {
-			_, _ = s.encoder.CRC().Write(add)
-		}
-		s.filling = append(s.filling, add...)
-		p = p[len(add):]
-		n += len(add)
-		if len(s.filling) < e.o.blockSize {
-			return n, nil
-		}
-		err := e.nextBlock(false)
-		if err != nil {
-			return n, err
-		}
-		if debugAsserts && len(s.filling) > 0 {
-			panic(len(s.filling))
-		}
-	}
-	return n, nil
-}
-
-// nextBlock will synchronize and start compressing input in e.state.filling.
-// If an error has occurred during encoding it will be returned.
-func (e *Encoder) nextBlock(final bool) error {
-	s := &e.state
-	// Wait for current block.
-	s.wg.Wait()
-	if s.err != nil {
-		return s.err
-	}
-	if len(s.filling) > e.o.blockSize {
-		return fmt.Errorf("block > maxStoreBlockSize")
-	}
-	if !s.headerWritten {
-		// If we have a single block encode, do a sync compression.
-		if final && len(s.filling) == 0 && !e.o.fullZero {
-			s.headerWritten = true
-			s.fullFrameWritten = true
-			s.eofWritten = true
-			return nil
-		}
-		if final && len(s.filling) > 0 {
-			s.current = e.encodeAll(s.encoder, s.filling, s.current[:0])
-			var n2 int
-			n2, s.err = s.w.Write(s.current)
-			if s.err != nil {
-				return s.err
-			}
-			s.nWritten += int64(n2)
-			s.nInput += int64(len(s.filling))
-			s.current = s.current[:0]
-			s.filling = s.filling[:0]
-			s.headerWritten = true
-			s.fullFrameWritten = true
-			s.eofWritten = true
-			return nil
-		}
-
-		var tmp [maxHeaderSize]byte
-		fh := frameHeader{
-			ContentSize:   uint64(s.frameContentSize),
-			WindowSize:    uint32(s.encoder.WindowSize(s.frameContentSize)),
-			SingleSegment: false,
-			Checksum:      e.o.crc,
-			DictID:        e.o.dict.ID(),
-		}
-
-		dst := fh.appendTo(tmp[:0])
-		s.headerWritten = true
-		s.wWg.Wait()
-		var n2 int
-		n2, s.err = s.w.Write(dst)
-		if s.err != nil {
-			return s.err
-		}
-		s.nWritten += int64(n2)
-	}
-	if s.eofWritten {
-		// Ensure we only write it once.
-		final = false
-	}
-
-	if len(s.filling) == 0 {
-		// Final block, but no data.
-		if final {
-			enc := s.encoder
-			blk := enc.Block()
-			blk.reset(nil)
-			blk.last = true
-			blk.encodeRaw(nil)
-			s.wWg.Wait()
-			_, s.err = s.w.Write(blk.output)
-			s.nWritten += int64(len(blk.output))
-			s.eofWritten = true
-		}
-		return s.err
-	}
-
-	// SYNC:
-	if e.o.concurrent == 1 {
-		src := s.filling
-		s.nInput += int64(len(s.filling))
-		if debugEncoder {
-			println("Adding sync block,", len(src), "bytes, final:", final)
-		}
-		enc := s.encoder
-		blk := enc.Block()
-		blk.reset(nil)
-		enc.Encode(blk, src)
-		blk.last = final
-		if final {
-			s.eofWritten = true
-		}
-
-		s.err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
-		if s.err != nil {
-			return s.err
-		}
-		_, s.err = s.w.Write(blk.output)
-		s.nWritten += int64(len(blk.output))
-		s.filling = s.filling[:0]
-		return s.err
-	}
-
-	// Move blocks forward.
-	s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
-	s.nInput += int64(len(s.current))
-	s.wg.Add(1)
-	if final {
-		s.eofWritten = true
-	}
-	go func(src []byte) {
-		if debugEncoder {
-			println("Adding block,", len(src), "bytes, final:", final)
-		}
-		defer func() {
-			if r := recover(); r != nil {
-				s.err = fmt.Errorf("panic while encoding: %v", r)
-				rdebug.PrintStack()
-			}
-			s.wg.Done()
-		}()
-		enc := s.encoder
-		blk := enc.Block()
-		enc.Encode(blk, src)
-		blk.last = final
-		// Wait for pending writes.
-		s.wWg.Wait()
-		if s.writeErr != nil {
-			s.err = s.writeErr
-			return
-		}
-		// Transfer encoders from previous write block.
-		blk.swapEncoders(s.writing)
-		// Transfer recent offsets to next.
-		enc.UseBlock(s.writing)
-		s.writing = blk
-		s.wWg.Add(1)
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					s.writeErr = fmt.Errorf("panic while encoding/writing: %v", r)
-					rdebug.PrintStack()
-				}
-				s.wWg.Done()
-			}()
-			s.writeErr = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
-			if s.writeErr != nil {
-				return
-			}
-			_, s.writeErr = s.w.Write(blk.output)
-			s.nWritten += int64(len(blk.output))
-		}()
-	}(s.current)
-	return nil
-}
-
-// ReadFrom reads data from r until EOF or error.
-// The return value n is the number of bytes read.
-// Any error except io.EOF encountered during the read is also returned.
-//
-// The Copy function uses ReaderFrom if available.
-func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
-	if debugEncoder {
-		println("Using ReadFrom")
-	}
-
-	// Flush any current writes.
-	if len(e.state.filling) > 0 {
-		if err := e.nextBlock(false); err != nil {
-			return 0, err
-		}
-	}
-	e.state.filling = e.state.filling[:e.o.blockSize]
-	src := e.state.filling
-	for {
-		n2, err := r.Read(src)
-		if e.o.crc {
-			_, _ = e.state.encoder.CRC().Write(src[:n2])
-		}
-		// src is now the unfilled part...
-		src = src[n2:]
-		n += int64(n2)
-		switch err {
-		case io.EOF:
-			e.state.filling = e.state.filling[:len(e.state.filling)-len(src)]
-			if debugEncoder {
-				println("ReadFrom: got EOF final block:", len(e.state.filling))
-			}
-			return n, nil
-		case nil:
-		default:
-			if debugEncoder {
-				println("ReadFrom: got error:", err)
-			}
-			e.state.err = err
-			return n, err
-		}
-		if len(src) > 0 {
-			if debugEncoder {
-				println("ReadFrom: got space left in source:", len(src))
-			}
-			continue
-		}
-		err = e.nextBlock(false)
-		if err != nil {
-			return n, err
-		}
-		e.state.filling = e.state.filling[:e.o.blockSize]
-		src = e.state.filling
-	}
-}
-
-// Flush will send the currently written data to output
-// and block until everything has been written.
-// This should only be used on rare occasions where pushing the currently queued data is critical.
-func (e *Encoder) Flush() error {
-	s := &e.state
-	if len(s.filling) > 0 {
-		err := e.nextBlock(false)
-		if err != nil {
-			// Ignore Flush after Close.
-			if errors.Is(s.err, ErrEncoderClosed) {
-				return nil
-			}
-			return err
-		}
-	}
-	s.wg.Wait()
-	s.wWg.Wait()
-	if s.err != nil {
-		// Ignore Flush after Close.
-		if errors.Is(s.err, ErrEncoderClosed) {
-			return nil
-		}
-		return s.err
-	}
-	return s.writeErr
-}
-
-// Close will flush the final output and close the stream.
-// The function will block until everything has been written.
-// The Encoder can still be re-used after calling this.
-func (e *Encoder) Close() error {
-	s := &e.state
-	if s.encoder == nil {
-		return nil
-	}
-	err := e.nextBlock(true)
-	if err != nil {
-		if errors.Is(s.err, ErrEncoderClosed) {
-			return nil
-		}
-		return err
-	}
-	if s.frameContentSize > 0 {
-		if s.nInput != s.frameContentSize {
-			return fmt.Errorf("frame content size %d given, but %d bytes was written", s.frameContentSize, s.nInput)
-		}
-	}
-	if e.state.fullFrameWritten {
-		return s.err
-	}
-	s.wg.Wait()
-	s.wWg.Wait()
-
-	if s.err != nil {
-		return s.err
-	}
-	if s.writeErr != nil {
-		return s.writeErr
-	}
-
-	// Write CRC
-	if e.o.crc && s.err == nil {
-		// heap alloc.
-		var tmp [4]byte
-		_, s.err = s.w.Write(s.encoder.AppendCRC(tmp[:0]))
-		s.nWritten += 4
-	}
-
-	// Add padding with content from crypto/rand.Reader
-	if s.err == nil && e.o.pad > 0 {
-		add := calcSkippableFrame(s.nWritten, int64(e.o.pad))
-		frame, err := skippableFrame(s.filling[:0], add, rand.Reader)
-		if err != nil {
-			return err
-		}
-		_, s.err = s.w.Write(frame)
-	}
-	if s.err == nil {
-		s.err = ErrEncoderClosed
-		return nil
-	}
-
-	return s.err
-}
-
-// EncodeAll will encode all input in src and append it to dst.
-// This function can be called concurrently, but each call will only run on a single goroutine.
-// If empty input is given, nothing is returned, unless WithZeroFrames is specified.
-// Encoded blocks can be concatenated and the result will be the combined input stream.
-// Data compressed with EncodeAll can be decoded with the Decoder,
-// using either a stream or DecodeAll.
-func (e *Encoder) EncodeAll(src, dst []byte) []byte {
-	e.init.Do(e.initialize)
-	enc := <-e.encoders
-	defer func() {
-		e.encoders <- enc
-	}()
-	return e.encodeAll(enc, src, dst)
-}
-
-func (e *Encoder) encodeAll(enc encoder, src, dst []byte) []byte {
-	if len(src) == 0 {
-		if e.o.fullZero {
-			// Add frame header.
-			fh := frameHeader{
-				ContentSize:   0,
-				WindowSize:    MinWindowSize,
-				SingleSegment: true,
-				// Adding a checksum would be a waste of space.
-				Checksum: false,
-				DictID:   0,
-			}
-			dst = fh.appendTo(dst)
-
-			// Write raw block as last one only.
-			var blk blockHeader
-			blk.setSize(0)
-			blk.setType(blockTypeRaw)
-			blk.setLast(true)
-			dst = blk.appendTo(dst)
-		}
-		return dst
-	}
-
-	// Use single segments when above minimum window and below window size.
-	single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
-	if e.o.single != nil {
-		single = *e.o.single
-	}
-	fh := frameHeader{
-		ContentSize:   uint64(len(src)),
-		WindowSize:    uint32(enc.WindowSize(int64(len(src)))),
-		SingleSegment: single,
-		Checksum:      e.o.crc,
-		DictID:        e.o.dict.ID(),
-	}
-
-	// If less than 1MB, allocate a buffer up front.
-	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
-		dst = make([]byte, 0, len(src))
-	}
-	dst = fh.appendTo(dst)
-
-	// If we can do everything in one block, prefer that.
-	if len(src) <= e.o.blockSize {
-		enc.Reset(e.o.dict, true)
-		// Slightly faster with no history and everything in one block.
-		if e.o.crc {
-			_, _ = enc.CRC().Write(src)
-		}
-		blk := enc.Block()
-		blk.last = true
-		if e.o.dict == nil {
-			enc.EncodeNoHist(blk, src)
-		} else {
-			enc.Encode(blk, src)
-		}
-
-		// If we got the exact same number of literals as input,
-		// assume the literals cannot be compressed.
-		oldout := blk.output
-		// Output directly to dst
-		blk.output = dst
-
-		err := blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
-		if err != nil {
-			panic(err)
-		}
-		dst = blk.output
-		blk.output = oldout
-	} else {
-		enc.Reset(e.o.dict, false)
-		blk := enc.Block()
-		for len(src) > 0 {
-			todo := src
-			if len(todo) > e.o.blockSize {
-				todo = todo[:e.o.blockSize]
-			}
-			src = src[len(todo):]
-			if e.o.crc {
-				_, _ = enc.CRC().Write(todo)
-			}
-			blk.pushOffsets()
-			enc.Encode(blk, todo)
-			if len(src) == 0 {
-				blk.last = true
-			}
-			err := blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
-			if err != nil {
-				panic(err)
-			}
-			dst = append(dst, blk.output...)
-			blk.reset(nil)
-		}
-	}
-	if e.o.crc {
-		dst = enc.AppendCRC(dst)
-	}
-	// Add padding with content from crypto/rand.Reader
-	if e.o.pad > 0 {
-		add := calcSkippableFrame(int64(len(dst)), int64(e.o.pad))
-		var err error
-		dst, err = skippableFrame(dst, add, rand.Reader)
-		if err != nil {
-			panic(err)
-		}
-	}
-	return dst
-}
-
-// MaxEncodedSize returns the expected maximum
-// size of an encoded block or stream.
-func (e *Encoder) MaxEncodedSize(size int) int {
-	frameHeader := 4 + 2 // magic + frame header & window descriptor
-	if e.o.dict != nil {
-		frameHeader += 4
-	}
-	// Frame content size:
-	if size < 256 {
-		frameHeader++
-	} else if size < 65536+256 {
-		frameHeader += 2
-	} else if size < math.MaxInt32 {
-		frameHeader += 4
-	} else {
-		frameHeader += 8
-	}
-	// Final crc
-	if e.o.crc {
-		frameHeader += 4
-	}
-
-	// Max overhead is 3 bytes/block.
-	// There cannot be 0 blocks.
-	blocks := (size + e.o.blockSize) / e.o.blockSize
-
-	// Combine, add padding.
-	maxSz := frameHeader + 3*blocks + size
-	if e.o.pad > 1 {
-		maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad))
-	}
-	return maxSz
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
deleted file mode 100644
index 20671dcb9..000000000
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ /dev/null
@@ -1,339 +0,0 @@
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"math"
-	"math/bits"
-	"runtime"
-	"strings"
-)
-
-// EOption is an option for creating a encoder.
-type EOption func(*encoderOptions) error
-
-// options retains accumulated state of multiple options.
-type encoderOptions struct {
-	concurrent      int
-	level           EncoderLevel
-	single          *bool
-	pad             int
-	blockSize       int
-	windowSize      int
-	crc             bool
-	fullZero        bool
-	noEntropy       bool
-	allLitEntropy   bool
-	customWindow    bool
-	customALEntropy bool
-	customBlockSize bool
-	lowMem          bool
-	dict            *dict
-}
-
-func (o *encoderOptions) setDefault() {
-	*o = encoderOptions{
-		concurrent:    runtime.GOMAXPROCS(0),
-		crc:           true,
-		single:        nil,
-		blockSize:     maxCompressedBlockSize,
-		windowSize:    8 << 20,
-		level:         SpeedDefault,
-		allLitEntropy: false,
-		lowMem:        false,
-	}
-}
-
-// encoder returns an encoder with the selected options.
-func (o encoderOptions) encoder() encoder {
-	switch o.level {
-	case SpeedFastest:
-		if o.dict != nil {
-			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
-		}
-		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
-
-	case SpeedDefault:
-		if o.dict != nil {
-			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
-		}
-		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
-	case SpeedBetterCompression:
-		if o.dict != nil {
-			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
-		}
-		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
-	case SpeedBestCompression:
-		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
-	}
-	panic("unknown compression level")
-}
-
-// WithEncoderCRC will add CRC value to output.
-// Output will be 4 bytes larger.
-func WithEncoderCRC(b bool) EOption {
-	return func(o *encoderOptions) error { o.crc = b; return nil }
-}
-
-// WithEncoderConcurrency will set the concurrency,
-// meaning the maximum number of encoders to run concurrently.
-// The value supplied must be at least 1.
-// For streams, setting a value of 1 will disable async compression.
-// By default this will be set to GOMAXPROCS.
-func WithEncoderConcurrency(n int) EOption {
-	return func(o *encoderOptions) error {
-		if n <= 0 {
-			return fmt.Errorf("concurrency must be at least 1")
-		}
-		o.concurrent = n
-		return nil
-	}
-}
-
-// WithWindowSize will set the maximum allowed back-reference distance.
-// The value must be a power of two between MinWindowSize and MaxWindowSize.
-// A larger value will enable better compression but allocate more memory and,
-// for above-default values, take considerably longer.
-// The default value is determined by the compression level and max 8MB.
-func WithWindowSize(n int) EOption {
-	return func(o *encoderOptions) error {
-		switch {
-		case n < MinWindowSize:
-			return fmt.Errorf("window size must be at least %d", MinWindowSize)
-		case n > MaxWindowSize:
-			return fmt.Errorf("window size must be at most %d", MaxWindowSize)
-		case (n & (n - 1)) != 0:
-			return errors.New("window size must be a power of 2")
-		}
-
-		o.windowSize = n
-		o.customWindow = true
-		if o.blockSize > o.windowSize {
-			o.blockSize = o.windowSize
-			o.customBlockSize = true
-		}
-		return nil
-	}
-}
-
-// WithEncoderPadding will add padding to all output so the size will be a multiple of n.
-// This can be used to obfuscate the exact output size or make blocks of a certain size.
-// The contents will be a skippable frame, so it will be invisible by the decoder.
-// n must be > 0 and <= 1GB, 1<<30 bytes.
-// The padded area will be filled with data from crypto/rand.Reader.
-// If `EncodeAll` is used with data already in the destination, the total size will be multiple of this.
-func WithEncoderPadding(n int) EOption {
-	return func(o *encoderOptions) error {
-		if n <= 0 {
-			return fmt.Errorf("padding must be at least 1")
-		}
-		// No need to waste our time.
-		if n == 1 {
-			n = 0
-		}
-		if n > 1<<30 {
-			return fmt.Errorf("padding must less than 1GB (1<<30 bytes) ")
-		}
-		o.pad = n
-		return nil
-	}
-}
-
-// EncoderLevel predefines encoder compression levels.
-// Only use the constants made available, since the actual mapping
-// of these values are very likely to change and your compression could change
-// unpredictably when upgrading the library.
-type EncoderLevel int
-
-const (
-	speedNotSet EncoderLevel = iota
-
-	// SpeedFastest will choose the fastest reasonable compression.
-	// This is roughly equivalent to the fastest Zstandard mode.
-	SpeedFastest
-
-	// SpeedDefault is the default "pretty fast" compression option.
-	// This is roughly equivalent to the default Zstandard mode (level 3).
-	SpeedDefault
-
-	// SpeedBetterCompression will yield better compression than the default.
-	// Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage.
-	// By using this, notice that CPU usage may go up in the future.
-	SpeedBetterCompression
-
-	// SpeedBestCompression will choose the best available compression option.
-	// This will offer the best compression no matter the CPU cost.
-	SpeedBestCompression
-
-	// speedLast should be kept as the last actual compression option.
-	// The is not for external usage, but is used to keep track of the valid options.
-	speedLast
-)
-
-// EncoderLevelFromString will convert a string representation of an encoding level back
-// to a compression level. The compare is not case sensitive.
-// If the string wasn't recognized, (false, SpeedDefault) will be returned.
-func EncoderLevelFromString(s string) (bool, EncoderLevel) {
-	for l := speedNotSet + 1; l < speedLast; l++ {
-		if strings.EqualFold(s, l.String()) {
-			return true, l
-		}
-	}
-	return false, SpeedDefault
-}
-
-// EncoderLevelFromZstd will return an encoder level that closest matches the compression
-// ratio of a specific zstd compression level.
-// Many input values will provide the same compression level.
-func EncoderLevelFromZstd(level int) EncoderLevel {
-	switch {
-	case level < 3:
-		return SpeedFastest
-	case level >= 3 && level < 6:
-		return SpeedDefault
-	case level >= 6 && level < 10:
-		return SpeedBetterCompression
-	default:
-		return SpeedBestCompression
-	}
-}
-
-// String provides a string representation of the compression level.
-func (e EncoderLevel) String() string {
-	switch e {
-	case SpeedFastest:
-		return "fastest"
-	case SpeedDefault:
-		return "default"
-	case SpeedBetterCompression:
-		return "better"
-	case SpeedBestCompression:
-		return "best"
-	default:
-		return "invalid"
-	}
-}
-
-// WithEncoderLevel specifies a predefined compression level.
-func WithEncoderLevel(l EncoderLevel) EOption {
-	return func(o *encoderOptions) error {
-		switch {
-		case l <= speedNotSet || l >= speedLast:
-			return fmt.Errorf("unknown encoder level")
-		}
-		o.level = l
-		if !o.customWindow {
-			switch o.level {
-			case SpeedFastest:
-				o.windowSize = 4 << 20
-				if !o.customBlockSize {
-					o.blockSize = 1 << 16
-				}
-			case SpeedDefault:
-				o.windowSize = 8 << 20
-			case SpeedBetterCompression:
-				o.windowSize = 8 << 20
-			case SpeedBestCompression:
-				o.windowSize = 8 << 20
-			}
-		}
-		if !o.customALEntropy {
-			o.allLitEntropy = l > SpeedDefault
-		}
-
-		return nil
-	}
-}
-
-// WithZeroFrames will encode 0 length input as full frames.
-// This can be needed for compatibility with zstandard usage,
-// but is not needed for this package.
-func WithZeroFrames(b bool) EOption {
-	return func(o *encoderOptions) error {
-		o.fullZero = b
-		return nil
-	}
-}
-
-// WithAllLitEntropyCompression will apply entropy compression if no matches are found.
-// Disabling this will skip incompressible data faster, but in cases with no matches but
-// skewed character distribution compression is lost.
-// Default value depends on the compression level selected.
-func WithAllLitEntropyCompression(b bool) EOption {
-	return func(o *encoderOptions) error {
-		o.customALEntropy = true
-		o.allLitEntropy = b
-		return nil
-	}
-}
-
-// WithNoEntropyCompression will always skip entropy compression of literals.
-// This can be useful if content has matches, but unlikely to benefit from entropy
-// compression. Usually the slight speed improvement is not worth enabling this.
-func WithNoEntropyCompression(b bool) EOption {
-	return func(o *encoderOptions) error {
-		o.noEntropy = b
-		return nil
-	}
-}
-
-// WithSingleSegment will set the "single segment" flag when EncodeAll is used.
-// If this flag is set, data must be regenerated within a single continuous memory segment.
-// In this case, Window_Descriptor byte is skipped, but Frame_Content_Size is necessarily present.
-// As a consequence, the decoder must allocate a memory segment of size equal or larger than size of your content.
-// In order to preserve the decoder from unreasonable memory requirements,
-// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
-// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
-// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
-// This setting has no effect on streamed encodes.
-func WithSingleSegment(b bool) EOption {
-	return func(o *encoderOptions) error {
-		o.single = &b
-		return nil
-	}
-}
-
-// WithLowerEncoderMem will trade in some memory cases trade less memory usage for
-// slower encoding speed.
-// This will not change the window size which is the primary function for reducing
-// memory usage. See WithWindowSize.
-func WithLowerEncoderMem(b bool) EOption {
-	return func(o *encoderOptions) error {
-		o.lowMem = b
-		return nil
-	}
-}
-
-// WithEncoderDict allows to register a dictionary that will be used for the encode.
-//
-// The slice dict must be in the [dictionary format] produced by
-// "zstd --train" from the Zstandard reference implementation.
-//
-// The encoder *may* choose to use no dictionary instead for certain payloads.
-//
-// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
-func WithEncoderDict(dict []byte) EOption {
-	return func(o *encoderOptions) error {
-		d, err := loadDict(dict)
-		if err != nil {
-			return err
-		}
-		o.dict = d
-		return nil
-	}
-}
-
-// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
-//
-// The slice content may contain arbitrary data. It will be used as an initial
-// history.
-func WithEncoderDictRaw(id uint32, content []byte) EOption {
-	return func(o *encoderOptions) error {
-		if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
-			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
-		}
-		o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}
-		return nil
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
deleted file mode 100644
index e47af66e7..000000000
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ /dev/null
@@ -1,415 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"encoding/binary"
-	"encoding/hex"
-	"errors"
-	"io"
-
-	"github.com/klauspost/compress/zstd/internal/xxhash"
-)
-
-type frameDec struct {
-	o   decoderOptions
-	crc *xxhash.Digest
-
-	WindowSize uint64
-
-	// Frame history passed between blocks
-	history history
-
-	rawInput byteBuffer
-
-	// Byte buffer that can be reused for small input blocks.
-	bBuf byteBuf
-
-	FrameContentSize uint64
-
-	DictionaryID  uint32
-	HasCheckSum   bool
-	SingleSegment bool
-}
-
-const (
-	// MinWindowSize is the minimum Window Size, which is 1 KB.
-	MinWindowSize = 1 << 10
-
-	// MaxWindowSize is the maximum encoder window size
-	// and the default decoder maximum window size.
-	MaxWindowSize = 1 << 29
-)
-
-const (
-	frameMagic          = "\x28\xb5\x2f\xfd"
-	skippableFrameMagic = "\x2a\x4d\x18"
-)
-
-func newFrameDec(o decoderOptions) *frameDec {
-	if o.maxWindowSize > o.maxDecodedSize {
-		o.maxWindowSize = o.maxDecodedSize
-	}
-	d := frameDec{
-		o: o,
-	}
-	return &d
-}
-
-// reset will read the frame header and prepare for block decoding.
-// If nothing can be read from the input, io.EOF will be returned.
-// Any other error indicated that the stream contained data, but
-// there was a problem.
-func (d *frameDec) reset(br byteBuffer) error {
-	d.HasCheckSum = false
-	d.WindowSize = 0
-	var signature [4]byte
-	for {
-		var err error
-		// Check if we can read more...
-		b, err := br.readSmall(1)
-		switch err {
-		case io.EOF, io.ErrUnexpectedEOF:
-			return io.EOF
-		case nil:
-			signature[0] = b[0]
-		default:
-			return err
-		}
-		// Read the rest, don't allow io.ErrUnexpectedEOF
-		b, err = br.readSmall(3)
-		switch err {
-		case io.EOF:
-			return io.EOF
-		case nil:
-			copy(signature[1:], b)
-		default:
-			return err
-		}
-
-		if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
-			if debugDecoder {
-				println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
-			}
-			// Break if not skippable frame.
-			break
-		}
-		// Read size to skip
-		b, err = br.readSmall(4)
-		if err != nil {
-			if debugDecoder {
-				println("Reading Frame Size", err)
-			}
-			return err
-		}
-		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
-		println("Skipping frame with", n, "bytes.")
-		err = br.skipN(int64(n))
-		if err != nil {
-			if debugDecoder {
-				println("Reading discarded frame", err)
-			}
-			return err
-		}
-	}
-	if string(signature[:]) != frameMagic {
-		if debugDecoder {
-			println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
-		}
-		return ErrMagicMismatch
-	}
-
-	// Read Frame_Header_Descriptor
-	fhd, err := br.readByte()
-	if err != nil {
-		if debugDecoder {
-			println("Reading Frame_Header_Descriptor", err)
-		}
-		return err
-	}
-	d.SingleSegment = fhd&(1<<5) != 0
-
-	if fhd&(1<<3) != 0 {
-		return errors.New("reserved bit set on frame header")
-	}
-
-	// Read Window_Descriptor
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
-	d.WindowSize = 0
-	if !d.SingleSegment {
-		wd, err := br.readByte()
-		if err != nil {
-			if debugDecoder {
-				println("Reading Window_Descriptor", err)
-			}
-			return err
-		}
-		if debugDecoder {
-			printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3)
-		}
-		windowLog := 10 + (wd >> 3)
-		windowBase := uint64(1) << windowLog
-		windowAdd := (windowBase / 8) * uint64(wd&0x7)
-		d.WindowSize = windowBase + windowAdd
-	}
-
-	// Read Dictionary_ID
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	d.DictionaryID = 0
-	if size := fhd & 3; size != 0 {
-		if size == 3 {
-			size = 4
-		}
-
-		b, err := br.readSmall(int(size))
-		if err != nil {
-			println("Reading Dictionary_ID", err)
-			return err
-		}
-		var id uint32
-		switch len(b) {
-		case 1:
-			id = uint32(b[0])
-		case 2:
-			id = uint32(b[0]) | (uint32(b[1]) << 8)
-		case 4:
-			id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
-		}
-		if debugDecoder {
-			println("Dict size", size, "ID:", id)
-		}
-		d.DictionaryID = id
-	}
-
-	// Read Frame_Content_Size
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size
-	var fcsSize int
-	v := fhd >> 6
-	switch v {
-	case 0:
-		if d.SingleSegment {
-			fcsSize = 1
-		}
-	default:
-		fcsSize = 1 << v
-	}
-	d.FrameContentSize = fcsUnknown
-	if fcsSize > 0 {
-		b, err := br.readSmall(fcsSize)
-		if err != nil {
-			println("Reading Frame content", err)
-			return err
-		}
-		switch len(b) {
-		case 1:
-			d.FrameContentSize = uint64(b[0])
-		case 2:
-			// When FCS_Field_Size is 2, the offset of 256 is added.
-			d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) + 256
-		case 4:
-			d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24)
-		case 8:
-			d1 := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
-			d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
-			d.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
-		}
-		if debugDecoder {
-			println("Read FCS:", d.FrameContentSize)
-		}
-	}
-
-	// Move this to shared.
-	d.HasCheckSum = fhd&(1<<2) != 0
-	if d.HasCheckSum {
-		if d.crc == nil {
-			d.crc = xxhash.New()
-		}
-		d.crc.Reset()
-	}
-
-	if d.WindowSize > d.o.maxWindowSize {
-		if debugDecoder {
-			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
-		}
-		return ErrWindowSizeExceeded
-	}
-
-	if d.WindowSize == 0 && d.SingleSegment {
-		// We may not need window in this case.
-		d.WindowSize = d.FrameContentSize
-		if d.WindowSize < MinWindowSize {
-			d.WindowSize = MinWindowSize
-		}
-		if d.WindowSize > d.o.maxDecodedSize {
-			if debugDecoder {
-				printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
-			}
-			return ErrDecoderSizeExceeded
-		}
-	}
-
-	// The minimum Window_Size is 1 KB.
-	if d.WindowSize < MinWindowSize {
-		if debugDecoder {
-			println("got window size: ", d.WindowSize)
-		}
-		return ErrWindowSizeTooSmall
-	}
-	d.history.windowSize = int(d.WindowSize)
-	if !d.o.lowMem || d.history.windowSize < maxBlockSize {
-		// Alloc 2x window size if not low-mem, or window size below 2MB.
-		d.history.allocFrameBuffer = d.history.windowSize * 2
-	} else {
-		if d.o.lowMem {
-			// Alloc with 1MB extra.
-			d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2
-		} else {
-			// Alloc with 2MB extra.
-			d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
-		}
-	}
-
-	if debugDecoder {
-		println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum)
-	}
-
-	// history contains input - maybe we do something
-	d.rawInput = br
-	return nil
-}
-
-// next will start decoding the next block from stream.
-func (d *frameDec) next(block *blockDec) error {
-	if debugDecoder {
-		println("decoding new block")
-	}
-	err := block.reset(d.rawInput, d.WindowSize)
-	if err != nil {
-		println("block error:", err)
-		// Signal the frame decoder we have a problem.
-		block.sendErr(err)
-		return err
-	}
-	return nil
-}
-
-// checkCRC will check the checksum, assuming the frame has one.
-// Will return ErrCRCMismatch if crc check failed, otherwise nil.
-func (d *frameDec) checkCRC() error {
-	// We can overwrite upper tmp now
-	buf, err := d.rawInput.readSmall(4)
-	if err != nil {
-		println("CRC missing?", err)
-		return err
-	}
-
-	want := binary.LittleEndian.Uint32(buf[:4])
-	got := uint32(d.crc.Sum64())
-
-	if got != want {
-		if debugDecoder {
-			printf("CRC check failed: got %08x, want %08x\n", got, want)
-		}
-		return ErrCRCMismatch
-	}
-	if debugDecoder {
-		printf("CRC ok %08x\n", got)
-	}
-	return nil
-}
-
-// consumeCRC skips over the checksum, assuming the frame has one.
-func (d *frameDec) consumeCRC() error {
-	_, err := d.rawInput.readSmall(4)
-	if err != nil {
-		println("CRC missing?", err)
-	}
-	return err
-}
-
-// runDecoder will run the decoder for the remainder of the frame.
-func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
-	saved := d.history.b
-
-	// We use the history for output to avoid copying it.
-	d.history.b = dst
-	d.history.ignoreBuffer = len(dst)
-	// Store input length, so we only check new data.
-	crcStart := len(dst)
-	d.history.decoders.maxSyncLen = 0
-	if d.o.limitToCap {
-		d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
-	}
-	if d.FrameContentSize != fcsUnknown {
-		if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
-			d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
-		}
-		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
-			if debugDecoder {
-				println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
-			}
-			return dst, ErrDecoderSizeExceeded
-		}
-		if debugDecoder {
-			println("maxSyncLen:", d.history.decoders.maxSyncLen)
-		}
-		if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
-			// Alloc for output
-			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
-			copy(dst2, dst)
-			dst = dst2
-		}
-	}
-	var err error
-	for {
-		err = dec.reset(d.rawInput, d.WindowSize)
-		if err != nil {
-			break
-		}
-		if debugDecoder {
-			println("next block:", dec)
-		}
-		err = dec.decodeBuf(&d.history)
-		if err != nil {
-			break
-		}
-		if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
-			println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
-			err = ErrDecoderSizeExceeded
-			break
-		}
-		if d.o.limitToCap && len(d.history.b) > cap(dst) {
-			println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
-			err = ErrDecoderSizeExceeded
-			break
-		}
-		if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
-			println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
-			err = ErrFrameSizeExceeded
-			break
-		}
-		if dec.Last {
-			break
-		}
-		if debugDecoder {
-			println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
-		}
-	}
-	dst = d.history.b
-	if err == nil {
-		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
-			err = ErrFrameSizeMismatch
-		} else if d.HasCheckSum {
-			if d.o.ignoreChecksum {
-				err = d.consumeCRC()
-			} else {
-				d.crc.Write(dst[crcStart:])
-				err = d.checkCRC()
-			}
-		}
-	}
-	d.history.b = saved
-	return dst, err
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go
deleted file mode 100644
index 667ca0679..000000000
--- a/vendor/github.com/klauspost/compress/zstd/frameenc.go
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"encoding/binary"
-	"fmt"
-	"io"
-	"math"
-	"math/bits"
-)
-
-type frameHeader struct {
-	ContentSize   uint64
-	WindowSize    uint32
-	SingleSegment bool
-	Checksum      bool
-	DictID        uint32
-}
-
-const maxHeaderSize = 14
-
-func (f frameHeader) appendTo(dst []byte) []byte {
-	dst = append(dst, frameMagic...)
-	var fhd uint8
-	if f.Checksum {
-		fhd |= 1 << 2
-	}
-	if f.SingleSegment {
-		fhd |= 1 << 5
-	}
-
-	var dictIDContent []byte
-	if f.DictID > 0 {
-		var tmp [4]byte
-		if f.DictID < 256 {
-			fhd |= 1
-			tmp[0] = uint8(f.DictID)
-			dictIDContent = tmp[:1]
-		} else if f.DictID < 1<<16 {
-			fhd |= 2
-			binary.LittleEndian.PutUint16(tmp[:2], uint16(f.DictID))
-			dictIDContent = tmp[:2]
-		} else {
-			fhd |= 3
-			binary.LittleEndian.PutUint32(tmp[:4], f.DictID)
-			dictIDContent = tmp[:4]
-		}
-	}
-	var fcs uint8
-	if f.ContentSize >= 256 {
-		fcs++
-	}
-	if f.ContentSize >= 65536+256 {
-		fcs++
-	}
-	if f.ContentSize >= 0xffffffff {
-		fcs++
-	}
-
-	fhd |= fcs << 6
-
-	dst = append(dst, fhd)
-	if !f.SingleSegment {
-		const winLogMin = 10
-		windowLog := (bits.Len32(f.WindowSize-1) - winLogMin) << 3
-		dst = append(dst, uint8(windowLog))
-	}
-	if f.DictID > 0 {
-		dst = append(dst, dictIDContent...)
-	}
-	switch fcs {
-	case 0:
-		if f.SingleSegment {
-			dst = append(dst, uint8(f.ContentSize))
-		}
-		// Unless SingleSegment is set, framessizes < 256 are not stored.
-	case 1:
-		f.ContentSize -= 256
-		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8))
-	case 2:
-		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8), uint8(f.ContentSize>>16), uint8(f.ContentSize>>24))
-	case 3:
-		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8), uint8(f.ContentSize>>16), uint8(f.ContentSize>>24),
-			uint8(f.ContentSize>>32), uint8(f.ContentSize>>40), uint8(f.ContentSize>>48), uint8(f.ContentSize>>56))
-	default:
-		panic("invalid fcs")
-	}
-	return dst
-}
-
-const skippableFrameHeader = 4 + 4
-
-// calcSkippableFrame will return a total size to be added for written
-// to be divisible by multiple.
-// The value will always be > skippableFrameHeader.
-// The function will panic if written < 0 or wantMultiple <= 0.
-func calcSkippableFrame(written, wantMultiple int64) int {
-	if wantMultiple <= 0 {
-		panic("wantMultiple <= 0")
-	}
-	if written < 0 {
-		panic("written < 0")
-	}
-	leftOver := written % wantMultiple
-	if leftOver == 0 {
-		return 0
-	}
-	toAdd := wantMultiple - leftOver
-	for toAdd < skippableFrameHeader {
-		toAdd += wantMultiple
-	}
-	return int(toAdd)
-}
-
-// skippableFrame will add a skippable frame with a total size of bytes.
-// total should be >= skippableFrameHeader and < math.MaxUint32.
-func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
-	if total == 0 {
-		return dst, nil
-	}
-	if total < skippableFrameHeader {
-		return dst, fmt.Errorf("requested skippable frame (%d) < 8", total)
-	}
-	if int64(total) > math.MaxUint32 {
-		return dst, fmt.Errorf("requested skippable frame (%d) > max uint32", total)
-	}
-	dst = append(dst, 0x50, 0x2a, 0x4d, 0x18)
-	f := uint32(total - skippableFrameHeader)
-	dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16), uint8(f>>24))
-	start := len(dst)
-	dst = append(dst, make([]byte, f)...)
-	_, err := io.ReadFull(r, dst[start:])
-	return dst, err
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
deleted file mode 100644
index 2f8860a72..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ /dev/null
@@ -1,307 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-)
-
-const (
-	tablelogAbsoluteMax = 9
-)
-
-const (
-	/*!MEMORY_USAGE :
-	 *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
-	 *  Increasing memory usage improves compression ratio
-	 *  Reduced memory usage can improve speed, due to cache effect
-	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-	maxMemoryUsage = tablelogAbsoluteMax + 2
-
-	maxTableLog    = maxMemoryUsage - 2
-	maxTablesize   = 1 << maxTableLog
-	maxTableMask   = (1 << maxTableLog) - 1
-	minTablelog    = 5
-	maxSymbolValue = 255
-)
-
-// fseDecoder provides temporary storage for compression and decompression.
-type fseDecoder struct {
-	dt             [maxTablesize]decSymbol // Decompression table.
-	symbolLen      uint16                  // Length of active part of the symbol table.
-	actualTableLog uint8                   // Selected tablelog.
-	maxBits        uint8                   // Maximum number of additional bits
-
-	// used for table creation to avoid allocations.
-	stateTable [256]uint16
-	norm       [maxSymbolValue + 1]int16
-	preDefined bool
-}
-
-// tableStep returns the next table index.
-func tableStep(tableSize uint32) uint32 {
-	return (tableSize >> 1) + (tableSize >> 3) + 3
-}
-
-// readNCount will read the symbol distribution so decoding tables can be constructed.
-func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
-	var (
-		charnum   uint16
-		previous0 bool
-	)
-	if b.remain() < 4 {
-		return errors.New("input too small")
-	}
-	bitStream := b.Uint32NC()
-	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
-	if nbBits > tablelogAbsoluteMax {
-		println("Invalid tablelog:", nbBits)
-		return errors.New("tableLog too large")
-	}
-	bitStream >>= 4
-	bitCount := uint(4)
-
-	s.actualTableLog = uint8(nbBits)
-	remaining := int32((1 << nbBits) + 1)
-	threshold := int32(1 << nbBits)
-	gotTotal := int32(0)
-	nbBits++
-
-	for remaining > 1 && charnum <= maxSymbol {
-		if previous0 {
-			//println("prev0")
-			n0 := charnum
-			for (bitStream & 0xFFFF) == 0xFFFF {
-				//println("24 x 0")
-				n0 += 24
-				if r := b.remain(); r > 5 {
-					b.advance(2)
-					// The check above should make sure we can read 32 bits
-					bitStream = b.Uint32NC() >> bitCount
-				} else {
-					// end of bit stream
-					bitStream >>= 16
-					bitCount += 16
-				}
-			}
-			//printf("bitstream: %d, 0b%b", bitStream&3, bitStream)
-			for (bitStream & 3) == 3 {
-				n0 += 3
-				bitStream >>= 2
-				bitCount += 2
-			}
-			n0 += uint16(bitStream & 3)
-			bitCount += 2
-
-			if n0 > maxSymbolValue {
-				return errors.New("maxSymbolValue too small")
-			}
-			//println("inserting ", n0-charnum, "zeroes from idx", charnum, "ending before", n0)
-			for charnum < n0 {
-				s.norm[uint8(charnum)] = 0
-				charnum++
-			}
-
-			if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
-				b.advance(bitCount >> 3)
-				bitCount &= 7
-				// The check above should make sure we can read 32 bits
-				bitStream = b.Uint32NC() >> bitCount
-			} else {
-				bitStream >>= 2
-			}
-		}
-
-		max := (2*threshold - 1) - remaining
-		var count int32
-
-		if int32(bitStream)&(threshold-1) < max {
-			count = int32(bitStream) & (threshold - 1)
-			if debugAsserts && nbBits < 1 {
-				panic("nbBits underflow")
-			}
-			bitCount += nbBits - 1
-		} else {
-			count = int32(bitStream) & (2*threshold - 1)
-			if count >= threshold {
-				count -= max
-			}
-			bitCount += nbBits
-		}
-
-		// extra accuracy
-		count--
-		if count < 0 {
-			// -1 means +1
-			remaining += count
-			gotTotal -= count
-		} else {
-			remaining -= count
-			gotTotal += count
-		}
-		s.norm[charnum&0xff] = int16(count)
-		charnum++
-		previous0 = count == 0
-		for remaining < threshold {
-			nbBits--
-			threshold >>= 1
-		}
-
-		if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
-			b.advance(bitCount >> 3)
-			bitCount &= 7
-			// The check above should make sure we can read 32 bits
-			bitStream = b.Uint32NC() >> (bitCount & 31)
-		} else {
-			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
-			b.off = len(b.b) - 4
-			bitStream = b.Uint32() >> (bitCount & 31)
-		}
-	}
-	s.symbolLen = charnum
-	if s.symbolLen <= 1 {
-		return fmt.Errorf("symbolLen (%d) too small", s.symbolLen)
-	}
-	if s.symbolLen > maxSymbolValue+1 {
-		return fmt.Errorf("symbolLen (%d) too big", s.symbolLen)
-	}
-	if remaining != 1 {
-		return fmt.Errorf("corruption detected (remaining %d != 1)", remaining)
-	}
-	if bitCount > 32 {
-		return fmt.Errorf("corruption detected (bitCount %d > 32)", bitCount)
-	}
-	if gotTotal != 1<<s.actualTableLog {
-		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
-	}
-	b.advance((bitCount + 7) >> 3)
-	return s.buildDtable()
-}
-
-func (s *fseDecoder) mustReadFrom(r io.Reader) {
-	fatalErr := func(err error) {
-		if err != nil {
-			panic(err)
-		}
-	}
-	// 	dt             [maxTablesize]decSymbol // Decompression table.
-	//	symbolLen      uint16                  // Length of active part of the symbol table.
-	//	actualTableLog uint8                   // Selected tablelog.
-	//	maxBits        uint8                   // Maximum number of additional bits
-	//	// used for table creation to avoid allocations.
-	//	stateTable [256]uint16
-	//	norm       [maxSymbolValue + 1]int16
-	//	preDefined bool
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
-	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
-}
-
-// decSymbol contains information about a state entry,
-// Including the state offset base, the output symbol and
-// the number of bits to read for the low part of the destination state.
-// Using a composite uint64 is faster than a struct with separate members.
-type decSymbol uint64
-
-func newDecSymbol(nbits, addBits uint8, newState uint16, baseline uint32) decSymbol {
-	return decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
-}
-
-func (d decSymbol) nbBits() uint8 {
-	return uint8(d)
-}
-
-func (d decSymbol) addBits() uint8 {
-	return uint8(d >> 8)
-}
-
-func (d decSymbol) newState() uint16 {
-	return uint16(d >> 16)
-}
-
-func (d decSymbol) baselineInt() int {
-	return int(d >> 32)
-}
-
-func (d *decSymbol) setNBits(nBits uint8) {
-	const mask = 0xffffffffffffff00
-	*d = (*d & mask) | decSymbol(nBits)
-}
-
-func (d *decSymbol) setAddBits(addBits uint8) {
-	const mask = 0xffffffffffff00ff
-	*d = (*d & mask) | (decSymbol(addBits) << 8)
-}
-
-func (d *decSymbol) setNewState(state uint16) {
-	const mask = 0xffffffff0000ffff
-	*d = (*d & mask) | decSymbol(state)<<16
-}
-
-func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
-	const mask = 0xffff00ff
-	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
-}
-
-// decSymbolValue returns the transformed decSymbol for the given symbol.
-func decSymbolValue(symb uint8, t []baseOffset) (decSymbol, error) {
-	if int(symb) >= len(t) {
-		return 0, fmt.Errorf("rle symbol %d >= max %d", symb, len(t))
-	}
-	lu := t[symb]
-	return newDecSymbol(0, lu.addBits, 0, lu.baseLine), nil
-}
-
-// setRLE will set the decoder til RLE mode.
-func (s *fseDecoder) setRLE(symbol decSymbol) {
-	s.actualTableLog = 0
-	s.maxBits = symbol.addBits()
-	s.dt[0] = symbol
-}
-
-// transform will transform the decoder table into a table usable for
-// decoding without having to apply the transformation while decoding.
-// The state will contain the base value and the number of bits to read.
-func (s *fseDecoder) transform(t []baseOffset) error {
-	tableSize := uint16(1 << s.actualTableLog)
-	s.maxBits = 0
-	for i, v := range s.dt[:tableSize] {
-		add := v.addBits()
-		if int(add) >= len(t) {
-			return fmt.Errorf("invalid decoding table entry %d, symbol %d >= max (%d)", i, v.addBits(), len(t))
-		}
-		lu := t[add]
-		if lu.addBits > s.maxBits {
-			s.maxBits = lu.addBits
-		}
-		v.setExt(lu.addBits, lu.baseLine)
-		s.dt[i] = v
-	}
-	return nil
-}
-
-type fseState struct {
-	dt    []decSymbol
-	state decSymbol
-}
-
-// Initialize and decodeAsync first state and symbol.
-func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
-	s.dt = dt
-	br.fill()
-	s.state = dt[br.getBits(tableLog)]
-}
-
-// final returns the current state symbol without decoding the next.
-func (s decSymbol) final() (int, uint8) {
-	return s.baselineInt(), s.addBits()
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
deleted file mode 100644
index d04a829b0..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
+++ /dev/null
@@ -1,65 +0,0 @@
-//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
-
-package zstd
-
-import (
-	"fmt"
-)
-
-type buildDtableAsmContext struct {
-	// inputs
-	stateTable *uint16
-	norm       *int16
-	dt         *uint64
-
-	// outputs --- set by the procedure in the case of error;
-	// for interpretation please see the error handling part below
-	errParam1 uint64
-	errParam2 uint64
-}
-
-// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
-// Function returns non-zero exit code on error.
-//
-//go:noescape
-func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
-
-// please keep in sync with _generate/gen_fse.go
-const (
-	errorCorruptedNormalizedCounter = 1
-	errorNewStateTooBig             = 2
-	errorNewStateNoBits             = 3
-)
-
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error {
-	ctx := buildDtableAsmContext{
-		stateTable: &s.stateTable[0],
-		norm:       &s.norm[0],
-		dt:         (*uint64)(&s.dt[0]),
-	}
-	code := buildDtable_asm(s, &ctx)
-
-	if code != 0 {
-		switch code {
-		case errorCorruptedNormalizedCounter:
-			position := ctx.errParam1
-			return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
-
-		case errorNewStateTooBig:
-			newState := decSymbol(ctx.errParam1)
-			size := ctx.errParam2
-			return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
-
-		case errorNewStateNoBits:
-			newState := decSymbol(ctx.errParam1)
-			oldState := decSymbol(ctx.errParam2)
-			return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
-
-		default:
-			return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
-		}
-	}
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
deleted file mode 100644
index bcde39869..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
+++ /dev/null
@@ -1,126 +0,0 @@
-// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
-
-//go:build !appengine && !noasm && gc && !noasm
-
-// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
-TEXT ·buildDtable_asm(SB), $0-24
-	MOVQ ctx+8(FP), CX
-	MOVQ s+0(FP), DI
-
-	// Load values
-	MOVBQZX 4098(DI), DX
-	XORQ    AX, AX
-	BTSQ    DX, AX
-	MOVQ    (CX), BX
-	MOVQ    16(CX), SI
-	LEAQ    -1(AX), R8
-	MOVQ    8(CX), CX
-	MOVWQZX 4096(DI), DI
-
-	// End load values
-	// Init, lay down lowprob symbols
-	XORQ R9, R9
-	JMP  init_main_loop_condition
-
-init_main_loop:
-	MOVWQSX (CX)(R9*2), R10
-	CMPW    R10, $-1
-	JNE     do_not_update_high_threshold
-	MOVB    R9, 1(SI)(R8*8)
-	DECQ    R8
-	MOVQ    $0x0000000000000001, R10
-
-do_not_update_high_threshold:
-	MOVW R10, (BX)(R9*2)
-	INCQ R9
-
-init_main_loop_condition:
-	CMPQ R9, DI
-	JL   init_main_loop
-
-	// Spread symbols
-	// Calculate table step
-	MOVQ AX, R9
-	SHRQ $0x01, R9
-	MOVQ AX, R10
-	SHRQ $0x03, R10
-	LEAQ 3(R9)(R10*1), R9
-
-	// Fill add bits values
-	LEAQ -1(AX), R10
-	XORQ R11, R11
-	XORQ R12, R12
-	JMP  spread_main_loop_condition
-
-spread_main_loop:
-	XORQ    R13, R13
-	MOVWQSX (CX)(R12*2), R14
-	JMP     spread_inner_loop_condition
-
-spread_inner_loop:
-	MOVB R12, 1(SI)(R11*8)
-
-adjust_position:
-	ADDQ R9, R11
-	ANDQ R10, R11
-	CMPQ R11, R8
-	JG   adjust_position
-	INCQ R13
-
-spread_inner_loop_condition:
-	CMPQ R13, R14
-	JL   spread_inner_loop
-	INCQ R12
-
-spread_main_loop_condition:
-	CMPQ  R12, DI
-	JL    spread_main_loop
-	TESTQ R11, R11
-	JZ    spread_check_ok
-	MOVQ  ctx+8(FP), AX
-	MOVQ  R11, 24(AX)
-	MOVQ  $+1, ret+16(FP)
-	RET
-
-spread_check_ok:
-	// Build Decoding table
-	XORQ DI, DI
-
-build_table_main_table:
-	MOVBQZX 1(SI)(DI*8), CX
-	MOVWQZX (BX)(CX*2), R8
-	LEAQ    1(R8), R9
-	MOVW    R9, (BX)(CX*2)
-	MOVQ    R8, R9
-	BSRQ    R9, R9
-	MOVQ    DX, CX
-	SUBQ    R9, CX
-	SHLQ    CL, R8
-	SUBQ    AX, R8
-	MOVB    CL, (SI)(DI*8)
-	MOVW    R8, 2(SI)(DI*8)
-	CMPQ    R8, AX
-	JLE     build_table_check1_ok
-	MOVQ    ctx+8(FP), CX
-	MOVQ    R8, 24(CX)
-	MOVQ    AX, 32(CX)
-	MOVQ    $+2, ret+16(FP)
-	RET
-
-build_table_check1_ok:
-	TESTB CL, CL
-	JNZ   build_table_check2_ok
-	CMPW  R8, DI
-	JNE   build_table_check2_ok
-	MOVQ  ctx+8(FP), AX
-	MOVQ  R8, 24(AX)
-	MOVQ  DI, 32(AX)
-	MOVQ  $+3, ret+16(FP)
-	RET
-
-build_table_check2_ok:
-	INCQ DI
-	CMPQ DI, AX
-	JL   build_table_main_table
-	MOVQ $+0, ret+16(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
deleted file mode 100644
index 8adfebb02..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
+++ /dev/null
@@ -1,73 +0,0 @@
-//go:build !amd64 || appengine || !gc || noasm
-// +build !amd64 appengine !gc noasm
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-)
-
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error {
-	tableSize := uint32(1 << s.actualTableLog)
-	highThreshold := tableSize - 1
-	symbolNext := s.stateTable[:256]
-
-	// Init, lay down lowprob symbols
-	{
-		for i, v := range s.norm[:s.symbolLen] {
-			if v == -1 {
-				s.dt[highThreshold].setAddBits(uint8(i))
-				highThreshold--
-				v = 1
-			}
-			symbolNext[i] = uint16(v)
-		}
-	}
-
-	// Spread symbols
-	{
-		tableMask := tableSize - 1
-		step := tableStep(tableSize)
-		position := uint32(0)
-		for ss, v := range s.norm[:s.symbolLen] {
-			for i := 0; i < int(v); i++ {
-				s.dt[position].setAddBits(uint8(ss))
-				for {
-					// lowprob area
-					position = (position + step) & tableMask
-					if position <= highThreshold {
-						break
-					}
-				}
-			}
-		}
-		if position != 0 {
-			// position must reach all cells once, otherwise normalizedCounter is incorrect
-			return errors.New("corrupted input (position != 0)")
-		}
-	}
-
-	// Build Decoding table
-	{
-		tableSize := uint16(1 << s.actualTableLog)
-		for u, v := range s.dt[:tableSize] {
-			symbol := v.addBits()
-			nextState := symbolNext[symbol]
-			symbolNext[symbol] = nextState + 1
-			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
-			s.dt[u&maxTableMask].setNBits(nBits)
-			newState := (nextState << nBits) - tableSize
-			if newState > tableSize {
-				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
-			}
-			if newState == uint16(u) && nBits == 0 {
-				// Seems weird that this is possible with nbits > 0.
-				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
-			}
-			s.dt[u&maxTableMask].setNewState(newState)
-		}
-	}
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
deleted file mode 100644
index ab26326a8..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ /dev/null
@@ -1,701 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"math"
-)
-
-const (
-	// For encoding we only support up to
-	maxEncTableLog    = 8
-	maxEncTablesize   = 1 << maxTableLog
-	maxEncTableMask   = (1 << maxTableLog) - 1
-	minEncTablelog    = 5
-	maxEncSymbolValue = maxMatchLengthSymbol
-)
-
-// Scratch provides temporary storage for compression and decompression.
-type fseEncoder struct {
-	symbolLen      uint16 // Length of active part of the symbol table.
-	actualTableLog uint8  // Selected tablelog.
-	ct             cTable // Compression tables.
-	maxCount       int    // count of the most probable symbol
-	zeroBits       bool   // no bits has prob > 50%.
-	clearCount     bool   // clear count
-	useRLE         bool   // This encoder is for RLE
-	preDefined     bool   // This encoder is predefined.
-	reUsed         bool   // Set to know when the encoder has been reused.
-	rleVal         uint8  // RLE Symbol
-	maxBits        uint8  // Maximum output bits after transform.
-
-	// TODO: Technically zstd should be fine with 64 bytes.
-	count [256]uint32
-	norm  [256]int16
-}
-
-// cTable contains tables used for compression.
-type cTable struct {
-	tableSymbol []byte
-	stateTable  []uint16
-	symbolTT    []symbolTransform
-}
-
-// symbolTransform contains the state transform for a symbol.
-type symbolTransform struct {
-	deltaNbBits    uint32
-	deltaFindState int16
-	outBits        uint8
-}
-
-// String prints values as a human readable string.
-func (s symbolTransform) String() string {
-	return fmt.Sprintf("{deltabits: %08x, findstate:%d outbits:%d}", s.deltaNbBits, s.deltaFindState, s.outBits)
-}
-
-// Histogram allows to populate the histogram and skip that step in the compression,
-// It otherwise allows to inspect the histogram when compression is done.
-// To indicate that you have populated the histogram call HistogramFinished
-// with the value of the highest populated symbol, as well as the number of entries
-// in the most populated entry. These are accepted at face value.
-func (s *fseEncoder) Histogram() *[256]uint32 {
-	return &s.count
-}
-
-// HistogramFinished can be called to indicate that the histogram has been populated.
-// maxSymbol is the index of the highest set symbol of the next data segment.
-// maxCount is the number of entries in the most populated entry.
-// These are accepted at face value.
-func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
-	s.maxCount = maxCount
-	s.symbolLen = uint16(maxSymbol) + 1
-	s.clearCount = maxCount != 0
-}
-
-// allocCtable will allocate tables needed for compression.
-// If existing tables a re big enough, they are simply re-used.
-func (s *fseEncoder) allocCtable() {
-	tableSize := 1 << s.actualTableLog
-	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < tableSize {
-		s.ct.tableSymbol = make([]byte, tableSize)
-	}
-	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
-
-	ctSize := tableSize
-	if cap(s.ct.stateTable) < ctSize {
-		s.ct.stateTable = make([]uint16, ctSize)
-	}
-	s.ct.stateTable = s.ct.stateTable[:ctSize]
-
-	if cap(s.ct.symbolTT) < 256 {
-		s.ct.symbolTT = make([]symbolTransform, 256)
-	}
-	s.ct.symbolTT = s.ct.symbolTT[:256]
-}
-
-// buildCTable will populate the compression table so it is ready to be used.
-func (s *fseEncoder) buildCTable() error {
-	tableSize := uint32(1 << s.actualTableLog)
-	highThreshold := tableSize - 1
-	var cumul [256]int16
-
-	s.allocCtable()
-	tableSymbol := s.ct.tableSymbol[:tableSize]
-	// symbol start positions
-	{
-		cumul[0] = 0
-		for ui, v := range s.norm[:s.symbolLen-1] {
-			u := byte(ui) // one less than reference
-			if v == -1 {
-				// Low proba symbol
-				cumul[u+1] = cumul[u] + 1
-				tableSymbol[highThreshold] = u
-				highThreshold--
-			} else {
-				cumul[u+1] = cumul[u] + v
-			}
-		}
-		// Encode last symbol separately to avoid overflowing u
-		u := int(s.symbolLen - 1)
-		v := s.norm[s.symbolLen-1]
-		if v == -1 {
-			// Low proba symbol
-			cumul[u+1] = cumul[u] + 1
-			tableSymbol[highThreshold] = byte(u)
-			highThreshold--
-		} else {
-			cumul[u+1] = cumul[u] + v
-		}
-		if uint32(cumul[s.symbolLen]) != tableSize {
-			return fmt.Errorf("internal error: expected cumul[s.symbolLen] (%d) == tableSize (%d)", cumul[s.symbolLen], tableSize)
-		}
-		cumul[s.symbolLen] = int16(tableSize) + 1
-	}
-	// Spread symbols
-	s.zeroBits = false
-	{
-		step := tableStep(tableSize)
-		tableMask := tableSize - 1
-		var position uint32
-		// if any symbol > largeLimit, we may have 0 bits output.
-		largeLimit := int16(1 << (s.actualTableLog - 1))
-		for ui, v := range s.norm[:s.symbolLen] {
-			symbol := byte(ui)
-			if v > largeLimit {
-				s.zeroBits = true
-			}
-			for nbOccurrences := int16(0); nbOccurrences < v; nbOccurrences++ {
-				tableSymbol[position] = symbol
-				position = (position + step) & tableMask
-				for position > highThreshold {
-					position = (position + step) & tableMask
-				} /* Low proba area */
-			}
-		}
-
-		// Check if we have gone through all positions
-		if position != 0 {
-			return errors.New("position!=0")
-		}
-	}
-
-	// Build table
-	table := s.ct.stateTable
-	{
-		tsi := int(tableSize)
-		for u, v := range tableSymbol {
-			// TableU16 : sorted by symbol order; gives next state value
-			table[cumul[v]] = uint16(tsi + u)
-			cumul[v]++
-		}
-	}
-
-	// Build Symbol Transformation Table
-	{
-		total := int16(0)
-		symbolTT := s.ct.symbolTT[:s.symbolLen]
-		tableLog := s.actualTableLog
-		tl := (uint32(tableLog) << 16) - (1 << tableLog)
-		for i, v := range s.norm[:s.symbolLen] {
-			switch v {
-			case 0:
-			case -1, 1:
-				symbolTT[i].deltaNbBits = tl
-				symbolTT[i].deltaFindState = total - 1
-				total++
-			default:
-				maxBitsOut := uint32(tableLog) - highBit(uint32(v-1))
-				minStatePlus := uint32(v) << maxBitsOut
-				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
-				symbolTT[i].deltaFindState = total - v
-				total += v
-			}
-		}
-		if total != int16(tableSize) {
-			return fmt.Errorf("total mismatch %d (got) != %d (want)", total, tableSize)
-		}
-	}
-	return nil
-}
-
-var rtbTable = [...]uint32{0, 473195, 504333, 520860, 550000, 700000, 750000, 830000}
-
-func (s *fseEncoder) setRLE(val byte) {
-	s.allocCtable()
-	s.actualTableLog = 0
-	s.ct.stateTable = s.ct.stateTable[:1]
-	s.ct.symbolTT[val] = symbolTransform{
-		deltaFindState: 0,
-		deltaNbBits:    0,
-	}
-	if debugEncoder {
-		println("setRLE: val", val, "symbolTT", s.ct.symbolTT[val])
-	}
-	s.rleVal = val
-	s.useRLE = true
-}
-
-// setBits will set output bits for the transform.
-// if nil is provided, the number of bits is equal to the index.
-func (s *fseEncoder) setBits(transform []byte) {
-	if s.reUsed || s.preDefined {
-		return
-	}
-	if s.useRLE {
-		if transform == nil {
-			s.ct.symbolTT[s.rleVal].outBits = s.rleVal
-			s.maxBits = s.rleVal
-			return
-		}
-		s.maxBits = transform[s.rleVal]
-		s.ct.symbolTT[s.rleVal].outBits = s.maxBits
-		return
-	}
-	if transform == nil {
-		for i := range s.ct.symbolTT[:s.symbolLen] {
-			s.ct.symbolTT[i].outBits = uint8(i)
-		}
-		s.maxBits = uint8(s.symbolLen - 1)
-		return
-	}
-	s.maxBits = 0
-	for i, v := range transform[:s.symbolLen] {
-		s.ct.symbolTT[i].outBits = v
-		if v > s.maxBits {
-			// We could assume bits always going up, but we play safe.
-			s.maxBits = v
-		}
-	}
-}
-
-// normalizeCount will normalize the count of the symbols so
-// the total is equal to the table size.
-// If successful, compression tables will also be made ready.
-func (s *fseEncoder) normalizeCount(length int) error {
-	if s.reUsed {
-		return nil
-	}
-	s.optimalTableLog(length)
-	var (
-		tableLog          = s.actualTableLog
-		scale             = 62 - uint64(tableLog)
-		step              = (1 << 62) / uint64(length)
-		vStep             = uint64(1) << (scale - 20)
-		stillToDistribute = int16(1 << tableLog)
-		largest           int
-		largestP          int16
-		lowThreshold      = (uint32)(length >> tableLog)
-	)
-	if s.maxCount == length {
-		s.useRLE = true
-		return nil
-	}
-	s.useRLE = false
-	for i, cnt := range s.count[:s.symbolLen] {
-		// already handled
-		// if (count[s] == s.length) return 0;   /* rle special case */
-
-		if cnt == 0 {
-			s.norm[i] = 0
-			continue
-		}
-		if cnt <= lowThreshold {
-			s.norm[i] = -1
-			stillToDistribute--
-		} else {
-			proba := (int16)((uint64(cnt) * step) >> scale)
-			if proba < 8 {
-				restToBeat := vStep * uint64(rtbTable[proba])
-				v := uint64(cnt)*step - (uint64(proba) << scale)
-				if v > restToBeat {
-					proba++
-				}
-			}
-			if proba > largestP {
-				largestP = proba
-				largest = i
-			}
-			s.norm[i] = proba
-			stillToDistribute -= proba
-		}
-	}
-
-	if -stillToDistribute >= (s.norm[largest] >> 1) {
-		// corner case, need another normalization method
-		err := s.normalizeCount2(length)
-		if err != nil {
-			return err
-		}
-		if debugAsserts {
-			err = s.validateNorm()
-			if err != nil {
-				return err
-			}
-		}
-		return s.buildCTable()
-	}
-	s.norm[largest] += stillToDistribute
-	if debugAsserts {
-		err := s.validateNorm()
-		if err != nil {
-			return err
-		}
-	}
-	return s.buildCTable()
-}
-
-// Secondary normalization method.
-// To be used when primary method fails.
-func (s *fseEncoder) normalizeCount2(length int) error {
-	const notYetAssigned = -2
-	var (
-		distributed  uint32
-		total        = uint32(length)
-		tableLog     = s.actualTableLog
-		lowThreshold = total >> tableLog
-		lowOne       = (total * 3) >> (tableLog + 1)
-	)
-	for i, cnt := range s.count[:s.symbolLen] {
-		if cnt == 0 {
-			s.norm[i] = 0
-			continue
-		}
-		if cnt <= lowThreshold {
-			s.norm[i] = -1
-			distributed++
-			total -= cnt
-			continue
-		}
-		if cnt <= lowOne {
-			s.norm[i] = 1
-			distributed++
-			total -= cnt
-			continue
-		}
-		s.norm[i] = notYetAssigned
-	}
-	toDistribute := (1 << tableLog) - distributed
-
-	if (total / toDistribute) > lowOne {
-		// risk of rounding to zero
-		lowOne = (total * 3) / (toDistribute * 2)
-		for i, cnt := range s.count[:s.symbolLen] {
-			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
-				s.norm[i] = 1
-				distributed++
-				total -= cnt
-				continue
-			}
-		}
-		toDistribute = (1 << tableLog) - distributed
-	}
-	if distributed == uint32(s.symbolLen)+1 {
-		// all values are pretty poor;
-		//   probably incompressible data (should have already been detected);
-		//   find max, then give all remaining points to max
-		var maxV int
-		var maxC uint32
-		for i, cnt := range s.count[:s.symbolLen] {
-			if cnt > maxC {
-				maxV = i
-				maxC = cnt
-			}
-		}
-		s.norm[maxV] += int16(toDistribute)
-		return nil
-	}
-
-	if total == 0 {
-		// all of the symbols were low enough for the lowOne or lowThreshold
-		for i := uint32(0); toDistribute > 0; i = (i + 1) % (uint32(s.symbolLen)) {
-			if s.norm[i] > 0 {
-				toDistribute--
-				s.norm[i]++
-			}
-		}
-		return nil
-	}
-
-	var (
-		vStepLog = 62 - uint64(tableLog)
-		mid      = uint64((1 << (vStepLog - 1)) - 1)
-		rStep    = (((1 << vStepLog) * uint64(toDistribute)) + mid) / uint64(total) // scale on remaining
-		tmpTotal = mid
-	)
-	for i, cnt := range s.count[:s.symbolLen] {
-		if s.norm[i] == notYetAssigned {
-			var (
-				end    = tmpTotal + uint64(cnt)*rStep
-				sStart = uint32(tmpTotal >> vStepLog)
-				sEnd   = uint32(end >> vStepLog)
-				weight = sEnd - sStart
-			)
-			if weight < 1 {
-				return errors.New("weight < 1")
-			}
-			s.norm[i] = int16(weight)
-			tmpTotal = end
-		}
-	}
-	return nil
-}
-
-// optimalTableLog calculates and sets the optimal tableLog in s.actualTableLog
-func (s *fseEncoder) optimalTableLog(length int) {
-	tableLog := uint8(maxEncTableLog)
-	minBitsSrc := highBit(uint32(length)) + 1
-	minBitsSymbols := highBit(uint32(s.symbolLen-1)) + 2
-	minBits := uint8(minBitsSymbols)
-	if minBitsSrc < minBitsSymbols {
-		minBits = uint8(minBitsSrc)
-	}
-
-	maxBitsSrc := uint8(highBit(uint32(length-1))) - 2
-	if maxBitsSrc < tableLog {
-		// Accuracy can be reduced
-		tableLog = maxBitsSrc
-	}
-	if minBits > tableLog {
-		tableLog = minBits
-	}
-	// Need a minimum to safely represent all symbol values
-	if tableLog < minEncTablelog {
-		tableLog = minEncTablelog
-	}
-	if tableLog > maxEncTableLog {
-		tableLog = maxEncTableLog
-	}
-	s.actualTableLog = tableLog
-}
-
-// validateNorm validates the normalized histogram table.
-func (s *fseEncoder) validateNorm() (err error) {
-	var total int
-	for _, v := range s.norm[:s.symbolLen] {
-		if v >= 0 {
-			total += int(v)
-		} else {
-			total -= int(v)
-		}
-	}
-	defer func() {
-		if err == nil {
-			return
-		}
-		fmt.Printf("selected TableLog: %d, Symbol length: %d\n", s.actualTableLog, s.symbolLen)
-		for i, v := range s.norm[:s.symbolLen] {
-			fmt.Printf("%3d: %5d -> %4d \n", i, s.count[i], v)
-		}
-	}()
-	if total != (1 << s.actualTableLog) {
-		return fmt.Errorf("warning: Total == %d != %d", total, 1<<s.actualTableLog)
-	}
-	for i, v := range s.count[s.symbolLen:] {
-		if v != 0 {
-			return fmt.Errorf("warning: Found symbol out of range, %d after cut", i)
-		}
-	}
-	return nil
-}
-
-// writeCount will write the normalized histogram count to header.
-// This is read back by readNCount.
-func (s *fseEncoder) writeCount(out []byte) ([]byte, error) {
-	if s.useRLE {
-		return append(out, s.rleVal), nil
-	}
-	if s.preDefined || s.reUsed {
-		// Never write predefined.
-		return out, nil
-	}
-
-	var (
-		tableLog  = s.actualTableLog
-		tableSize = 1 << tableLog
-		previous0 bool
-		charnum   uint16
-
-		// maximum header size plus 2 extra bytes for final output if bitCount == 0.
-		maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3 + 2
-
-		// Write Table Size
-		bitStream = uint32(tableLog - minEncTablelog)
-		bitCount  = uint(4)
-		remaining = int16(tableSize + 1) /* +1 for extra accuracy */
-		threshold = int16(tableSize)
-		nbBits    = uint(tableLog + 1)
-		outP      = len(out)
-	)
-	if cap(out) < outP+maxHeaderSize {
-		out = append(out, make([]byte, maxHeaderSize*3)...)
-		out = out[:len(out)-maxHeaderSize*3]
-	}
-	out = out[:outP+maxHeaderSize]
-
-	// stops at 1
-	for remaining > 1 {
-		if previous0 {
-			start := charnum
-			for s.norm[charnum] == 0 {
-				charnum++
-			}
-			for charnum >= start+24 {
-				start += 24
-				bitStream += uint32(0xFFFF) << bitCount
-				out[outP] = byte(bitStream)
-				out[outP+1] = byte(bitStream >> 8)
-				outP += 2
-				bitStream >>= 16
-			}
-			for charnum >= start+3 {
-				start += 3
-				bitStream += 3 << bitCount
-				bitCount += 2
-			}
-			bitStream += uint32(charnum-start) << bitCount
-			bitCount += 2
-			if bitCount > 16 {
-				out[outP] = byte(bitStream)
-				out[outP+1] = byte(bitStream >> 8)
-				outP += 2
-				bitStream >>= 16
-				bitCount -= 16
-			}
-		}
-
-		count := s.norm[charnum]
-		charnum++
-		max := (2*threshold - 1) - remaining
-		if count < 0 {
-			remaining += count
-		} else {
-			remaining -= count
-		}
-		count++ // +1 for extra accuracy
-		if count >= threshold {
-			count += max // [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[
-		}
-		bitStream += uint32(count) << bitCount
-		bitCount += nbBits
-		if count < max {
-			bitCount--
-		}
-
-		previous0 = count == 1
-		if remaining < 1 {
-			return nil, errors.New("internal error: remaining < 1")
-		}
-		for remaining < threshold {
-			nbBits--
-			threshold >>= 1
-		}
-
-		if bitCount > 16 {
-			out[outP] = byte(bitStream)
-			out[outP+1] = byte(bitStream >> 8)
-			outP += 2
-			bitStream >>= 16
-			bitCount -= 16
-		}
-	}
-
-	if outP+2 > len(out) {
-		return nil, fmt.Errorf("internal error: %d > %d, maxheader: %d, sl: %d, tl: %d, normcount: %v", outP+2, len(out), maxHeaderSize, s.symbolLen, int(tableLog), s.norm[:s.symbolLen])
-	}
-	out[outP] = byte(bitStream)
-	out[outP+1] = byte(bitStream >> 8)
-	outP += int((bitCount + 7) / 8)
-
-	if charnum > s.symbolLen {
-		return nil, errors.New("internal error: charnum > s.symbolLen")
-	}
-	return out[:outP], nil
-}
-
-// Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits)
-// note 1 : assume symbolValue is valid (<= maxSymbolValue)
-// note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits *
-func (s *fseEncoder) bitCost(symbolValue uint8, accuracyLog uint32) uint32 {
-	minNbBits := s.ct.symbolTT[symbolValue].deltaNbBits >> 16
-	threshold := (minNbBits + 1) << 16
-	if debugAsserts {
-		if !(s.actualTableLog < 16) {
-			panic("!s.actualTableLog < 16")
-		}
-		// ensure enough room for renormalization double shift
-		if !(uint8(accuracyLog) < 31-s.actualTableLog) {
-			panic("!uint8(accuracyLog) < 31-s.actualTableLog")
-		}
-	}
-	tableSize := uint32(1) << s.actualTableLog
-	deltaFromThreshold := threshold - (s.ct.symbolTT[symbolValue].deltaNbBits + tableSize)
-	// linear interpolation (very approximate)
-	normalizedDeltaFromThreshold := (deltaFromThreshold << accuracyLog) >> s.actualTableLog
-	bitMultiplier := uint32(1) << accuracyLog
-	if debugAsserts {
-		if s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold {
-			panic("s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold")
-		}
-		if normalizedDeltaFromThreshold > bitMultiplier {
-			panic("normalizedDeltaFromThreshold > bitMultiplier")
-		}
-	}
-	return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold
-}
-
-// Returns the cost in bits of encoding the distribution in count using ctable.
-// Histogram should only be up to the last non-zero symbol.
-// Returns an -1 if ctable cannot represent all the symbols in count.
-func (s *fseEncoder) approxSize(hist []uint32) uint32 {
-	if int(s.symbolLen) < len(hist) {
-		// More symbols than we have.
-		return math.MaxUint32
-	}
-	if s.useRLE {
-		// We will never reuse RLE encoders.
-		return math.MaxUint32
-	}
-	const kAccuracyLog = 8
-	badCost := (uint32(s.actualTableLog) + 1) << kAccuracyLog
-	var cost uint32
-	for i, v := range hist {
-		if v == 0 {
-			continue
-		}
-		if s.norm[i] == 0 {
-			return math.MaxUint32
-		}
-		bitCost := s.bitCost(uint8(i), kAccuracyLog)
-		if bitCost > badCost {
-			return math.MaxUint32
-		}
-		cost += v * bitCost
-	}
-	return cost >> kAccuracyLog
-}
-
-// maxHeaderSize returns the maximum header size in bits.
-// This is not exact size, but we want a penalty for new tables anyway.
-func (s *fseEncoder) maxHeaderSize() uint32 {
-	if s.preDefined {
-		return 0
-	}
-	if s.useRLE {
-		return 8
-	}
-	return (((uint32(s.symbolLen) * uint32(s.actualTableLog)) >> 3) + 3) * 8
-}
-
-// cState contains the compression state of a stream.
-type cState struct {
-	bw         *bitWriter
-	stateTable []uint16
-	state      uint16
-}
-
-// init will initialize the compression state to the first symbol of the stream.
-func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
-	c.bw = bw
-	c.stateTable = ct.stateTable
-	if len(c.stateTable) == 1 {
-		// RLE
-		c.stateTable[0] = uint16(0)
-		c.state = 0
-		return
-	}
-	nbBitsOut := (first.deltaNbBits + (1 << 15)) >> 16
-	im := int32((nbBitsOut << 16) - first.deltaNbBits)
-	lu := (im >> nbBitsOut) + int32(first.deltaFindState)
-	c.state = c.stateTable[lu]
-}
-
-// flush will write the tablelog to the output and flush the remaining full bytes.
-func (c *cState) flush(tableLog uint8) {
-	c.bw.flush32()
-	c.bw.addBits16NC(c.state, tableLog)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
deleted file mode 100644
index 474cb77d2..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"fmt"
-	"math"
-	"sync"
-)
-
-var (
-	// fsePredef are the predefined fse tables as defined here:
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions
-	// These values are already transformed.
-	fsePredef [3]fseDecoder
-
-	// fsePredefEnc are the predefined encoder based on fse tables as defined here:
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions
-	// These values are already transformed.
-	fsePredefEnc [3]fseEncoder
-
-	// symbolTableX contain the transformations needed for each type as defined in
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#the-codes-for-literals-lengths-match-lengths-and-offsets
-	symbolTableX [3][]baseOffset
-
-	// maxTableSymbol is the biggest supported symbol for each table type
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#the-codes-for-literals-lengths-match-lengths-and-offsets
-	maxTableSymbol = [3]uint8{tableLiteralLengths: maxLiteralLengthSymbol, tableOffsets: maxOffsetLengthSymbol, tableMatchLengths: maxMatchLengthSymbol}
-
-	// bitTables is the bits table for each table.
-	bitTables = [3][]byte{tableLiteralLengths: llBitsTable[:], tableOffsets: nil, tableMatchLengths: mlBitsTable[:]}
-)
-
-type tableIndex uint8
-
-const (
-	// indexes for fsePredef and symbolTableX
-	tableLiteralLengths tableIndex = 0
-	tableOffsets        tableIndex = 1
-	tableMatchLengths   tableIndex = 2
-
-	maxLiteralLengthSymbol = 35
-	maxOffsetLengthSymbol  = 30
-	maxMatchLengthSymbol   = 52
-)
-
-// baseOffset is used for calculating transformations.
-type baseOffset struct {
-	baseLine uint32
-	addBits  uint8
-}
-
-// fillBase will precalculate base offsets with the given bit distributions.
-func fillBase(dst []baseOffset, base uint32, bits ...uint8) {
-	if len(bits) != len(dst) {
-		panic(fmt.Sprintf("len(dst) (%d) != len(bits) (%d)", len(dst), len(bits)))
-	}
-	for i, bit := range bits {
-		if base > math.MaxInt32 {
-			panic("invalid decoding table, base overflows int32")
-		}
-
-		dst[i] = baseOffset{
-			baseLine: base,
-			addBits:  bit,
-		}
-		base += 1 << bit
-	}
-}
-
-var predef sync.Once
-
-func initPredefined() {
-	predef.Do(func() {
-		// Literals length codes
-		tmp := make([]baseOffset, 36)
-		for i := range tmp[:16] {
-			tmp[i] = baseOffset{
-				baseLine: uint32(i),
-				addBits:  0,
-			}
-		}
-		fillBase(tmp[16:], 16, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
-		symbolTableX[tableLiteralLengths] = tmp
-
-		// Match length codes
-		tmp = make([]baseOffset, 53)
-		for i := range tmp[:32] {
-			tmp[i] = baseOffset{
-				// The transformation adds the 3 length.
-				baseLine: uint32(i) + 3,
-				addBits:  0,
-			}
-		}
-		fillBase(tmp[32:], 35, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
-		symbolTableX[tableMatchLengths] = tmp
-
-		// Offset codes
-		tmp = make([]baseOffset, maxOffsetBits+1)
-		tmp[1] = baseOffset{
-			baseLine: 1,
-			addBits:  1,
-		}
-		fillBase(tmp[2:], 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)
-		symbolTableX[tableOffsets] = tmp
-
-		// Fill predefined tables and transform them.
-		// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions
-		for i := range fsePredef[:] {
-			f := &fsePredef[i]
-			switch tableIndex(i) {
-			case tableLiteralLengths:
-				// https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L243
-				f.actualTableLog = 6
-				copy(f.norm[:], []int16{4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
-					2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
-					-1, -1, -1, -1})
-				f.symbolLen = 36
-			case tableOffsets:
-				// https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L281
-				f.actualTableLog = 5
-				copy(f.norm[:], []int16{
-					1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
-					1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1})
-				f.symbolLen = 29
-			case tableMatchLengths:
-				//https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L304
-				f.actualTableLog = 6
-				copy(f.norm[:], []int16{
-					1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
-					1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-					1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1,
-					-1, -1, -1, -1, -1})
-				f.symbolLen = 53
-			}
-			if err := f.buildDtable(); err != nil {
-				panic(fmt.Errorf("building table %v: %v", tableIndex(i), err))
-			}
-			if err := f.transform(symbolTableX[i]); err != nil {
-				panic(fmt.Errorf("building table %v: %v", tableIndex(i), err))
-			}
-			f.preDefined = true
-
-			// Create encoder as well
-			enc := &fsePredefEnc[i]
-			copy(enc.norm[:], f.norm[:])
-			enc.symbolLen = f.symbolLen
-			enc.actualTableLog = f.actualTableLog
-			if err := enc.buildCTable(); err != nil {
-				panic(fmt.Errorf("building encoding table %v: %v", tableIndex(i), err))
-			}
-			enc.setBits(bitTables[i])
-			enc.preDefined = true
-		}
-	})
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
deleted file mode 100644
index 5d73c21eb..000000000
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-const (
-	prime3bytes = 506832829
-	prime4bytes = 2654435761
-	prime5bytes = 889523592379
-	prime6bytes = 227718039650203
-	prime7bytes = 58295818150454627
-	prime8bytes = 0xcf1bbcdcb7a56463
-)
-
-// hashLen returns a hash of the lowest mls bytes of with length output bits.
-// mls must be >=3 and <=8. Any other value will return hash for 4 bytes.
-// length should always be < 32.
-// Preferably length and mls should be a constant for inlining.
-func hashLen(u uint64, length, mls uint8) uint32 {
-	switch mls {
-	case 3:
-		return (uint32(u<<8) * prime3bytes) >> (32 - length)
-	case 5:
-		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
-	case 6:
-		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
-	case 7:
-		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
-	case 8:
-		return uint32((u * prime8bytes) >> (64 - length))
-	default:
-		return (uint32(u) * prime4bytes) >> (32 - length)
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
deleted file mode 100644
index 09164856d..000000000
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"github.com/klauspost/compress/huff0"
-)
-
-// history contains the information transferred between blocks.
-type history struct {
-	// Literal decompression
-	huffTree *huff0.Scratch
-
-	// Sequence decompression
-	decoders      sequenceDecs
-	recentOffsets [3]int
-
-	// History buffer...
-	b []byte
-
-	// ignoreBuffer is meant to ignore a number of bytes
-	// when checking for matches in history
-	ignoreBuffer int
-
-	windowSize       int
-	allocFrameBuffer int // needed?
-	error            bool
-	dict             *dict
-}
-
-// reset will reset the history to initial state of a frame.
-// The history must already have been initialized to the desired size.
-func (h *history) reset() {
-	h.b = h.b[:0]
-	h.ignoreBuffer = 0
-	h.error = false
-	h.recentOffsets = [3]int{1, 4, 8}
-	h.decoders.freeDecoders()
-	h.decoders = sequenceDecs{br: h.decoders.br}
-	h.freeHuffDecoder()
-	h.huffTree = nil
-	h.dict = nil
-	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
-}
-
-func (h *history) freeHuffDecoder() {
-	if h.huffTree != nil {
-		if h.dict == nil || h.dict.litEnc != h.huffTree {
-			huffDecoderPool.Put(h.huffTree)
-			h.huffTree = nil
-		}
-	}
-}
-
-func (h *history) setDict(dict *dict) {
-	if dict == nil {
-		return
-	}
-	h.dict = dict
-	h.decoders.litLengths = dict.llDec
-	h.decoders.offsets = dict.ofDec
-	h.decoders.matchLengths = dict.mlDec
-	h.decoders.dict = dict.content
-	h.recentOffsets = dict.offsets
-	h.huffTree = dict.litEnc
-}
-
-// append bytes to history.
-// This function will make sure there is space for it,
-// if the buffer has been allocated with enough extra space.
-func (h *history) append(b []byte) {
-	if len(b) >= h.windowSize {
-		// Discard all history by simply overwriting
-		h.b = h.b[:h.windowSize]
-		copy(h.b, b[len(b)-h.windowSize:])
-		return
-	}
-
-	// If there is space, append it.
-	if len(b) < cap(h.b)-len(h.b) {
-		h.b = append(h.b, b...)
-		return
-	}
-
-	// Move data down so we only have window size left.
-	// We know we have less than window size in b at this point.
-	discard := len(b) + len(h.b) - h.windowSize
-	copy(h.b, h.b[discard:])
-	h.b = h.b[:h.windowSize]
-	copy(h.b[h.windowSize-len(b):], b)
-}
-
-// ensureBlock will ensure there is space for at least one block...
-func (h *history) ensureBlock() {
-	if cap(h.b) < h.allocFrameBuffer {
-		h.b = make([]byte, 0, h.allocFrameBuffer)
-		return
-	}
-
-	avail := cap(h.b) - len(h.b)
-	if avail >= h.windowSize || avail > maxCompressedBlockSize {
-		return
-	}
-	// Move data down so we only have window size left.
-	// We know we have less than window size in b at this point.
-	discard := len(h.b) - h.windowSize
-	copy(h.b, h.b[discard:])
-	h.b = h.b[:h.windowSize]
-}
-
-// append bytes to history without ever discarding anything.
-func (h *history) appendKeep(b []byte) {
-	h.b = append(h.b, b...)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/LICENSE.txt b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/LICENSE.txt
deleted file mode 100644
index 24b53065f..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/LICENSE.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-Copyright (c) 2016 Caleb Spare
-
-MIT License
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
deleted file mode 100644
index 777290d44..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# xxhash
-
-VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
-
-xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
-high-quality hashing algorithm that is much faster than anything in the Go
-standard library.
-
-This package provides a straightforward API:
-
-```
-func Sum64(b []byte) uint64
-func Sum64String(s string) uint64
-type Digest struct{ ... }
-    func New() *Digest
-```
-
-The `Digest` type implements hash.Hash64. Its key methods are:
-
-```
-func (*Digest) Write([]byte) (int, error)
-func (*Digest) WriteString(string) (int, error)
-func (*Digest) Sum64() uint64
-```
-
-The package is written with optimized pure Go and also contains even faster
-assembly implementations for amd64 and arm64. If desired, the `purego` build tag
-opts into using the Go code even on those architectures.
-
-[xxHash]: http://cyan4973.github.io/xxHash/
-
-## Compatibility
-
-This package is in a module and the latest code is in version 2 of the module.
-You need a version of Go with at least "minimal module compatibility" to use
-github.com/cespare/xxhash/v2:
-
-* 1.9.7+ for Go 1.9
-* 1.10.3+ for Go 1.10
-* Go 1.11 or later
-
-I recommend using the latest release of Go.
-
-## Benchmarks
-
-Here are some quick benchmarks comparing the pure-Go and assembly
-implementations of Sum64.
-
-| input size | purego    | asm       |
-| ---------- | --------- | --------- |
-| 4 B        |  1.3 GB/s |  1.2 GB/s |
-| 16 B       |  2.9 GB/s |  3.5 GB/s |
-| 100 B      |  6.9 GB/s |  8.1 GB/s |
-| 4 KB       | 11.7 GB/s | 16.7 GB/s |
-| 10 MB      | 12.0 GB/s | 17.3 GB/s |
-
-These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
-CPU using the following commands under Go 1.19.2:
-
-```
-benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
-benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
-```
-
-## Projects using this package
-
-- [InfluxDB](https://github.com/influxdata/influxdb)
-- [Prometheus](https://github.com/prometheus/prometheus)
-- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
-- [FreeCache](https://github.com/coocood/freecache)
-- [FastCache](https://github.com/VictoriaMetrics/fastcache)
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
deleted file mode 100644
index fc40c8200..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
+++ /dev/null
@@ -1,230 +0,0 @@
-// Package xxhash implements the 64-bit variant of xxHash (XXH64) as described
-// at http://cyan4973.github.io/xxHash/.
-// THIS IS VENDORED: Go to github.com/cespare/xxhash for original package.
-
-package xxhash
-
-import (
-	"encoding/binary"
-	"errors"
-	"math/bits"
-)
-
-const (
-	prime1 uint64 = 11400714785074694791
-	prime2 uint64 = 14029467366897019727
-	prime3 uint64 = 1609587929392839161
-	prime4 uint64 = 9650029242287828579
-	prime5 uint64 = 2870177450012600261
-)
-
-// Store the primes in an array as well.
-//
-// The consts are used when possible in Go code to avoid MOVs but we need a
-// contiguous array of the assembly code.
-var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
-
-// Digest implements hash.Hash64.
-type Digest struct {
-	v1    uint64
-	v2    uint64
-	v3    uint64
-	v4    uint64
-	total uint64
-	mem   [32]byte
-	n     int // how much of mem is used
-}
-
-// New creates a new Digest that computes the 64-bit xxHash algorithm.
-func New() *Digest {
-	var d Digest
-	d.Reset()
-	return &d
-}
-
-// Reset clears the Digest's state so that it can be reused.
-func (d *Digest) Reset() {
-	d.v1 = primes[0] + prime2
-	d.v2 = prime2
-	d.v3 = 0
-	d.v4 = -primes[0]
-	d.total = 0
-	d.n = 0
-}
-
-// Size always returns 8 bytes.
-func (d *Digest) Size() int { return 8 }
-
-// BlockSize always returns 32 bytes.
-func (d *Digest) BlockSize() int { return 32 }
-
-// Write adds more data to d. It always returns len(b), nil.
-func (d *Digest) Write(b []byte) (n int, err error) {
-	n = len(b)
-	d.total += uint64(n)
-
-	memleft := d.mem[d.n&(len(d.mem)-1):]
-
-	if d.n+n < 32 {
-		// This new data doesn't even fill the current block.
-		copy(memleft, b)
-		d.n += n
-		return
-	}
-
-	if d.n > 0 {
-		// Finish off the partial block.
-		c := copy(memleft, b)
-		d.v1 = round(d.v1, u64(d.mem[0:8]))
-		d.v2 = round(d.v2, u64(d.mem[8:16]))
-		d.v3 = round(d.v3, u64(d.mem[16:24]))
-		d.v4 = round(d.v4, u64(d.mem[24:32]))
-		b = b[c:]
-		d.n = 0
-	}
-
-	if len(b) >= 32 {
-		// One or more full blocks left.
-		nw := writeBlocks(d, b)
-		b = b[nw:]
-	}
-
-	// Store any remaining partial block.
-	copy(d.mem[:], b)
-	d.n = len(b)
-
-	return
-}
-
-// Sum appends the current hash to b and returns the resulting slice.
-func (d *Digest) Sum(b []byte) []byte {
-	s := d.Sum64()
-	return append(
-		b,
-		byte(s>>56),
-		byte(s>>48),
-		byte(s>>40),
-		byte(s>>32),
-		byte(s>>24),
-		byte(s>>16),
-		byte(s>>8),
-		byte(s),
-	)
-}
-
-// Sum64 returns the current hash.
-func (d *Digest) Sum64() uint64 {
-	var h uint64
-
-	if d.total >= 32 {
-		v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
-		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
-		h = mergeRound(h, v1)
-		h = mergeRound(h, v2)
-		h = mergeRound(h, v3)
-		h = mergeRound(h, v4)
-	} else {
-		h = d.v3 + prime5
-	}
-
-	h += d.total
-
-	b := d.mem[:d.n&(len(d.mem)-1)]
-	for ; len(b) >= 8; b = b[8:] {
-		k1 := round(0, u64(b[:8]))
-		h ^= k1
-		h = rol27(h)*prime1 + prime4
-	}
-	if len(b) >= 4 {
-		h ^= uint64(u32(b[:4])) * prime1
-		h = rol23(h)*prime2 + prime3
-		b = b[4:]
-	}
-	for ; len(b) > 0; b = b[1:] {
-		h ^= uint64(b[0]) * prime5
-		h = rol11(h) * prime1
-	}
-
-	h ^= h >> 33
-	h *= prime2
-	h ^= h >> 29
-	h *= prime3
-	h ^= h >> 32
-
-	return h
-}
-
-const (
-	magic         = "xxh\x06"
-	marshaledSize = len(magic) + 8*5 + 32
-)
-
-// MarshalBinary implements the encoding.BinaryMarshaler interface.
-func (d *Digest) MarshalBinary() ([]byte, error) {
-	b := make([]byte, 0, marshaledSize)
-	b = append(b, magic...)
-	b = appendUint64(b, d.v1)
-	b = appendUint64(b, d.v2)
-	b = appendUint64(b, d.v3)
-	b = appendUint64(b, d.v4)
-	b = appendUint64(b, d.total)
-	b = append(b, d.mem[:d.n]...)
-	b = b[:len(b)+len(d.mem)-d.n]
-	return b, nil
-}
-
-// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
-func (d *Digest) UnmarshalBinary(b []byte) error {
-	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
-		return errors.New("xxhash: invalid hash state identifier")
-	}
-	if len(b) != marshaledSize {
-		return errors.New("xxhash: invalid hash state size")
-	}
-	b = b[len(magic):]
-	b, d.v1 = consumeUint64(b)
-	b, d.v2 = consumeUint64(b)
-	b, d.v3 = consumeUint64(b)
-	b, d.v4 = consumeUint64(b)
-	b, d.total = consumeUint64(b)
-	copy(d.mem[:], b)
-	d.n = int(d.total % uint64(len(d.mem)))
-	return nil
-}
-
-func appendUint64(b []byte, x uint64) []byte {
-	var a [8]byte
-	binary.LittleEndian.PutUint64(a[:], x)
-	return append(b, a[:]...)
-}
-
-func consumeUint64(b []byte) ([]byte, uint64) {
-	x := u64(b)
-	return b[8:], x
-}
-
-func u64(b []byte) uint64 { return binary.LittleEndian.Uint64(b) }
-func u32(b []byte) uint32 { return binary.LittleEndian.Uint32(b) }
-
-func round(acc, input uint64) uint64 {
-	acc += input * prime2
-	acc = rol31(acc)
-	acc *= prime1
-	return acc
-}
-
-func mergeRound(acc, val uint64) uint64 {
-	val = round(0, val)
-	acc ^= val
-	acc = acc*prime1 + prime4
-	return acc
-}
-
-func rol1(x uint64) uint64  { return bits.RotateLeft64(x, 1) }
-func rol7(x uint64) uint64  { return bits.RotateLeft64(x, 7) }
-func rol11(x uint64) uint64 { return bits.RotateLeft64(x, 11) }
-func rol12(x uint64) uint64 { return bits.RotateLeft64(x, 12) }
-func rol18(x uint64) uint64 { return bits.RotateLeft64(x, 18) }
-func rol23(x uint64) uint64 { return bits.RotateLeft64(x, 23) }
-func rol27(x uint64) uint64 { return bits.RotateLeft64(x, 27) }
-func rol31(x uint64) uint64 { return bits.RotateLeft64(x, 31) }
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
deleted file mode 100644
index ddb63aa91..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ /dev/null
@@ -1,210 +0,0 @@
-//go:build !appengine && gc && !purego && !noasm
-// +build !appengine
-// +build gc
-// +build !purego
-// +build !noasm
-
-#include "textflag.h"
-
-// Registers:
-#define h      AX
-#define d      AX
-#define p      SI // pointer to advance through b
-#define n      DX
-#define end    BX // loop end
-#define v1     R8
-#define v2     R9
-#define v3     R10
-#define v4     R11
-#define x      R12
-#define prime1 R13
-#define prime2 R14
-#define prime4 DI
-
-#define round(acc, x) \
-	IMULQ prime2, x   \
-	ADDQ  x, acc      \
-	ROLQ  $31, acc    \
-	IMULQ prime1, acc
-
-// round0 performs the operation x = round(0, x).
-#define round0(x) \
-	IMULQ prime2, x \
-	ROLQ  $31, x    \
-	IMULQ prime1, x
-
-// mergeRound applies a merge round on the two registers acc and x.
-// It assumes that prime1, prime2, and prime4 have been loaded.
-#define mergeRound(acc, x) \
-	round0(x)         \
-	XORQ  x, acc      \
-	IMULQ prime1, acc \
-	ADDQ  prime4, acc
-
-// blockLoop processes as many 32-byte blocks as possible,
-// updating v1, v2, v3, and v4. It assumes that there is at least one block
-// to process.
-#define blockLoop() \
-loop:  \
-	MOVQ +0(p), x  \
-	round(v1, x)   \
-	MOVQ +8(p), x  \
-	round(v2, x)   \
-	MOVQ +16(p), x \
-	round(v3, x)   \
-	MOVQ +24(p), x \
-	round(v4, x)   \
-	ADDQ $32, p    \
-	CMPQ p, end    \
-	JLE  loop
-
-// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
-	// Load fixed primes.
-	MOVQ ·primes+0(SB), prime1
-	MOVQ ·primes+8(SB), prime2
-	MOVQ ·primes+24(SB), prime4
-
-	// Load slice.
-	MOVQ b_base+0(FP), p
-	MOVQ b_len+8(FP), n
-	LEAQ (p)(n*1), end
-
-	// The first loop limit will be len(b)-32.
-	SUBQ $32, end
-
-	// Check whether we have at least one block.
-	CMPQ n, $32
-	JLT  noBlocks
-
-	// Set up initial state (v1, v2, v3, v4).
-	MOVQ prime1, v1
-	ADDQ prime2, v1
-	MOVQ prime2, v2
-	XORQ v3, v3
-	XORQ v4, v4
-	SUBQ prime1, v4
-
-	blockLoop()
-
-	MOVQ v1, h
-	ROLQ $1, h
-	MOVQ v2, x
-	ROLQ $7, x
-	ADDQ x, h
-	MOVQ v3, x
-	ROLQ $12, x
-	ADDQ x, h
-	MOVQ v4, x
-	ROLQ $18, x
-	ADDQ x, h
-
-	mergeRound(h, v1)
-	mergeRound(h, v2)
-	mergeRound(h, v3)
-	mergeRound(h, v4)
-
-	JMP afterBlocks
-
-noBlocks:
-	MOVQ ·primes+32(SB), h
-
-afterBlocks:
-	ADDQ n, h
-
-	ADDQ $24, end
-	CMPQ p, end
-	JG   try4
-
-loop8:
-	MOVQ  (p), x
-	ADDQ  $8, p
-	round0(x)
-	XORQ  x, h
-	ROLQ  $27, h
-	IMULQ prime1, h
-	ADDQ  prime4, h
-
-	CMPQ p, end
-	JLE  loop8
-
-try4:
-	ADDQ $4, end
-	CMPQ p, end
-	JG   try1
-
-	MOVL  (p), x
-	ADDQ  $4, p
-	IMULQ prime1, x
-	XORQ  x, h
-
-	ROLQ  $23, h
-	IMULQ prime2, h
-	ADDQ  ·primes+16(SB), h
-
-try1:
-	ADDQ $4, end
-	CMPQ p, end
-	JGE  finalize
-
-loop1:
-	MOVBQZX (p), x
-	ADDQ    $1, p
-	IMULQ   ·primes+32(SB), x
-	XORQ    x, h
-	ROLQ    $11, h
-	IMULQ   prime1, h
-
-	CMPQ p, end
-	JL   loop1
-
-finalize:
-	MOVQ  h, x
-	SHRQ  $33, x
-	XORQ  x, h
-	IMULQ prime2, h
-	MOVQ  h, x
-	SHRQ  $29, x
-	XORQ  x, h
-	IMULQ ·primes+16(SB), h
-	MOVQ  h, x
-	SHRQ  $32, x
-	XORQ  x, h
-
-	MOVQ h, ret+24(FP)
-	RET
-
-// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
-	// Load fixed primes needed for round.
-	MOVQ ·primes+0(SB), prime1
-	MOVQ ·primes+8(SB), prime2
-
-	// Load slice.
-	MOVQ b_base+8(FP), p
-	MOVQ b_len+16(FP), n
-	LEAQ (p)(n*1), end
-	SUBQ $32, end
-
-	// Load vN from d.
-	MOVQ s+0(FP), d
-	MOVQ 0(d), v1
-	MOVQ 8(d), v2
-	MOVQ 16(d), v3
-	MOVQ 24(d), v4
-
-	// We don't need to check the loop condition here; this function is
-	// always called with at least one block of data to process.
-	blockLoop()
-
-	// Copy vN back to d.
-	MOVQ v1, 0(d)
-	MOVQ v2, 8(d)
-	MOVQ v3, 16(d)
-	MOVQ v4, 24(d)
-
-	// The number of bytes written is p minus the old base pointer.
-	SUBQ b_base+8(FP), p
-	MOVQ p, ret+32(FP)
-
-	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
deleted file mode 100644
index ae7d4d329..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
+++ /dev/null
@@ -1,184 +0,0 @@
-//go:build !appengine && gc && !purego && !noasm
-// +build !appengine
-// +build gc
-// +build !purego
-// +build !noasm
-
-#include "textflag.h"
-
-// Registers:
-#define digest	R1
-#define h	R2 // return value
-#define p	R3 // input pointer
-#define n	R4 // input length
-#define nblocks	R5 // n / 32
-#define prime1	R7
-#define prime2	R8
-#define prime3	R9
-#define prime4	R10
-#define prime5	R11
-#define v1	R12
-#define v2	R13
-#define v3	R14
-#define v4	R15
-#define x1	R20
-#define x2	R21
-#define x3	R22
-#define x4	R23
-
-#define round(acc, x) \
-	MADD prime2, acc, x, acc \
-	ROR  $64-31, acc         \
-	MUL  prime1, acc
-
-// round0 performs the operation x = round(0, x).
-#define round0(x) \
-	MUL prime2, x \
-	ROR $64-31, x \
-	MUL prime1, x
-
-#define mergeRound(acc, x) \
-	round0(x)                     \
-	EOR  x, acc                   \
-	MADD acc, prime4, prime1, acc
-
-// blockLoop processes as many 32-byte blocks as possible,
-// updating v1, v2, v3, and v4. It assumes that n >= 32.
-#define blockLoop() \
-	LSR     $5, n, nblocks  \
-	PCALIGN $16             \
-	loop:                   \
-	LDP.P   16(p), (x1, x2) \
-	LDP.P   16(p), (x3, x4) \
-	round(v1, x1)           \
-	round(v2, x2)           \
-	round(v3, x3)           \
-	round(v4, x4)           \
-	SUB     $1, nblocks     \
-	CBNZ    nblocks, loop
-
-// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
-	LDP b_base+0(FP), (p, n)
-
-	LDP  ·primes+0(SB), (prime1, prime2)
-	LDP  ·primes+16(SB), (prime3, prime4)
-	MOVD ·primes+32(SB), prime5
-
-	CMP  $32, n
-	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
-	BLT  afterLoop
-
-	ADD  prime1, prime2, v1
-	MOVD prime2, v2
-	MOVD $0, v3
-	NEG  prime1, v4
-
-	blockLoop()
-
-	ROR $64-1, v1, x1
-	ROR $64-7, v2, x2
-	ADD x1, x2
-	ROR $64-12, v3, x3
-	ROR $64-18, v4, x4
-	ADD x3, x4
-	ADD x2, x4, h
-
-	mergeRound(h, v1)
-	mergeRound(h, v2)
-	mergeRound(h, v3)
-	mergeRound(h, v4)
-
-afterLoop:
-	ADD n, h
-
-	TBZ   $4, n, try8
-	LDP.P 16(p), (x1, x2)
-
-	round0(x1)
-
-	// NOTE: here and below, sequencing the EOR after the ROR (using a
-	// rotated register) is worth a small but measurable speedup for small
-	// inputs.
-	ROR  $64-27, h
-	EOR  x1 @> 64-27, h, h
-	MADD h, prime4, prime1, h
-
-	round0(x2)
-	ROR  $64-27, h
-	EOR  x2 @> 64-27, h, h
-	MADD h, prime4, prime1, h
-
-try8:
-	TBZ    $3, n, try4
-	MOVD.P 8(p), x1
-
-	round0(x1)
-	ROR  $64-27, h
-	EOR  x1 @> 64-27, h, h
-	MADD h, prime4, prime1, h
-
-try4:
-	TBZ     $2, n, try2
-	MOVWU.P 4(p), x2
-
-	MUL  prime1, x2
-	ROR  $64-23, h
-	EOR  x2 @> 64-23, h, h
-	MADD h, prime3, prime2, h
-
-try2:
-	TBZ     $1, n, try1
-	MOVHU.P 2(p), x3
-	AND     $255, x3, x1
-	LSR     $8, x3, x2
-
-	MUL prime5, x1
-	ROR $64-11, h
-	EOR x1 @> 64-11, h, h
-	MUL prime1, h
-
-	MUL prime5, x2
-	ROR $64-11, h
-	EOR x2 @> 64-11, h, h
-	MUL prime1, h
-
-try1:
-	TBZ   $0, n, finalize
-	MOVBU (p), x4
-
-	MUL prime5, x4
-	ROR $64-11, h
-	EOR x4 @> 64-11, h, h
-	MUL prime1, h
-
-finalize:
-	EOR h >> 33, h
-	MUL prime2, h
-	EOR h >> 29, h
-	MUL prime3, h
-	EOR h >> 32, h
-
-	MOVD h, ret+24(FP)
-	RET
-
-// func writeBlocks(s *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
-	LDP ·primes+0(SB), (prime1, prime2)
-
-	// Load state. Assume v[1-4] are stored contiguously.
-	MOVD s+0(FP), digest
-	LDP  0(digest), (v1, v2)
-	LDP  16(digest), (v3, v4)
-
-	LDP b_base+8(FP), (p, n)
-
-	blockLoop()
-
-	// Store updated state.
-	STP (v1, v2), 0(digest)
-	STP (v3, v4), 16(digest)
-
-	BIC  $31, n
-	MOVD n, ret+32(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
deleted file mode 100644
index d4221edf4..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
+++ /dev/null
@@ -1,16 +0,0 @@
-//go:build (amd64 || arm64) && !appengine && gc && !purego && !noasm
-// +build amd64 arm64
-// +build !appengine
-// +build gc
-// +build !purego
-// +build !noasm
-
-package xxhash
-
-// Sum64 computes the 64-bit xxHash digest of b.
-//
-//go:noescape
-func Sum64(b []byte) uint64
-
-//go:noescape
-func writeBlocks(s *Digest, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
deleted file mode 100644
index 0be16cefc..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ /dev/null
@@ -1,76 +0,0 @@
-//go:build (!amd64 && !arm64) || appengine || !gc || purego || noasm
-// +build !amd64,!arm64 appengine !gc purego noasm
-
-package xxhash
-
-// Sum64 computes the 64-bit xxHash digest of b.
-func Sum64(b []byte) uint64 {
-	// A simpler version would be
-	//   d := New()
-	//   d.Write(b)
-	//   return d.Sum64()
-	// but this is faster, particularly for small inputs.
-
-	n := len(b)
-	var h uint64
-
-	if n >= 32 {
-		v1 := primes[0] + prime2
-		v2 := prime2
-		v3 := uint64(0)
-		v4 := -primes[0]
-		for len(b) >= 32 {
-			v1 = round(v1, u64(b[0:8:len(b)]))
-			v2 = round(v2, u64(b[8:16:len(b)]))
-			v3 = round(v3, u64(b[16:24:len(b)]))
-			v4 = round(v4, u64(b[24:32:len(b)]))
-			b = b[32:len(b):len(b)]
-		}
-		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
-		h = mergeRound(h, v1)
-		h = mergeRound(h, v2)
-		h = mergeRound(h, v3)
-		h = mergeRound(h, v4)
-	} else {
-		h = prime5
-	}
-
-	h += uint64(n)
-
-	for ; len(b) >= 8; b = b[8:] {
-		k1 := round(0, u64(b[:8]))
-		h ^= k1
-		h = rol27(h)*prime1 + prime4
-	}
-	if len(b) >= 4 {
-		h ^= uint64(u32(b[:4])) * prime1
-		h = rol23(h)*prime2 + prime3
-		b = b[4:]
-	}
-	for ; len(b) > 0; b = b[1:] {
-		h ^= uint64(b[0]) * prime5
-		h = rol11(h) * prime1
-	}
-
-	h ^= h >> 33
-	h *= prime2
-	h ^= h >> 29
-	h *= prime3
-	h ^= h >> 32
-
-	return h
-}
-
-func writeBlocks(d *Digest, b []byte) int {
-	v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
-	n := len(b)
-	for len(b) >= 32 {
-		v1 = round(v1, u64(b[0:8:len(b)]))
-		v2 = round(v2, u64(b[8:16:len(b)]))
-		v3 = round(v3, u64(b[16:24:len(b)]))
-		v4 = round(v4, u64(b[24:32:len(b)]))
-		b = b[32:len(b):len(b)]
-	}
-	d.v1, d.v2, d.v3, d.v4 = v1, v2, v3, v4
-	return n - len(b)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_safe.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_safe.go
deleted file mode 100644
index 6f3b0cb10..000000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_safe.go
+++ /dev/null
@@ -1,11 +0,0 @@
-package xxhash
-
-// Sum64String computes the 64-bit xxHash digest of s.
-func Sum64String(s string) uint64 {
-	return Sum64([]byte(s))
-}
-
-// WriteString adds more data to d. It always returns len(s), nil.
-func (d *Digest) WriteString(s string) (n int, err error) {
-	return d.Write([]byte(s))
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go
deleted file mode 100644
index f41932b7a..000000000
--- a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go
+++ /dev/null
@@ -1,16 +0,0 @@
-//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-
-package zstd
-
-// matchLen returns how many bytes match in a and b
-//
-// It assumes that:
-//
-//	len(a) <= len(b) and len(a) > 0
-//
-//go:noescape
-func matchLen(a []byte, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s
deleted file mode 100644
index 0782b86e3..000000000
--- a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copied from S2 implementation.
-
-//go:build !appengine && !noasm && gc && !noasm
-
-#include "textflag.h"
-
-// func matchLen(a []byte, b []byte) int
-TEXT ·matchLen(SB), NOSPLIT, $0-56
-	MOVQ a_base+0(FP), AX
-	MOVQ b_base+24(FP), CX
-	MOVQ a_len+8(FP), DX
-
-	// matchLen
-	XORL SI, SI
-	CMPL DX, $0x08
-	JB   matchlen_match4_standalone
-
-matchlen_loopback_standalone:
-	MOVQ (AX)(SI*1), BX
-	XORQ (CX)(SI*1), BX
-	JZ   matchlen_loop_standalone
-
-#ifdef GOAMD64_v3
-	TZCNTQ BX, BX
-#else
-	BSFQ BX, BX
-#endif
-	SHRL $0x03, BX
-	LEAL (SI)(BX*1), SI
-	JMP  gen_match_len_end
-
-matchlen_loop_standalone:
-	LEAL -8(DX), DX
-	LEAL 8(SI), SI
-	CMPL DX, $0x08
-	JAE  matchlen_loopback_standalone
-
-matchlen_match4_standalone:
-	CMPL DX, $0x04
-	JB   matchlen_match2_standalone
-	MOVL (AX)(SI*1), BX
-	CMPL (CX)(SI*1), BX
-	JNE  matchlen_match2_standalone
-	LEAL -4(DX), DX
-	LEAL 4(SI), SI
-
-matchlen_match2_standalone:
-	CMPL DX, $0x02
-	JB   matchlen_match1_standalone
-	MOVW (AX)(SI*1), BX
-	CMPW (CX)(SI*1), BX
-	JNE  matchlen_match1_standalone
-	LEAL -2(DX), DX
-	LEAL 2(SI), SI
-
-matchlen_match1_standalone:
-	CMPL DX, $0x01
-	JB   gen_match_len_end
-	MOVB (AX)(SI*1), BL
-	CMPB (CX)(SI*1), BL
-	JNE  gen_match_len_end
-	INCL SI
-
-gen_match_len_end:
-	MOVQ SI, ret+48(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go
deleted file mode 100644
index bea1779e9..000000000
--- a/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go
+++ /dev/null
@@ -1,38 +0,0 @@
-//go:build !amd64 || appengine || !gc || noasm
-// +build !amd64 appengine !gc noasm
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-
-package zstd
-
-import (
-	"math/bits"
-
-	"github.com/klauspost/compress/internal/le"
-)
-
-// matchLen returns the maximum common prefix length of a and b.
-// a must be the shortest of the two.
-func matchLen(a, b []byte) (n int) {
-	left := len(a)
-	for left >= 8 {
-		diff := le.Load64(a, n) ^ le.Load64(b, n)
-		if diff != 0 {
-			return n + bits.TrailingZeros64(diff)>>3
-		}
-		n += 8
-		left -= 8
-	}
-	a = a[n:]
-	b = b[n:]
-
-	for i := range a {
-		if a[i] != b[i] {
-			break
-		}
-		n++
-	}
-	return n
-
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
deleted file mode 100644
index 9a7de82f9..000000000
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ /dev/null
@@ -1,503 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"errors"
-	"fmt"
-	"io"
-)
-
-type seq struct {
-	litLen   uint32
-	matchLen uint32
-	offset   uint32
-
-	// Codes are stored here for the encoder
-	// so they only have to be looked up once.
-	llCode, mlCode, ofCode uint8
-}
-
-type seqVals struct {
-	ll, ml, mo int
-}
-
-func (s seq) String() string {
-	if s.offset <= 3 {
-		if s.offset == 0 {
-			return fmt.Sprint("litLen:", s.litLen, ", matchLen:", s.matchLen+zstdMinMatch, ", offset: INVALID (0)")
-		}
-		return fmt.Sprint("litLen:", s.litLen, ", matchLen:", s.matchLen+zstdMinMatch, ", offset:", s.offset, " (repeat)")
-	}
-	return fmt.Sprint("litLen:", s.litLen, ", matchLen:", s.matchLen+zstdMinMatch, ", offset:", s.offset-3, " (new)")
-}
-
-type seqCompMode uint8
-
-const (
-	compModePredefined seqCompMode = iota
-	compModeRLE
-	compModeFSE
-	compModeRepeat
-)
-
-type sequenceDec struct {
-	// decoder keeps track of the current state and updates it from the bitstream.
-	fse    *fseDecoder
-	state  fseState
-	repeat bool
-}
-
-// init the state of the decoder with input from stream.
-func (s *sequenceDec) init(br *bitReader) error {
-	if s.fse == nil {
-		return errors.New("sequence decoder not defined")
-	}
-	s.state.init(br, s.fse.actualTableLog, s.fse.dt[:1<<s.fse.actualTableLog])
-	return nil
-}
-
-// sequenceDecs contains all 3 sequence decoders and their state.
-type sequenceDecs struct {
-	litLengths   sequenceDec
-	offsets      sequenceDec
-	matchLengths sequenceDec
-	prevOffset   [3]int
-	dict         []byte
-	literals     []byte
-	out          []byte
-	nSeqs        int
-	br           *bitReader
-	seqSize      int
-	windowSize   int
-	maxBits      uint8
-	maxSyncLen   uint64
-}
-
-// initialize all 3 decoders from the stream input.
-func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error {
-	if err := s.litLengths.init(br); err != nil {
-		return errors.New("litLengths:" + err.Error())
-	}
-	if err := s.offsets.init(br); err != nil {
-		return errors.New("offsets:" + err.Error())
-	}
-	if err := s.matchLengths.init(br); err != nil {
-		return errors.New("matchLengths:" + err.Error())
-	}
-	s.br = br
-	s.prevOffset = hist.recentOffsets
-	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
-	s.windowSize = hist.windowSize
-	s.out = out
-	s.dict = nil
-	if hist.dict != nil {
-		s.dict = hist.dict.content
-	}
-	return nil
-}
-
-func (s *sequenceDecs) freeDecoders() {
-	if f := s.litLengths.fse; f != nil && !f.preDefined {
-		fseDecoderPool.Put(f)
-		s.litLengths.fse = nil
-	}
-	if f := s.offsets.fse; f != nil && !f.preDefined {
-		fseDecoderPool.Put(f)
-		s.offsets.fse = nil
-	}
-	if f := s.matchLengths.fse; f != nil && !f.preDefined {
-		fseDecoderPool.Put(f)
-		s.matchLengths.fse = nil
-	}
-}
-
-// execute will execute the decoded sequence with the provided history.
-// The sequence must be evaluated before being sent.
-func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
-	if len(s.dict) == 0 {
-		return s.executeSimple(seqs, hist)
-	}
-
-	// Ensure we have enough output size...
-	if len(s.out)+s.seqSize > cap(s.out) {
-		addBytes := s.seqSize + len(s.out)
-		s.out = append(s.out, make([]byte, addBytes)...)
-		s.out = s.out[:len(s.out)-addBytes]
-	}
-
-	if debugDecoder {
-		printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize)
-	}
-
-	var t = len(s.out)
-	out := s.out[:t+s.seqSize]
-
-	for _, seq := range seqs {
-		// Add literals
-		copy(out[t:], s.literals[:seq.ll])
-		t += seq.ll
-		s.literals = s.literals[seq.ll:]
-
-		// Copy from dictionary...
-		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
-			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
-			}
-
-			// we may be in dictionary.
-			dictO := len(s.dict) - (seq.mo - (t + len(hist)))
-			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict))
-			}
-			end := dictO + seq.ml
-			if end > len(s.dict) {
-				n := len(s.dict) - dictO
-				copy(out[t:], s.dict[dictO:])
-				t += n
-				seq.ml -= n
-			} else {
-				copy(out[t:], s.dict[dictO:end])
-				t += end - dictO
-				continue
-			}
-		}
-
-		// Copy from history.
-		if v := seq.mo - t; v > 0 {
-			// v is the start position in history from end.
-			start := len(hist) - v
-			if seq.ml > v {
-				// Some goes into current block.
-				// Copy remainder of history
-				copy(out[t:], hist[start:])
-				t += v
-				seq.ml -= v
-			} else {
-				copy(out[t:], hist[start:start+seq.ml])
-				t += seq.ml
-				continue
-			}
-		}
-		// We must be in current buffer now
-		if seq.ml > 0 {
-			start := t - seq.mo
-			if seq.ml <= t-start {
-				// No overlap
-				copy(out[t:], out[start:start+seq.ml])
-				t += seq.ml
-				continue
-			} else {
-				// Overlapping copy
-				// Extend destination slice and copy one byte at the time.
-				src := out[start : start+seq.ml]
-				dst := out[t:]
-				dst = dst[:len(src)]
-				t += len(src)
-				// Destination is the space we just added.
-				for i := range src {
-					dst[i] = src[i]
-				}
-			}
-		}
-	}
-
-	// Add final literals
-	copy(out[t:], s.literals)
-	if debugDecoder {
-		t += len(s.literals)
-		if t != len(out) {
-			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
-		}
-	}
-	s.out = out
-
-	return nil
-}
-
-// decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decodeSync(hist []byte) error {
-	supported, err := s.decodeSyncSimple(hist)
-	if supported {
-		return err
-	}
-
-	br := s.br
-	seqs := s.nSeqs
-	startSize := len(s.out)
-	// Grab full sizes tables, to avoid bounds checks.
-	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
-	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-	out := s.out
-	maxBlockSize := maxCompressedBlockSize
-	if s.windowSize < maxBlockSize {
-		maxBlockSize = s.windowSize
-	}
-
-	if debugDecoder {
-		println("decodeSync: decoding", seqs, "sequences", br.remain(), "bits remain on stream")
-	}
-	for i := seqs - 1; i >= 0; i-- {
-		if br.overread() {
-			printf("reading sequence %d, exceeded available data. Overread by %d\n", seqs-i, -br.remain())
-			return io.ErrUnexpectedEOF
-		}
-		var ll, mo, ml int
-		if br.cursor > 4+((maxOffsetBits+16+16)>>3) {
-			// inlined function:
-			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
-
-			// Final will not read from stream.
-			var llB, mlB, moB uint8
-			ll, llB = llState.final()
-			ml, mlB = mlState.final()
-			mo, moB = ofState.final()
-
-			// extra bits are stored in reverse order.
-			br.fillFast()
-			mo += br.getBits(moB)
-			if s.maxBits > 32 {
-				br.fillFast()
-			}
-			ml += br.getBits(mlB)
-			ll += br.getBits(llB)
-
-			if moB > 1 {
-				s.prevOffset[2] = s.prevOffset[1]
-				s.prevOffset[1] = s.prevOffset[0]
-				s.prevOffset[0] = mo
-			} else {
-				// mo = s.adjustOffset(mo, ll, moB)
-				// Inlined for rather big speedup
-				if ll == 0 {
-					// There is an exception though, when current sequence's literals_length = 0.
-					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
-					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
-					mo++
-				}
-
-				if mo == 0 {
-					mo = s.prevOffset[0]
-				} else {
-					var temp int
-					if mo == 3 {
-						temp = s.prevOffset[0] - 1
-					} else {
-						temp = s.prevOffset[mo]
-					}
-
-					if temp == 0 {
-						// 0 is not valid; input is corrupted; force offset to 1
-						println("WARNING: temp was 0")
-						temp = 1
-					}
-
-					if mo != 1 {
-						s.prevOffset[2] = s.prevOffset[1]
-					}
-					s.prevOffset[1] = s.prevOffset[0]
-					s.prevOffset[0] = temp
-					mo = temp
-				}
-			}
-			br.fillFast()
-		} else {
-			ll, mo, ml = s.next(br, llState, mlState, ofState)
-			br.fill()
-		}
-
-		if debugSequences {
-			println("Seq", seqs-i-1, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
-		}
-
-		if ll > len(s.literals) {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
-		}
-		size := ll + ml + len(out)
-		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-		}
-		if size > cap(out) {
-			// Not enough size, which can happen under high volume block streaming conditions
-			// but could be if destination slice is too small for sync operations.
-			// over-allocating here can create a large amount of GC pressure so we try to keep
-			// it as contained as possible
-			used := len(out) - startSize
-			addBytes := 256 + ll + ml + used>>2
-			// Clamp to max block size.
-			if used+addBytes > maxBlockSize {
-				addBytes = maxBlockSize - used
-			}
-			out = append(out, make([]byte, addBytes)...)
-			out = out[:len(out)-addBytes]
-		}
-		if ml > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
-		}
-
-		// Add literals
-		out = append(out, s.literals[:ll]...)
-		s.literals = s.literals[ll:]
-
-		if mo == 0 && ml > 0 {
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
-		}
-
-		if mo > len(out)+len(hist) || mo > s.windowSize {
-			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
-			}
-
-			// we may be in dictionary.
-			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
-			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
-			}
-			end := dictO + ml
-			if end > len(s.dict) {
-				out = append(out, s.dict[dictO:]...)
-				ml -= len(s.dict) - dictO
-			} else {
-				out = append(out, s.dict[dictO:end]...)
-				mo = 0
-				ml = 0
-			}
-		}
-
-		// Copy from history.
-		// TODO: Blocks without history could be made to ignore this completely.
-		if v := mo - len(out); v > 0 {
-			// v is the start position in history from end.
-			start := len(hist) - v
-			if ml > v {
-				// Some goes into current block.
-				// Copy remainder of history
-				out = append(out, hist[start:]...)
-				ml -= v
-			} else {
-				out = append(out, hist[start:start+ml]...)
-				ml = 0
-			}
-		}
-		// We must be in current buffer now
-		if ml > 0 {
-			start := len(out) - mo
-			if ml <= len(out)-start {
-				// No overlap
-				out = append(out, out[start:start+ml]...)
-			} else {
-				// Overlapping copy
-				// Extend destination slice and copy one byte at the time.
-				out = out[:len(out)+ml]
-				src := out[start : start+ml]
-				// Destination is the space we just added.
-				dst := out[len(out)-ml:]
-				dst = dst[:len(src)]
-				for i := range src {
-					dst[i] = src[i]
-				}
-			}
-		}
-		if i == 0 {
-			// This is the last sequence, so we shouldn't update state.
-			break
-		}
-
-		// Manually inlined, ~ 5-20% faster
-		// Update all 3 states at once. Approx 20% faster.
-		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
-		if nBits == 0 {
-			llState = llTable[llState.newState()&maxTableMask]
-			mlState = mlTable[mlState.newState()&maxTableMask]
-			ofState = ofTable[ofState.newState()&maxTableMask]
-		} else {
-			bits := br.get32BitsFast(nBits)
-
-			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
-			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits >> (ofState.nbBits() & 31))
-			lowBits &= bitMask[mlState.nbBits()&15]
-			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
-			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
-		}
-	}
-
-	if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
-		return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-	}
-
-	// Add final literals
-	s.out = append(out, s.literals...)
-	return br.close()
-}
-
-var bitMask [16]uint16
-
-func init() {
-	for i := range bitMask[:] {
-		bitMask[i] = uint16((1 << uint(i)) - 1)
-	}
-}
-
-func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
-	// Final will not read from stream.
-	ll, llB := llState.final()
-	ml, mlB := mlState.final()
-	mo, moB := ofState.final()
-
-	// extra bits are stored in reverse order.
-	br.fill()
-	mo += br.getBits(moB)
-	if s.maxBits > 32 {
-		br.fill()
-	}
-	// matchlength+literal length, max 32 bits
-	ml += br.getBits(mlB)
-	ll += br.getBits(llB)
-	mo = s.adjustOffset(mo, ll, moB)
-	return
-}
-
-func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int {
-	if offsetB > 1 {
-		s.prevOffset[2] = s.prevOffset[1]
-		s.prevOffset[1] = s.prevOffset[0]
-		s.prevOffset[0] = offset
-		return offset
-	}
-
-	if litLen == 0 {
-		// There is an exception though, when current sequence's literals_length = 0.
-		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
-		// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
-		offset++
-	}
-
-	if offset == 0 {
-		return s.prevOffset[0]
-	}
-	var temp int
-	if offset == 3 {
-		temp = s.prevOffset[0] - 1
-	} else {
-		temp = s.prevOffset[offset]
-	}
-
-	if temp == 0 {
-		// 0 is not valid; input is corrupted; force offset to 1
-		println("temp was 0")
-		temp = 1
-	}
-
-	if offset != 1 {
-		s.prevOffset[2] = s.prevOffset[1]
-	}
-	s.prevOffset[1] = s.prevOffset[0]
-	s.prevOffset[0] = temp
-	return temp
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
deleted file mode 100644
index c59f17e07..000000000
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
+++ /dev/null
@@ -1,394 +0,0 @@
-//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
-
-package zstd
-
-import (
-	"fmt"
-	"io"
-
-	"github.com/klauspost/compress/internal/cpuinfo"
-)
-
-type decodeSyncAsmContext struct {
-	llTable     []decSymbol
-	mlTable     []decSymbol
-	ofTable     []decSymbol
-	llState     uint64
-	mlState     uint64
-	ofState     uint64
-	iteration   int
-	litRemain   int
-	out         []byte
-	outPosition int
-	literals    []byte
-	litPosition int
-	history     []byte
-	windowSize  int
-	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
-	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
-	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
-}
-
-// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
-//
-// Please refer to seqdec_generic.go for the reference implementation.
-//
-//go:noescape
-func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-
-// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
-//
-//go:noescape
-func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-
-// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
-//
-//go:noescape
-func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-
-// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
-//
-//go:noescape
-func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-
-// decode sequences from the stream with the provided history but without a dictionary.
-func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
-	if len(s.dict) > 0 {
-		return false, nil
-	}
-	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
-		return false, nil
-	}
-
-	// FIXME: Using unsafe memory copies leads to rare, random crashes
-	// with fuzz testing. It is therefore disabled for now.
-	const useSafe = true
-	/*
-		useSafe := false
-		if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
-			useSafe = true
-		}
-		if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
-			useSafe = true
-		}
-		if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
-			useSafe = true
-		}
-	*/
-
-	br := s.br
-
-	maxBlockSize := maxCompressedBlockSize
-	if s.windowSize < maxBlockSize {
-		maxBlockSize = s.windowSize
-	}
-
-	ctx := decodeSyncAsmContext{
-		llTable:     s.litLengths.fse.dt[:maxTablesize],
-		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
-		ofTable:     s.offsets.fse.dt[:maxTablesize],
-		llState:     uint64(s.litLengths.state.state),
-		mlState:     uint64(s.matchLengths.state.state),
-		ofState:     uint64(s.offsets.state.state),
-		iteration:   s.nSeqs - 1,
-		litRemain:   len(s.literals),
-		out:         s.out,
-		outPosition: len(s.out),
-		literals:    s.literals,
-		windowSize:  s.windowSize,
-		history:     hist,
-	}
-
-	s.seqSize = 0
-	startSize := len(s.out)
-
-	var errCode int
-	if cpuinfo.HasBMI2() {
-		if useSafe {
-			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
-		} else {
-			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
-		}
-	} else {
-		if useSafe {
-			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
-		} else {
-			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
-		}
-	}
-	switch errCode {
-	case noError:
-		break
-
-	case errorMatchLenOfsMismatch:
-		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
-
-	case errorMatchLenTooBig:
-		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
-
-	case errorMatchOffTooBig:
-		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
-			ctx.mo, ctx.outPosition+len(hist)-startSize)
-
-	case errorNotEnoughLiterals:
-		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
-			ctx.ll, ctx.litRemain+ctx.ll)
-
-	case errorOverread:
-		return true, io.ErrUnexpectedEOF
-
-	case errorNotEnoughSpace:
-		size := ctx.outPosition + ctx.ll + ctx.ml
-		if debugDecoder {
-			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
-		}
-		return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-
-	default:
-		return true, fmt.Errorf("sequenceDecs_decode returned erroneous code %d", errCode)
-	}
-
-	s.seqSize += ctx.litRemain
-	if s.seqSize > maxBlockSize {
-		return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-	}
-	err := br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-		return true, err
-	}
-
-	s.literals = s.literals[ctx.litPosition:]
-	t := ctx.outPosition
-	s.out = s.out[:t]
-
-	// Add final literals
-	s.out = append(s.out, s.literals...)
-	if debugDecoder {
-		t += len(s.literals)
-		if t != len(s.out) {
-			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
-		}
-	}
-
-	return true, nil
-}
-
-// --------------------------------------------------------------------------------
-
-type decodeAsmContext struct {
-	llTable   []decSymbol
-	mlTable   []decSymbol
-	ofTable   []decSymbol
-	llState   uint64
-	mlState   uint64
-	ofState   uint64
-	iteration int
-	seqs      []seqVals
-	litRemain int
-}
-
-const noError = 0
-
-// error reported when mo == 0 && ml > 0
-const errorMatchLenOfsMismatch = 1
-
-// error reported when ml > maxMatchLen
-const errorMatchLenTooBig = 2
-
-// error reported when mo > available history or mo > s.windowSize
-const errorMatchOffTooBig = 3
-
-// error reported when the sum of literal lengths exeeceds the literal buffer size
-const errorNotEnoughLiterals = 4
-
-// error reported when capacity of `out` is too small
-const errorNotEnoughSpace = 5
-
-// error reported when bits are overread.
-const errorOverread = 6
-
-// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
-//
-// Please refer to seqdec_generic.go for the reference implementation.
-//
-//go:noescape
-func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-
-// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
-//
-// Please refer to seqdec_generic.go for the reference implementation.
-//
-//go:noescape
-func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-
-// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
-//
-//go:noescape
-func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-
-// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
-//
-//go:noescape
-func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-
-// decode sequences from the stream without the provided history.
-func (s *sequenceDecs) decode(seqs []seqVals) error {
-	br := s.br
-
-	maxBlockSize := maxCompressedBlockSize
-	if s.windowSize < maxBlockSize {
-		maxBlockSize = s.windowSize
-	}
-
-	ctx := decodeAsmContext{
-		llTable:   s.litLengths.fse.dt[:maxTablesize],
-		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
-		ofTable:   s.offsets.fse.dt[:maxTablesize],
-		llState:   uint64(s.litLengths.state.state),
-		mlState:   uint64(s.matchLengths.state.state),
-		ofState:   uint64(s.offsets.state.state),
-		seqs:      seqs,
-		iteration: len(seqs) - 1,
-		litRemain: len(s.literals),
-	}
-
-	if debugDecoder {
-		println("decode: decoding", len(seqs), "sequences", br.remain(), "bits remain on stream")
-	}
-
-	s.seqSize = 0
-	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
-	var errCode int
-	if cpuinfo.HasBMI2() {
-		if lte56bits {
-			errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
-		} else {
-			errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
-		}
-	} else {
-		if lte56bits {
-			errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
-		} else {
-			errCode = sequenceDecs_decode_amd64(s, br, &ctx)
-		}
-	}
-	if errCode != 0 {
-		i := len(seqs) - ctx.iteration - 1
-		switch errCode {
-		case errorMatchLenOfsMismatch:
-			ml := ctx.seqs[i].ml
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
-
-		case errorMatchLenTooBig:
-			ml := ctx.seqs[i].ml
-			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
-
-		case errorNotEnoughLiterals:
-			ll := ctx.seqs[i].ll
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
-		case errorOverread:
-			return io.ErrUnexpectedEOF
-		}
-
-		return fmt.Errorf("sequenceDecs_decode_amd64 returned erroneous code %d", errCode)
-	}
-
-	if ctx.litRemain < 0 {
-		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
-			len(s.literals), len(s.literals)-ctx.litRemain)
-	}
-
-	s.seqSize += ctx.litRemain
-	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-	}
-	if debugDecoder {
-		println("decode: ", br.remain(), "bits remain on stream. code:", errCode)
-	}
-	err := br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
-	return err
-}
-
-// --------------------------------------------------------------------------------
-
-type executeAsmContext struct {
-	seqs        []seqVals
-	seqIndex    int
-	out         []byte
-	history     []byte
-	literals    []byte
-	outPosition int
-	litPosition int
-	windowSize  int
-}
-
-// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
-//
-// Returns false if a match offset is too big.
-//
-// Please refer to seqdec_generic.go for the reference implementation.
-//
-//go:noescape
-func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
-
-// Same as above, but with safe memcopies
-//
-//go:noescape
-func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
-
-// executeSimple handles cases when dictionary is not used.
-func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
-	// Ensure we have enough output size...
-	if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
-		addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
-		s.out = append(s.out, make([]byte, addBytes)...)
-		s.out = s.out[:len(s.out)-addBytes]
-	}
-
-	if debugDecoder {
-		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
-	}
-
-	var t = len(s.out)
-	out := s.out[:t+s.seqSize]
-
-	ctx := executeAsmContext{
-		seqs:        seqs,
-		seqIndex:    0,
-		out:         out,
-		history:     hist,
-		outPosition: t,
-		litPosition: 0,
-		literals:    s.literals,
-		windowSize:  s.windowSize,
-	}
-	var ok bool
-	if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
-		ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
-	} else {
-		ok = sequenceDecs_executeSimple_amd64(&ctx)
-	}
-	if !ok {
-		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
-			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
-	}
-	s.literals = s.literals[ctx.litPosition:]
-	t = ctx.outPosition
-
-	// Add final literals
-	copy(out[t:], s.literals)
-	if debugDecoder {
-		t += len(s.literals)
-		if t != len(out) {
-			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
-		}
-	}
-	s.out = out
-
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
deleted file mode 100644
index a708ca6d3..000000000
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ /dev/null
@@ -1,4151 +0,0 @@
-// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
-
-//go:build !appengine && !noasm && gc && !noasm
-
-// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: CMOV
-TEXT ·sequenceDecs_decode_amd64(SB), $8-32
-	MOVQ    br+8(FP), CX
-	MOVQ    24(CX), DX
-	MOVBQZX 40(CX), BX
-	MOVQ    (CX), AX
-	MOVQ    32(CX), SI
-	ADDQ    SI, AX
-	MOVQ    AX, (SP)
-	MOVQ    ctx+16(FP), AX
-	MOVQ    72(AX), DI
-	MOVQ    80(AX), R8
-	MOVQ    88(AX), R9
-	MOVQ    104(AX), R10
-	MOVQ    s+0(FP), AX
-	MOVQ    144(AX), R11
-	MOVQ    152(AX), R12
-	MOVQ    160(AX), R13
-
-sequenceDecs_decode_amd64_main_loop:
-	MOVQ (SP), R14
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R14
-	MOVQ (R14), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decode_amd64_fill_end
-
-sequenceDecs_decode_amd64_fill_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decode_amd64_fill_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decode_amd64_fill_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R14
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R14), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
-
-sequenceDecs_decode_amd64_fill_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decode_amd64_fill_end:
-	// Update offset
-	MOVQ  R9, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R15
-	SHLQ  CL, R15
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decode_amd64_of_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decode_amd64_of_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decode_amd64_of_update_zero
-	NEGQ  CX
-	SHRQ  CL, R15
-	ADDQ  R15, AX
-
-sequenceDecs_decode_amd64_of_update_zero:
-	MOVQ AX, 16(R10)
-
-	// Update match length
-	MOVQ  R8, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R15
-	SHLQ  CL, R15
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decode_amd64_ml_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decode_amd64_ml_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decode_amd64_ml_update_zero
-	NEGQ  CX
-	SHRQ  CL, R15
-	ADDQ  R15, AX
-
-sequenceDecs_decode_amd64_ml_update_zero:
-	MOVQ AX, 8(R10)
-
-	// Fill bitreader to have enough for the remaining
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R14
-	MOVQ (R14), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decode_amd64_fill_2_end
-
-sequenceDecs_decode_amd64_fill_2_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decode_amd64_fill_2_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R14
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R14), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
-
-sequenceDecs_decode_amd64_fill_2_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decode_amd64_fill_2_end:
-	// Update literal length
-	MOVQ  DI, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R15
-	SHLQ  CL, R15
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decode_amd64_ll_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decode_amd64_ll_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decode_amd64_ll_update_zero
-	NEGQ  CX
-	SHRQ  CL, R15
-	ADDQ  R15, AX
-
-sequenceDecs_decode_amd64_ll_update_zero:
-	MOVQ AX, (R10)
-
-	// Fill bitreader for state updates
-	MOVQ    R14, (SP)
-	MOVQ    R9, AX
-	SHRQ    $0x08, AX
-	MOVBQZX AL, AX
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decode_amd64_skip_update
-
-	// Update Literal Length State
-	MOVBQZX DI, R14
-	SHRL    $0x10, DI
-	LEAQ    (BX)(R14*1), CX
-	MOVQ    DX, R15
-	MOVQ    CX, BX
-	ROLQ    CL, R15
-	MOVL    $0x00000001, BP
-	MOVB    R14, CL
-	SHLL    CL, BP
-	DECL    BP
-	ANDQ    BP, R15
-	ADDQ    R15, DI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Match Length State
-	MOVBQZX R8, R14
-	SHRL    $0x10, R8
-	LEAQ    (BX)(R14*1), CX
-	MOVQ    DX, R15
-	MOVQ    CX, BX
-	ROLQ    CL, R15
-	MOVL    $0x00000001, BP
-	MOVB    R14, CL
-	SHLL    CL, BP
-	DECL    BP
-	ANDQ    BP, R15
-	ADDQ    R15, R8
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Offset State
-	MOVBQZX R9, R14
-	SHRL    $0x10, R9
-	LEAQ    (BX)(R14*1), CX
-	MOVQ    DX, R15
-	MOVQ    CX, BX
-	ROLQ    CL, R15
-	MOVL    $0x00000001, BP
-	MOVB    R14, CL
-	SHLL    CL, BP
-	DECL    BP
-	ANDQ    BP, R15
-	ADDQ    R15, R9
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R9*8), R9
-
-sequenceDecs_decode_amd64_skip_update:
-	// Adjust offset
-	MOVQ 16(R10), CX
-	CMPQ AX, $0x01
-	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
-	MOVQ R12, R13
-	MOVQ R11, R12
-	MOVQ CX, R11
-	JMP  sequenceDecs_decode_amd64_after_adjust
-
-sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
-	CMPQ (R10), $0x00000000
-	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
-	INCQ CX
-	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
-
-sequenceDecs_decode_amd64_adjust_offset_maybezero:
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
-	MOVQ  R11, CX
-	JMP   sequenceDecs_decode_amd64_after_adjust
-
-sequenceDecs_decode_amd64_adjust_offset_nonzero:
-	CMPQ CX, $0x01
-	JB   sequenceDecs_decode_amd64_adjust_zero
-	JEQ  sequenceDecs_decode_amd64_adjust_one
-	CMPQ CX, $0x02
-	JA   sequenceDecs_decode_amd64_adjust_three
-	JMP  sequenceDecs_decode_amd64_adjust_two
-
-sequenceDecs_decode_amd64_adjust_zero:
-	MOVQ R11, AX
-	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_amd64_adjust_one:
-	MOVQ R12, AX
-	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_amd64_adjust_two:
-	MOVQ R13, AX
-	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_amd64_adjust_three:
-	LEAQ -1(R11), AX
-
-sequenceDecs_decode_amd64_adjust_test_temp_valid:
-	TESTQ AX, AX
-	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
-	MOVQ  $0x00000001, AX
-
-sequenceDecs_decode_amd64_adjust_temp_valid:
-	CMPQ    CX, $0x01
-	CMOVQNE R12, R13
-	MOVQ    R11, R12
-	MOVQ    AX, R11
-	MOVQ    AX, CX
-
-sequenceDecs_decode_amd64_after_adjust:
-	MOVQ CX, 16(R10)
-
-	// Check values
-	MOVQ  8(R10), AX
-	MOVQ  (R10), R14
-	LEAQ  (AX)(R14*1), R15
-	MOVQ  s+0(FP), BP
-	ADDQ  R15, 256(BP)
-	MOVQ  ctx+16(FP), R15
-	SUBQ  R14, 128(R15)
-	JS    error_not_enough_literals
-	CMPQ  AX, $0x00020002
-	JA    sequenceDecs_decode_amd64_error_match_len_too_big
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
-	TESTQ AX, AX
-	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
-
-sequenceDecs_decode_amd64_match_len_ofs_ok:
-	ADDQ $0x18, R10
-	MOVQ ctx+16(FP), AX
-	DECQ 96(AX)
-	JNS  sequenceDecs_decode_amd64_main_loop
-	MOVQ s+0(FP), AX
-	MOVQ R11, 144(AX)
-	MOVQ R12, 152(AX)
-	MOVQ R13, 160(AX)
-	MOVQ br+8(FP), AX
-	MOVQ DX, 24(AX)
-	MOVB BL, 40(AX)
-	MOVQ SI, 32(AX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decode_amd64_error_match_len_too_big:
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: CMOV
-TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
-	MOVQ    br+8(FP), CX
-	MOVQ    24(CX), DX
-	MOVBQZX 40(CX), BX
-	MOVQ    (CX), AX
-	MOVQ    32(CX), SI
-	ADDQ    SI, AX
-	MOVQ    AX, (SP)
-	MOVQ    ctx+16(FP), AX
-	MOVQ    72(AX), DI
-	MOVQ    80(AX), R8
-	MOVQ    88(AX), R9
-	MOVQ    104(AX), R10
-	MOVQ    s+0(FP), AX
-	MOVQ    144(AX), R11
-	MOVQ    152(AX), R12
-	MOVQ    160(AX), R13
-
-sequenceDecs_decode_56_amd64_main_loop:
-	MOVQ (SP), R14
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R14
-	MOVQ (R14), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decode_56_amd64_fill_end
-
-sequenceDecs_decode_56_amd64_fill_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decode_56_amd64_fill_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decode_56_amd64_fill_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R14
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R14), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
-
-sequenceDecs_decode_56_amd64_fill_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decode_56_amd64_fill_end:
-	// Update offset
-	MOVQ  R9, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R15
-	SHLQ  CL, R15
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decode_56_amd64_of_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decode_56_amd64_of_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decode_56_amd64_of_update_zero
-	NEGQ  CX
-	SHRQ  CL, R15
-	ADDQ  R15, AX
-
-sequenceDecs_decode_56_amd64_of_update_zero:
-	MOVQ AX, 16(R10)
-
-	// Update match length
-	MOVQ  R8, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R15
-	SHLQ  CL, R15
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decode_56_amd64_ml_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
-	NEGQ  CX
-	SHRQ  CL, R15
-	ADDQ  R15, AX
-
-sequenceDecs_decode_56_amd64_ml_update_zero:
-	MOVQ AX, 8(R10)
-
-	// Update literal length
-	MOVQ  DI, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R15
-	SHLQ  CL, R15
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decode_56_amd64_ll_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
-	NEGQ  CX
-	SHRQ  CL, R15
-	ADDQ  R15, AX
-
-sequenceDecs_decode_56_amd64_ll_update_zero:
-	MOVQ AX, (R10)
-
-	// Fill bitreader for state updates
-	MOVQ    R14, (SP)
-	MOVQ    R9, AX
-	SHRQ    $0x08, AX
-	MOVBQZX AL, AX
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decode_56_amd64_skip_update
-
-	// Update Literal Length State
-	MOVBQZX DI, R14
-	SHRL    $0x10, DI
-	LEAQ    (BX)(R14*1), CX
-	MOVQ    DX, R15
-	MOVQ    CX, BX
-	ROLQ    CL, R15
-	MOVL    $0x00000001, BP
-	MOVB    R14, CL
-	SHLL    CL, BP
-	DECL    BP
-	ANDQ    BP, R15
-	ADDQ    R15, DI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Match Length State
-	MOVBQZX R8, R14
-	SHRL    $0x10, R8
-	LEAQ    (BX)(R14*1), CX
-	MOVQ    DX, R15
-	MOVQ    CX, BX
-	ROLQ    CL, R15
-	MOVL    $0x00000001, BP
-	MOVB    R14, CL
-	SHLL    CL, BP
-	DECL    BP
-	ANDQ    BP, R15
-	ADDQ    R15, R8
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Offset State
-	MOVBQZX R9, R14
-	SHRL    $0x10, R9
-	LEAQ    (BX)(R14*1), CX
-	MOVQ    DX, R15
-	MOVQ    CX, BX
-	ROLQ    CL, R15
-	MOVL    $0x00000001, BP
-	MOVB    R14, CL
-	SHLL    CL, BP
-	DECL    BP
-	ANDQ    BP, R15
-	ADDQ    R15, R9
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R9*8), R9
-
-sequenceDecs_decode_56_amd64_skip_update:
-	// Adjust offset
-	MOVQ 16(R10), CX
-	CMPQ AX, $0x01
-	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
-	MOVQ R12, R13
-	MOVQ R11, R12
-	MOVQ CX, R11
-	JMP  sequenceDecs_decode_56_amd64_after_adjust
-
-sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
-	CMPQ (R10), $0x00000000
-	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
-	INCQ CX
-	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
-
-sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
-	MOVQ  R11, CX
-	JMP   sequenceDecs_decode_56_amd64_after_adjust
-
-sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
-	CMPQ CX, $0x01
-	JB   sequenceDecs_decode_56_amd64_adjust_zero
-	JEQ  sequenceDecs_decode_56_amd64_adjust_one
-	CMPQ CX, $0x02
-	JA   sequenceDecs_decode_56_amd64_adjust_three
-	JMP  sequenceDecs_decode_56_amd64_adjust_two
-
-sequenceDecs_decode_56_amd64_adjust_zero:
-	MOVQ R11, AX
-	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_56_amd64_adjust_one:
-	MOVQ R12, AX
-	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_56_amd64_adjust_two:
-	MOVQ R13, AX
-	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_56_amd64_adjust_three:
-	LEAQ -1(R11), AX
-
-sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
-	TESTQ AX, AX
-	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
-	MOVQ  $0x00000001, AX
-
-sequenceDecs_decode_56_amd64_adjust_temp_valid:
-	CMPQ    CX, $0x01
-	CMOVQNE R12, R13
-	MOVQ    R11, R12
-	MOVQ    AX, R11
-	MOVQ    AX, CX
-
-sequenceDecs_decode_56_amd64_after_adjust:
-	MOVQ CX, 16(R10)
-
-	// Check values
-	MOVQ  8(R10), AX
-	MOVQ  (R10), R14
-	LEAQ  (AX)(R14*1), R15
-	MOVQ  s+0(FP), BP
-	ADDQ  R15, 256(BP)
-	MOVQ  ctx+16(FP), R15
-	SUBQ  R14, 128(R15)
-	JS    error_not_enough_literals
-	CMPQ  AX, $0x00020002
-	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
-	TESTQ AX, AX
-	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
-
-sequenceDecs_decode_56_amd64_match_len_ofs_ok:
-	ADDQ $0x18, R10
-	MOVQ ctx+16(FP), AX
-	DECQ 96(AX)
-	JNS  sequenceDecs_decode_56_amd64_main_loop
-	MOVQ s+0(FP), AX
-	MOVQ R11, 144(AX)
-	MOVQ R12, 152(AX)
-	MOVQ R13, 160(AX)
-	MOVQ br+8(FP), AX
-	MOVQ DX, 24(AX)
-	MOVB BL, 40(AX)
-	MOVQ SI, 32(AX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decode_56_amd64_error_match_len_too_big:
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: BMI, BMI2, CMOV
-TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
-	MOVQ    br+8(FP), BX
-	MOVQ    24(BX), AX
-	MOVBQZX 40(BX), DX
-	MOVQ    (BX), CX
-	MOVQ    32(BX), BX
-	ADDQ    BX, CX
-	MOVQ    CX, (SP)
-	MOVQ    ctx+16(FP), CX
-	MOVQ    72(CX), SI
-	MOVQ    80(CX), DI
-	MOVQ    88(CX), R8
-	MOVQ    104(CX), R9
-	MOVQ    s+0(FP), CX
-	MOVQ    144(CX), R10
-	MOVQ    152(CX), R11
-	MOVQ    160(CX), R12
-
-sequenceDecs_decode_bmi2_main_loop:
-	MOVQ (SP), R13
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R13
-	MOVQ (R13), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decode_bmi2_fill_end
-
-sequenceDecs_decode_bmi2_fill_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decode_bmi2_fill_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decode_bmi2_fill_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R13), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
-
-sequenceDecs_decode_bmi2_fill_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decode_bmi2_fill_end:
-	// Update offset
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, R8, R14
-	MOVQ   AX, R15
-	LEAQ   (DX)(R14*1), CX
-	ROLQ   CL, R15
-	BZHIQ  R14, R15, R15
-	MOVQ   CX, DX
-	MOVQ   R8, CX
-	SHRQ   $0x20, CX
-	ADDQ   R15, CX
-	MOVQ   CX, 16(R9)
-
-	// Update match length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, DI, R14
-	MOVQ   AX, R15
-	LEAQ   (DX)(R14*1), CX
-	ROLQ   CL, R15
-	BZHIQ  R14, R15, R15
-	MOVQ   CX, DX
-	MOVQ   DI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R15, CX
-	MOVQ   CX, 8(R9)
-
-	// Fill bitreader to have enough for the remaining
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R13
-	MOVQ (R13), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decode_bmi2_fill_2_end
-
-sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decode_bmi2_fill_2_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R13), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
-
-sequenceDecs_decode_bmi2_fill_2_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decode_bmi2_fill_2_end:
-	// Update literal length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, SI, R14
-	MOVQ   AX, R15
-	LEAQ   (DX)(R14*1), CX
-	ROLQ   CL, R15
-	BZHIQ  R14, R15, R15
-	MOVQ   CX, DX
-	MOVQ   SI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R15, CX
-	MOVQ   CX, (R9)
-
-	// Fill bitreader for state updates
-	MOVQ    R13, (SP)
-	MOVQ    $0x00000808, CX
-	BEXTRQ  CX, R8, R13
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decode_bmi2_skip_update
-	LEAQ    (SI)(DI*1), R14
-	ADDQ    R8, R14
-	MOVBQZX R14, R14
-	LEAQ    (DX)(R14*1), CX
-	MOVQ    AX, R15
-	MOVQ    CX, DX
-	ROLQ    CL, R15
-	BZHIQ   R14, R15, R15
-
-	// Update Offset State
-	BZHIQ R8, R15, CX
-	SHRXQ R8, R15, R15
-	SHRL  $0x10, R8
-	ADDQ  CX, R8
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Match Length State
-	BZHIQ DI, R15, CX
-	SHRXQ DI, R15, R15
-	SHRL  $0x10, DI
-	ADDQ  CX, DI
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Literal Length State
-	BZHIQ SI, R15, CX
-	SHRL  $0x10, SI
-	ADDQ  CX, SI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(SI*8), SI
-
-sequenceDecs_decode_bmi2_skip_update:
-	// Adjust offset
-	MOVQ 16(R9), CX
-	CMPQ R13, $0x01
-	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
-	MOVQ R11, R12
-	MOVQ R10, R11
-	MOVQ CX, R10
-	JMP  sequenceDecs_decode_bmi2_after_adjust
-
-sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
-	CMPQ (R9), $0x00000000
-	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
-	INCQ CX
-	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decode_bmi2_adjust_offset_maybezero:
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
-	MOVQ  R10, CX
-	JMP   sequenceDecs_decode_bmi2_after_adjust
-
-sequenceDecs_decode_bmi2_adjust_offset_nonzero:
-	CMPQ CX, $0x01
-	JB   sequenceDecs_decode_bmi2_adjust_zero
-	JEQ  sequenceDecs_decode_bmi2_adjust_one
-	CMPQ CX, $0x02
-	JA   sequenceDecs_decode_bmi2_adjust_three
-	JMP  sequenceDecs_decode_bmi2_adjust_two
-
-sequenceDecs_decode_bmi2_adjust_zero:
-	MOVQ R10, R13
-	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_bmi2_adjust_one:
-	MOVQ R11, R13
-	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_bmi2_adjust_two:
-	MOVQ R12, R13
-	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_bmi2_adjust_three:
-	LEAQ -1(R10), R13
-
-sequenceDecs_decode_bmi2_adjust_test_temp_valid:
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
-	MOVQ  $0x00000001, R13
-
-sequenceDecs_decode_bmi2_adjust_temp_valid:
-	CMPQ    CX, $0x01
-	CMOVQNE R11, R12
-	MOVQ    R10, R11
-	MOVQ    R13, R10
-	MOVQ    R13, CX
-
-sequenceDecs_decode_bmi2_after_adjust:
-	MOVQ CX, 16(R9)
-
-	// Check values
-	MOVQ  8(R9), R13
-	MOVQ  (R9), R14
-	LEAQ  (R13)(R14*1), R15
-	MOVQ  s+0(FP), BP
-	ADDQ  R15, 256(BP)
-	MOVQ  ctx+16(FP), R15
-	SUBQ  R14, 128(R15)
-	JS    error_not_enough_literals
-	CMPQ  R13, $0x00020002
-	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
-
-sequenceDecs_decode_bmi2_match_len_ofs_ok:
-	ADDQ $0x18, R9
-	MOVQ ctx+16(FP), CX
-	DECQ 96(CX)
-	JNS  sequenceDecs_decode_bmi2_main_loop
-	MOVQ s+0(FP), CX
-	MOVQ R10, 144(CX)
-	MOVQ R11, 152(CX)
-	MOVQ R12, 160(CX)
-	MOVQ br+8(FP), CX
-	MOVQ AX, 24(CX)
-	MOVB DL, 40(CX)
-	MOVQ BX, 32(CX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decode_bmi2_error_match_len_too_big:
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: BMI, BMI2, CMOV
-TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
-	MOVQ    br+8(FP), BX
-	MOVQ    24(BX), AX
-	MOVBQZX 40(BX), DX
-	MOVQ    (BX), CX
-	MOVQ    32(BX), BX
-	ADDQ    BX, CX
-	MOVQ    CX, (SP)
-	MOVQ    ctx+16(FP), CX
-	MOVQ    72(CX), SI
-	MOVQ    80(CX), DI
-	MOVQ    88(CX), R8
-	MOVQ    104(CX), R9
-	MOVQ    s+0(FP), CX
-	MOVQ    144(CX), R10
-	MOVQ    152(CX), R11
-	MOVQ    160(CX), R12
-
-sequenceDecs_decode_56_bmi2_main_loop:
-	MOVQ (SP), R13
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R13
-	MOVQ (R13), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decode_56_bmi2_fill_end
-
-sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decode_56_bmi2_fill_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R13), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
-
-sequenceDecs_decode_56_bmi2_fill_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decode_56_bmi2_fill_end:
-	// Update offset
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, R8, R14
-	MOVQ   AX, R15
-	LEAQ   (DX)(R14*1), CX
-	ROLQ   CL, R15
-	BZHIQ  R14, R15, R15
-	MOVQ   CX, DX
-	MOVQ   R8, CX
-	SHRQ   $0x20, CX
-	ADDQ   R15, CX
-	MOVQ   CX, 16(R9)
-
-	// Update match length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, DI, R14
-	MOVQ   AX, R15
-	LEAQ   (DX)(R14*1), CX
-	ROLQ   CL, R15
-	BZHIQ  R14, R15, R15
-	MOVQ   CX, DX
-	MOVQ   DI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R15, CX
-	MOVQ   CX, 8(R9)
-
-	// Update literal length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, SI, R14
-	MOVQ   AX, R15
-	LEAQ   (DX)(R14*1), CX
-	ROLQ   CL, R15
-	BZHIQ  R14, R15, R15
-	MOVQ   CX, DX
-	MOVQ   SI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R15, CX
-	MOVQ   CX, (R9)
-
-	// Fill bitreader for state updates
-	MOVQ    R13, (SP)
-	MOVQ    $0x00000808, CX
-	BEXTRQ  CX, R8, R13
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decode_56_bmi2_skip_update
-	LEAQ    (SI)(DI*1), R14
-	ADDQ    R8, R14
-	MOVBQZX R14, R14
-	LEAQ    (DX)(R14*1), CX
-	MOVQ    AX, R15
-	MOVQ    CX, DX
-	ROLQ    CL, R15
-	BZHIQ   R14, R15, R15
-
-	// Update Offset State
-	BZHIQ R8, R15, CX
-	SHRXQ R8, R15, R15
-	SHRL  $0x10, R8
-	ADDQ  CX, R8
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Match Length State
-	BZHIQ DI, R15, CX
-	SHRXQ DI, R15, R15
-	SHRL  $0x10, DI
-	ADDQ  CX, DI
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Literal Length State
-	BZHIQ SI, R15, CX
-	SHRL  $0x10, SI
-	ADDQ  CX, SI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(SI*8), SI
-
-sequenceDecs_decode_56_bmi2_skip_update:
-	// Adjust offset
-	MOVQ 16(R9), CX
-	CMPQ R13, $0x01
-	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
-	MOVQ R11, R12
-	MOVQ R10, R11
-	MOVQ CX, R10
-	JMP  sequenceDecs_decode_56_bmi2_after_adjust
-
-sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
-	CMPQ (R9), $0x00000000
-	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
-	INCQ CX
-	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
-	MOVQ  R10, CX
-	JMP   sequenceDecs_decode_56_bmi2_after_adjust
-
-sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
-	CMPQ CX, $0x01
-	JB   sequenceDecs_decode_56_bmi2_adjust_zero
-	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
-	CMPQ CX, $0x02
-	JA   sequenceDecs_decode_56_bmi2_adjust_three
-	JMP  sequenceDecs_decode_56_bmi2_adjust_two
-
-sequenceDecs_decode_56_bmi2_adjust_zero:
-	MOVQ R10, R13
-	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_56_bmi2_adjust_one:
-	MOVQ R11, R13
-	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_56_bmi2_adjust_two:
-	MOVQ R12, R13
-	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_56_bmi2_adjust_three:
-	LEAQ -1(R10), R13
-
-sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
-	MOVQ  $0x00000001, R13
-
-sequenceDecs_decode_56_bmi2_adjust_temp_valid:
-	CMPQ    CX, $0x01
-	CMOVQNE R11, R12
-	MOVQ    R10, R11
-	MOVQ    R13, R10
-	MOVQ    R13, CX
-
-sequenceDecs_decode_56_bmi2_after_adjust:
-	MOVQ CX, 16(R9)
-
-	// Check values
-	MOVQ  8(R9), R13
-	MOVQ  (R9), R14
-	LEAQ  (R13)(R14*1), R15
-	MOVQ  s+0(FP), BP
-	ADDQ  R15, 256(BP)
-	MOVQ  ctx+16(FP), R15
-	SUBQ  R14, 128(R15)
-	JS    error_not_enough_literals
-	CMPQ  R13, $0x00020002
-	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
-
-sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
-	ADDQ $0x18, R9
-	MOVQ ctx+16(FP), CX
-	DECQ 96(CX)
-	JNS  sequenceDecs_decode_56_bmi2_main_loop
-	MOVQ s+0(FP), CX
-	MOVQ R10, 144(CX)
-	MOVQ R11, 152(CX)
-	MOVQ R12, 160(CX)
-	MOVQ br+8(FP), CX
-	MOVQ AX, 24(CX)
-	MOVB DL, 40(CX)
-	MOVQ BX, 32(CX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decode_56_bmi2_error_match_len_too_big:
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
-// Requires: SSE
-TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
-	MOVQ  ctx+0(FP), R10
-	MOVQ  8(R10), CX
-	TESTQ CX, CX
-	JZ    empty_seqs
-	MOVQ  (R10), AX
-	MOVQ  24(R10), DX
-	MOVQ  32(R10), BX
-	MOVQ  80(R10), SI
-	MOVQ  104(R10), DI
-	MOVQ  120(R10), R8
-	MOVQ  56(R10), R9
-	MOVQ  64(R10), R10
-	ADDQ  R10, R9
-
-	// seqsBase += 24 * seqIndex
-	LEAQ (DX)(DX*2), R11
-	SHLQ $0x03, R11
-	ADDQ R11, AX
-
-	// outBase += outPosition
-	ADDQ DI, BX
-
-main_loop:
-	MOVQ (AX), R11
-	MOVQ 16(AX), R12
-	MOVQ 8(AX), R13
-
-	// Copy literals
-	TESTQ R11, R11
-	JZ    check_offset
-	XORQ  R14, R14
-
-copy_1:
-	MOVUPS (SI)(R14*1), X0
-	MOVUPS X0, (BX)(R14*1)
-	ADDQ   $0x10, R14
-	CMPQ   R14, R11
-	JB     copy_1
-	ADDQ   R11, SI
-	ADDQ   R11, BX
-	ADDQ   R11, DI
-
-	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
-	LEAQ (DI)(R10*1), R11
-	CMPQ R12, R11
-	JG   error_match_off_too_big
-	CMPQ R12, R8
-	JG   error_match_off_too_big
-
-	// Copy match from history
-	MOVQ R12, R11
-	SUBQ DI, R11
-	JLS  copy_match
-	MOVQ R9, R14
-	SUBQ R11, R14
-	CMPQ R13, R11
-	JG   copy_all_from_history
-	MOVQ R13, R11
-	SUBQ $0x10, R11
-	JB   copy_4_small
-
-copy_4_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (BX)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, BX
-	SUBQ   $0x10, R11
-	JAE    copy_4_loop
-	LEAQ   16(R14)(R11*1), R14
-	LEAQ   16(BX)(R11*1), BX
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(BX)
-	JMP    copy_4_end
-
-copy_4_small:
-	CMPQ R13, $0x03
-	JE   copy_4_move_3
-	CMPQ R13, $0x08
-	JB   copy_4_move_4through7
-	JMP  copy_4_move_8through16
-
-copy_4_move_3:
-	MOVW (R14), R11
-	MOVB 2(R14), R12
-	MOVW R11, (BX)
-	MOVB R12, 2(BX)
-	ADDQ R13, R14
-	ADDQ R13, BX
-	JMP  copy_4_end
-
-copy_4_move_4through7:
-	MOVL (R14), R11
-	MOVL -4(R14)(R13*1), R12
-	MOVL R11, (BX)
-	MOVL R12, -4(BX)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, BX
-	JMP  copy_4_end
-
-copy_4_move_8through16:
-	MOVQ (R14), R11
-	MOVQ -8(R14)(R13*1), R12
-	MOVQ R11, (BX)
-	MOVQ R12, -8(BX)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, BX
-
-copy_4_end:
-	ADDQ R13, DI
-	ADDQ $0x18, AX
-	INCQ DX
-	CMPQ DX, CX
-	JB   main_loop
-	JMP  loop_finished
-
-copy_all_from_history:
-	MOVQ R11, R15
-	SUBQ $0x10, R15
-	JB   copy_5_small
-
-copy_5_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (BX)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, BX
-	SUBQ   $0x10, R15
-	JAE    copy_5_loop
-	LEAQ   16(R14)(R15*1), R14
-	LEAQ   16(BX)(R15*1), BX
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(BX)
-	JMP    copy_5_end
-
-copy_5_small:
-	CMPQ R11, $0x03
-	JE   copy_5_move_3
-	JB   copy_5_move_1or2
-	CMPQ R11, $0x08
-	JB   copy_5_move_4through7
-	JMP  copy_5_move_8through16
-
-copy_5_move_1or2:
-	MOVB (R14), R15
-	MOVB -1(R14)(R11*1), BP
-	MOVB R15, (BX)
-	MOVB BP, -1(BX)(R11*1)
-	ADDQ R11, R14
-	ADDQ R11, BX
-	JMP  copy_5_end
-
-copy_5_move_3:
-	MOVW (R14), R15
-	MOVB 2(R14), BP
-	MOVW R15, (BX)
-	MOVB BP, 2(BX)
-	ADDQ R11, R14
-	ADDQ R11, BX
-	JMP  copy_5_end
-
-copy_5_move_4through7:
-	MOVL (R14), R15
-	MOVL -4(R14)(R11*1), BP
-	MOVL R15, (BX)
-	MOVL BP, -4(BX)(R11*1)
-	ADDQ R11, R14
-	ADDQ R11, BX
-	JMP  copy_5_end
-
-copy_5_move_8through16:
-	MOVQ (R14), R15
-	MOVQ -8(R14)(R11*1), BP
-	MOVQ R15, (BX)
-	MOVQ BP, -8(BX)(R11*1)
-	ADDQ R11, R14
-	ADDQ R11, BX
-
-copy_5_end:
-	ADDQ R11, DI
-	SUBQ R11, R13
-
-	// Copy match from the current buffer
-copy_match:
-	MOVQ BX, R11
-	SUBQ R12, R11
-
-	// ml <= mo
-	CMPQ R13, R12
-	JA   copy_overlapping_match
-
-	// Copy non-overlapping match
-	ADDQ R13, DI
-	MOVQ BX, R12
-	ADDQ R13, BX
-
-copy_2:
-	MOVUPS (R11), X0
-	MOVUPS X0, (R12)
-	ADDQ   $0x10, R11
-	ADDQ   $0x10, R12
-	SUBQ   $0x10, R13
-	JHI    copy_2
-	JMP    handle_loop
-
-	// Copy overlapping match
-copy_overlapping_match:
-	ADDQ R13, DI
-
-copy_slow_3:
-	MOVB (R11), R12
-	MOVB R12, (BX)
-	INCQ R11
-	INCQ BX
-	DECQ R13
-	JNZ  copy_slow_3
-
-handle_loop:
-	ADDQ $0x18, AX
-	INCQ DX
-	CMPQ DX, CX
-	JB   main_loop
-
-loop_finished:
-	// Return value
-	MOVB $0x01, ret+8(FP)
-
-	// Update the context
-	MOVQ ctx+0(FP), AX
-	MOVQ DX, 24(AX)
-	MOVQ DI, 104(AX)
-	SUBQ 80(AX), SI
-	MOVQ SI, 112(AX)
-	RET
-
-error_match_off_too_big:
-	// Return value
-	MOVB $0x00, ret+8(FP)
-
-	// Update the context
-	MOVQ ctx+0(FP), AX
-	MOVQ DX, 24(AX)
-	MOVQ DI, 104(AX)
-	SUBQ 80(AX), SI
-	MOVQ SI, 112(AX)
-	RET
-
-empty_seqs:
-	// Return value
-	MOVB $0x01, ret+8(FP)
-	RET
-
-// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
-// Requires: SSE
-TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
-	MOVQ  ctx+0(FP), R10
-	MOVQ  8(R10), CX
-	TESTQ CX, CX
-	JZ    empty_seqs
-	MOVQ  (R10), AX
-	MOVQ  24(R10), DX
-	MOVQ  32(R10), BX
-	MOVQ  80(R10), SI
-	MOVQ  104(R10), DI
-	MOVQ  120(R10), R8
-	MOVQ  56(R10), R9
-	MOVQ  64(R10), R10
-	ADDQ  R10, R9
-
-	// seqsBase += 24 * seqIndex
-	LEAQ (DX)(DX*2), R11
-	SHLQ $0x03, R11
-	ADDQ R11, AX
-
-	// outBase += outPosition
-	ADDQ DI, BX
-
-main_loop:
-	MOVQ (AX), R11
-	MOVQ 16(AX), R12
-	MOVQ 8(AX), R13
-
-	// Copy literals
-	TESTQ R11, R11
-	JZ    check_offset
-	MOVQ  R11, R14
-	SUBQ  $0x10, R14
-	JB    copy_1_small
-
-copy_1_loop:
-	MOVUPS (SI), X0
-	MOVUPS X0, (BX)
-	ADDQ   $0x10, SI
-	ADDQ   $0x10, BX
-	SUBQ   $0x10, R14
-	JAE    copy_1_loop
-	LEAQ   16(SI)(R14*1), SI
-	LEAQ   16(BX)(R14*1), BX
-	MOVUPS -16(SI), X0
-	MOVUPS X0, -16(BX)
-	JMP    copy_1_end
-
-copy_1_small:
-	CMPQ R11, $0x03
-	JE   copy_1_move_3
-	JB   copy_1_move_1or2
-	CMPQ R11, $0x08
-	JB   copy_1_move_4through7
-	JMP  copy_1_move_8through16
-
-copy_1_move_1or2:
-	MOVB (SI), R14
-	MOVB -1(SI)(R11*1), R15
-	MOVB R14, (BX)
-	MOVB R15, -1(BX)(R11*1)
-	ADDQ R11, SI
-	ADDQ R11, BX
-	JMP  copy_1_end
-
-copy_1_move_3:
-	MOVW (SI), R14
-	MOVB 2(SI), R15
-	MOVW R14, (BX)
-	MOVB R15, 2(BX)
-	ADDQ R11, SI
-	ADDQ R11, BX
-	JMP  copy_1_end
-
-copy_1_move_4through7:
-	MOVL (SI), R14
-	MOVL -4(SI)(R11*1), R15
-	MOVL R14, (BX)
-	MOVL R15, -4(BX)(R11*1)
-	ADDQ R11, SI
-	ADDQ R11, BX
-	JMP  copy_1_end
-
-copy_1_move_8through16:
-	MOVQ (SI), R14
-	MOVQ -8(SI)(R11*1), R15
-	MOVQ R14, (BX)
-	MOVQ R15, -8(BX)(R11*1)
-	ADDQ R11, SI
-	ADDQ R11, BX
-
-copy_1_end:
-	ADDQ R11, DI
-
-	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
-	LEAQ (DI)(R10*1), R11
-	CMPQ R12, R11
-	JG   error_match_off_too_big
-	CMPQ R12, R8
-	JG   error_match_off_too_big
-
-	// Copy match from history
-	MOVQ R12, R11
-	SUBQ DI, R11
-	JLS  copy_match
-	MOVQ R9, R14
-	SUBQ R11, R14
-	CMPQ R13, R11
-	JG   copy_all_from_history
-	MOVQ R13, R11
-	SUBQ $0x10, R11
-	JB   copy_4_small
-
-copy_4_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (BX)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, BX
-	SUBQ   $0x10, R11
-	JAE    copy_4_loop
-	LEAQ   16(R14)(R11*1), R14
-	LEAQ   16(BX)(R11*1), BX
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(BX)
-	JMP    copy_4_end
-
-copy_4_small:
-	CMPQ R13, $0x03
-	JE   copy_4_move_3
-	CMPQ R13, $0x08
-	JB   copy_4_move_4through7
-	JMP  copy_4_move_8through16
-
-copy_4_move_3:
-	MOVW (R14), R11
-	MOVB 2(R14), R12
-	MOVW R11, (BX)
-	MOVB R12, 2(BX)
-	ADDQ R13, R14
-	ADDQ R13, BX
-	JMP  copy_4_end
-
-copy_4_move_4through7:
-	MOVL (R14), R11
-	MOVL -4(R14)(R13*1), R12
-	MOVL R11, (BX)
-	MOVL R12, -4(BX)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, BX
-	JMP  copy_4_end
-
-copy_4_move_8through16:
-	MOVQ (R14), R11
-	MOVQ -8(R14)(R13*1), R12
-	MOVQ R11, (BX)
-	MOVQ R12, -8(BX)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, BX
-
-copy_4_end:
-	ADDQ R13, DI
-	ADDQ $0x18, AX
-	INCQ DX
-	CMPQ DX, CX
-	JB   main_loop
-	JMP  loop_finished
-
-copy_all_from_history:
-	MOVQ R11, R15
-	SUBQ $0x10, R15
-	JB   copy_5_small
-
-copy_5_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (BX)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, BX
-	SUBQ   $0x10, R15
-	JAE    copy_5_loop
-	LEAQ   16(R14)(R15*1), R14
-	LEAQ   16(BX)(R15*1), BX
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(BX)
-	JMP    copy_5_end
-
-copy_5_small:
-	CMPQ R11, $0x03
-	JE   copy_5_move_3
-	JB   copy_5_move_1or2
-	CMPQ R11, $0x08
-	JB   copy_5_move_4through7
-	JMP  copy_5_move_8through16
-
-copy_5_move_1or2:
-	MOVB (R14), R15
-	MOVB -1(R14)(R11*1), BP
-	MOVB R15, (BX)
-	MOVB BP, -1(BX)(R11*1)
-	ADDQ R11, R14
-	ADDQ R11, BX
-	JMP  copy_5_end
-
-copy_5_move_3:
-	MOVW (R14), R15
-	MOVB 2(R14), BP
-	MOVW R15, (BX)
-	MOVB BP, 2(BX)
-	ADDQ R11, R14
-	ADDQ R11, BX
-	JMP  copy_5_end
-
-copy_5_move_4through7:
-	MOVL (R14), R15
-	MOVL -4(R14)(R11*1), BP
-	MOVL R15, (BX)
-	MOVL BP, -4(BX)(R11*1)
-	ADDQ R11, R14
-	ADDQ R11, BX
-	JMP  copy_5_end
-
-copy_5_move_8through16:
-	MOVQ (R14), R15
-	MOVQ -8(R14)(R11*1), BP
-	MOVQ R15, (BX)
-	MOVQ BP, -8(BX)(R11*1)
-	ADDQ R11, R14
-	ADDQ R11, BX
-
-copy_5_end:
-	ADDQ R11, DI
-	SUBQ R11, R13
-
-	// Copy match from the current buffer
-copy_match:
-	MOVQ BX, R11
-	SUBQ R12, R11
-
-	// ml <= mo
-	CMPQ R13, R12
-	JA   copy_overlapping_match
-
-	// Copy non-overlapping match
-	ADDQ R13, DI
-	MOVQ R13, R12
-	SUBQ $0x10, R12
-	JB   copy_2_small
-
-copy_2_loop:
-	MOVUPS (R11), X0
-	MOVUPS X0, (BX)
-	ADDQ   $0x10, R11
-	ADDQ   $0x10, BX
-	SUBQ   $0x10, R12
-	JAE    copy_2_loop
-	LEAQ   16(R11)(R12*1), R11
-	LEAQ   16(BX)(R12*1), BX
-	MOVUPS -16(R11), X0
-	MOVUPS X0, -16(BX)
-	JMP    copy_2_end
-
-copy_2_small:
-	CMPQ R13, $0x03
-	JE   copy_2_move_3
-	JB   copy_2_move_1or2
-	CMPQ R13, $0x08
-	JB   copy_2_move_4through7
-	JMP  copy_2_move_8through16
-
-copy_2_move_1or2:
-	MOVB (R11), R12
-	MOVB -1(R11)(R13*1), R14
-	MOVB R12, (BX)
-	MOVB R14, -1(BX)(R13*1)
-	ADDQ R13, R11
-	ADDQ R13, BX
-	JMP  copy_2_end
-
-copy_2_move_3:
-	MOVW (R11), R12
-	MOVB 2(R11), R14
-	MOVW R12, (BX)
-	MOVB R14, 2(BX)
-	ADDQ R13, R11
-	ADDQ R13, BX
-	JMP  copy_2_end
-
-copy_2_move_4through7:
-	MOVL (R11), R12
-	MOVL -4(R11)(R13*1), R14
-	MOVL R12, (BX)
-	MOVL R14, -4(BX)(R13*1)
-	ADDQ R13, R11
-	ADDQ R13, BX
-	JMP  copy_2_end
-
-copy_2_move_8through16:
-	MOVQ (R11), R12
-	MOVQ -8(R11)(R13*1), R14
-	MOVQ R12, (BX)
-	MOVQ R14, -8(BX)(R13*1)
-	ADDQ R13, R11
-	ADDQ R13, BX
-
-copy_2_end:
-	JMP handle_loop
-
-	// Copy overlapping match
-copy_overlapping_match:
-	ADDQ R13, DI
-
-copy_slow_3:
-	MOVB (R11), R12
-	MOVB R12, (BX)
-	INCQ R11
-	INCQ BX
-	DECQ R13
-	JNZ  copy_slow_3
-
-handle_loop:
-	ADDQ $0x18, AX
-	INCQ DX
-	CMPQ DX, CX
-	JB   main_loop
-
-loop_finished:
-	// Return value
-	MOVB $0x01, ret+8(FP)
-
-	// Update the context
-	MOVQ ctx+0(FP), AX
-	MOVQ DX, 24(AX)
-	MOVQ DI, 104(AX)
-	SUBQ 80(AX), SI
-	MOVQ SI, 112(AX)
-	RET
-
-error_match_off_too_big:
-	// Return value
-	MOVB $0x00, ret+8(FP)
-
-	// Update the context
-	MOVQ ctx+0(FP), AX
-	MOVQ DX, 24(AX)
-	MOVQ DI, 104(AX)
-	SUBQ 80(AX), SI
-	MOVQ SI, 112(AX)
-	RET
-
-empty_seqs:
-	// Return value
-	MOVB $0x01, ret+8(FP)
-	RET
-
-// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
-	MOVQ    br+8(FP), CX
-	MOVQ    24(CX), DX
-	MOVBQZX 40(CX), BX
-	MOVQ    (CX), AX
-	MOVQ    32(CX), SI
-	ADDQ    SI, AX
-	MOVQ    AX, (SP)
-	MOVQ    ctx+16(FP), AX
-	MOVQ    72(AX), DI
-	MOVQ    80(AX), R8
-	MOVQ    88(AX), R9
-	XORQ    CX, CX
-	MOVQ    CX, 8(SP)
-	MOVQ    CX, 16(SP)
-	MOVQ    CX, 24(SP)
-	MOVQ    112(AX), R10
-	MOVQ    128(AX), CX
-	MOVQ    CX, 32(SP)
-	MOVQ    144(AX), R11
-	MOVQ    136(AX), R12
-	MOVQ    200(AX), CX
-	MOVQ    CX, 56(SP)
-	MOVQ    176(AX), CX
-	MOVQ    CX, 48(SP)
-	MOVQ    184(AX), AX
-	MOVQ    AX, 40(SP)
-	MOVQ    40(SP), AX
-	ADDQ    AX, 48(SP)
-
-	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
-	ADDQ R10, 32(SP)
-
-	// outBase += outPosition
-	ADDQ R12, R10
-
-sequenceDecs_decodeSync_amd64_main_loop:
-	MOVQ (SP), R13
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R13
-	MOVQ (R13), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decodeSync_amd64_fill_end
-
-sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decodeSync_amd64_fill_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R13), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
-
-sequenceDecs_decodeSync_amd64_fill_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_amd64_fill_end:
-	// Update offset
-	MOVQ  R9, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R14
-	SHLQ  CL, R14
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decodeSync_amd64_of_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
-	NEGQ  CX
-	SHRQ  CL, R14
-	ADDQ  R14, AX
-
-sequenceDecs_decodeSync_amd64_of_update_zero:
-	MOVQ AX, 8(SP)
-
-	// Update match length
-	MOVQ  R8, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R14
-	SHLQ  CL, R14
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
-	NEGQ  CX
-	SHRQ  CL, R14
-	ADDQ  R14, AX
-
-sequenceDecs_decodeSync_amd64_ml_update_zero:
-	MOVQ AX, 16(SP)
-
-	// Fill bitreader to have enough for the remaining
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R13
-	MOVQ (R13), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
-
-sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R13), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_amd64_fill_2_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_amd64_fill_2_end:
-	// Update literal length
-	MOVQ  DI, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R14
-	SHLQ  CL, R14
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
-	NEGQ  CX
-	SHRQ  CL, R14
-	ADDQ  R14, AX
-
-sequenceDecs_decodeSync_amd64_ll_update_zero:
-	MOVQ AX, 24(SP)
-
-	// Fill bitreader for state updates
-	MOVQ    R13, (SP)
-	MOVQ    R9, AX
-	SHRQ    $0x08, AX
-	MOVBQZX AL, AX
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decodeSync_amd64_skip_update
-
-	// Update Literal Length State
-	MOVBQZX DI, R13
-	SHRL    $0x10, DI
-	LEAQ    (BX)(R13*1), CX
-	MOVQ    DX, R14
-	MOVQ    CX, BX
-	ROLQ    CL, R14
-	MOVL    $0x00000001, R15
-	MOVB    R13, CL
-	SHLL    CL, R15
-	DECL    R15
-	ANDQ    R15, R14
-	ADDQ    R14, DI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Match Length State
-	MOVBQZX R8, R13
-	SHRL    $0x10, R8
-	LEAQ    (BX)(R13*1), CX
-	MOVQ    DX, R14
-	MOVQ    CX, BX
-	ROLQ    CL, R14
-	MOVL    $0x00000001, R15
-	MOVB    R13, CL
-	SHLL    CL, R15
-	DECL    R15
-	ANDQ    R15, R14
-	ADDQ    R14, R8
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Offset State
-	MOVBQZX R9, R13
-	SHRL    $0x10, R9
-	LEAQ    (BX)(R13*1), CX
-	MOVQ    DX, R14
-	MOVQ    CX, BX
-	ROLQ    CL, R14
-	MOVL    $0x00000001, R15
-	MOVB    R13, CL
-	SHLL    CL, R15
-	DECL    R15
-	ANDQ    R15, R14
-	ADDQ    R14, R9
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R9*8), R9
-
-sequenceDecs_decodeSync_amd64_skip_update:
-	// Adjust offset
-	MOVQ   s+0(FP), CX
-	MOVQ   8(SP), R13
-	CMPQ   AX, $0x01
-	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
-	MOVUPS 144(CX), X0
-	MOVQ   R13, 144(CX)
-	MOVUPS X0, 152(CX)
-	JMP    sequenceDecs_decodeSync_amd64_after_adjust
-
-sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
-	CMPQ 24(SP), $0x00000000
-	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
-	INCQ R13
-	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
-	MOVQ  144(CX), R13
-	JMP   sequenceDecs_decodeSync_amd64_after_adjust
-
-sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
-	MOVQ    R13, AX
-	XORQ    R14, R14
-	MOVQ    $-1, R15
-	CMPQ    R13, $0x03
-	CMOVQEQ R14, AX
-	CMOVQEQ R15, R14
-	ADDQ    144(CX)(AX*8), R14
-	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
-	MOVQ    $0x00000001, R14
-
-sequenceDecs_decodeSync_amd64_adjust_temp_valid:
-	CMPQ R13, $0x01
-	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
-	MOVQ 152(CX), AX
-	MOVQ AX, 160(CX)
-
-sequenceDecs_decodeSync_amd64_adjust_skip:
-	MOVQ 144(CX), AX
-	MOVQ AX, 152(CX)
-	MOVQ R14, 144(CX)
-	MOVQ R14, R13
-
-sequenceDecs_decodeSync_amd64_after_adjust:
-	MOVQ R13, 8(SP)
-
-	// Check values
-	MOVQ  16(SP), AX
-	MOVQ  24(SP), CX
-	LEAQ  (AX)(CX*1), R14
-	MOVQ  s+0(FP), R15
-	ADDQ  R14, 256(R15)
-	MOVQ  ctx+16(FP), R14
-	SUBQ  CX, 104(R14)
-	JS    error_not_enough_literals
-	CMPQ  AX, $0x00020002
-	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
-	TESTQ AX, AX
-	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
-	MOVQ 24(SP), AX
-	MOVQ 8(SP), CX
-	MOVQ 16(SP), R13
-
-	// Check if we have enough space in s.out
-	LEAQ (AX)(R13*1), R14
-	ADDQ R10, R14
-	CMPQ R14, 32(SP)
-	JA   error_not_enough_space
-
-	// Copy literals
-	TESTQ AX, AX
-	JZ    check_offset
-	XORQ  R14, R14
-
-copy_1:
-	MOVUPS (R11)(R14*1), X0
-	MOVUPS X0, (R10)(R14*1)
-	ADDQ   $0x10, R14
-	CMPQ   R14, AX
-	JB     copy_1
-	ADDQ   AX, R11
-	ADDQ   AX, R10
-	ADDQ   AX, R12
-
-	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
-	MOVQ R12, AX
-	ADDQ 40(SP), AX
-	CMPQ CX, AX
-	JG   error_match_off_too_big
-	CMPQ CX, 56(SP)
-	JG   error_match_off_too_big
-
-	// Copy match from history
-	MOVQ CX, AX
-	SUBQ R12, AX
-	JLS  copy_match
-	MOVQ 48(SP), R14
-	SUBQ AX, R14
-	CMPQ R13, AX
-	JG   copy_all_from_history
-	MOVQ R13, AX
-	SUBQ $0x10, AX
-	JB   copy_4_small
-
-copy_4_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R10)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R10
-	SUBQ   $0x10, AX
-	JAE    copy_4_loop
-	LEAQ   16(R14)(AX*1), R14
-	LEAQ   16(R10)(AX*1), R10
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R10)
-	JMP    copy_4_end
-
-copy_4_small:
-	CMPQ R13, $0x03
-	JE   copy_4_move_3
-	CMPQ R13, $0x08
-	JB   copy_4_move_4through7
-	JMP  copy_4_move_8through16
-
-copy_4_move_3:
-	MOVW (R14), AX
-	MOVB 2(R14), CL
-	MOVW AX, (R10)
-	MOVB CL, 2(R10)
-	ADDQ R13, R14
-	ADDQ R13, R10
-	JMP  copy_4_end
-
-copy_4_move_4through7:
-	MOVL (R14), AX
-	MOVL -4(R14)(R13*1), CX
-	MOVL AX, (R10)
-	MOVL CX, -4(R10)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R10
-	JMP  copy_4_end
-
-copy_4_move_8through16:
-	MOVQ (R14), AX
-	MOVQ -8(R14)(R13*1), CX
-	MOVQ AX, (R10)
-	MOVQ CX, -8(R10)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R10
-
-copy_4_end:
-	ADDQ R13, R12
-	JMP  handle_loop
-	JMP loop_finished
-
-copy_all_from_history:
-	MOVQ AX, R15
-	SUBQ $0x10, R15
-	JB   copy_5_small
-
-copy_5_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R10)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R10
-	SUBQ   $0x10, R15
-	JAE    copy_5_loop
-	LEAQ   16(R14)(R15*1), R14
-	LEAQ   16(R10)(R15*1), R10
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R10)
-	JMP    copy_5_end
-
-copy_5_small:
-	CMPQ AX, $0x03
-	JE   copy_5_move_3
-	JB   copy_5_move_1or2
-	CMPQ AX, $0x08
-	JB   copy_5_move_4through7
-	JMP  copy_5_move_8through16
-
-copy_5_move_1or2:
-	MOVB (R14), R15
-	MOVB -1(R14)(AX*1), BP
-	MOVB R15, (R10)
-	MOVB BP, -1(R10)(AX*1)
-	ADDQ AX, R14
-	ADDQ AX, R10
-	JMP  copy_5_end
-
-copy_5_move_3:
-	MOVW (R14), R15
-	MOVB 2(R14), BP
-	MOVW R15, (R10)
-	MOVB BP, 2(R10)
-	ADDQ AX, R14
-	ADDQ AX, R10
-	JMP  copy_5_end
-
-copy_5_move_4through7:
-	MOVL (R14), R15
-	MOVL -4(R14)(AX*1), BP
-	MOVL R15, (R10)
-	MOVL BP, -4(R10)(AX*1)
-	ADDQ AX, R14
-	ADDQ AX, R10
-	JMP  copy_5_end
-
-copy_5_move_8through16:
-	MOVQ (R14), R15
-	MOVQ -8(R14)(AX*1), BP
-	MOVQ R15, (R10)
-	MOVQ BP, -8(R10)(AX*1)
-	ADDQ AX, R14
-	ADDQ AX, R10
-
-copy_5_end:
-	ADDQ AX, R12
-	SUBQ AX, R13
-
-	// Copy match from the current buffer
-copy_match:
-	MOVQ R10, AX
-	SUBQ CX, AX
-
-	// ml <= mo
-	CMPQ R13, CX
-	JA   copy_overlapping_match
-
-	// Copy non-overlapping match
-	ADDQ R13, R12
-	MOVQ R10, CX
-	ADDQ R13, R10
-
-copy_2:
-	MOVUPS (AX), X0
-	MOVUPS X0, (CX)
-	ADDQ   $0x10, AX
-	ADDQ   $0x10, CX
-	SUBQ   $0x10, R13
-	JHI    copy_2
-	JMP    handle_loop
-
-	// Copy overlapping match
-copy_overlapping_match:
-	ADDQ R13, R12
-
-copy_slow_3:
-	MOVB (AX), CL
-	MOVB CL, (R10)
-	INCQ AX
-	INCQ R10
-	DECQ R13
-	JNZ  copy_slow_3
-
-handle_loop:
-	MOVQ ctx+16(FP), AX
-	DECQ 96(AX)
-	JNS  sequenceDecs_decodeSync_amd64_main_loop
-
-loop_finished:
-	MOVQ br+8(FP), AX
-	MOVQ DX, 24(AX)
-	MOVB BL, 40(AX)
-	MOVQ SI, 32(AX)
-
-	// Update the context
-	MOVQ ctx+16(FP), AX
-	MOVQ R12, 136(AX)
-	MOVQ 144(AX), CX
-	SUBQ CX, R11
-	MOVQ R11, 168(AX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
-	MOVQ 16(SP), AX
-	MOVQ ctx+16(FP), CX
-	MOVQ AX, 216(CX)
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decodeSync_amd64_error_match_len_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-error_match_off_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 8(SP), CX
-	MOVQ CX, 224(AX)
-	MOVQ R12, 136(AX)
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-	// Return with not enough output space error
-error_not_enough_space:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ R12, 136(AX)
-	MOVQ $0x00000005, ret+24(FP)
-	RET
-
-// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: BMI, BMI2, CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
-	MOVQ    br+8(FP), BX
-	MOVQ    24(BX), AX
-	MOVBQZX 40(BX), DX
-	MOVQ    (BX), CX
-	MOVQ    32(BX), BX
-	ADDQ    BX, CX
-	MOVQ    CX, (SP)
-	MOVQ    ctx+16(FP), CX
-	MOVQ    72(CX), SI
-	MOVQ    80(CX), DI
-	MOVQ    88(CX), R8
-	XORQ    R9, R9
-	MOVQ    R9, 8(SP)
-	MOVQ    R9, 16(SP)
-	MOVQ    R9, 24(SP)
-	MOVQ    112(CX), R9
-	MOVQ    128(CX), R10
-	MOVQ    R10, 32(SP)
-	MOVQ    144(CX), R10
-	MOVQ    136(CX), R11
-	MOVQ    200(CX), R12
-	MOVQ    R12, 56(SP)
-	MOVQ    176(CX), R12
-	MOVQ    R12, 48(SP)
-	MOVQ    184(CX), CX
-	MOVQ    CX, 40(SP)
-	MOVQ    40(SP), CX
-	ADDQ    CX, 48(SP)
-
-	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
-	ADDQ R9, 32(SP)
-
-	// outBase += outPosition
-	ADDQ R11, R9
-
-sequenceDecs_decodeSync_bmi2_main_loop:
-	MOVQ (SP), R12
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R12
-	MOVQ (R12), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decodeSync_bmi2_fill_end
-
-sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decodeSync_bmi2_fill_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R12
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R12), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
-
-sequenceDecs_decodeSync_bmi2_fill_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_bmi2_fill_end:
-	// Update offset
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, R8, R13
-	MOVQ   AX, R14
-	LEAQ   (DX)(R13*1), CX
-	ROLQ   CL, R14
-	BZHIQ  R13, R14, R14
-	MOVQ   CX, DX
-	MOVQ   R8, CX
-	SHRQ   $0x20, CX
-	ADDQ   R14, CX
-	MOVQ   CX, 8(SP)
-
-	// Update match length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, DI, R13
-	MOVQ   AX, R14
-	LEAQ   (DX)(R13*1), CX
-	ROLQ   CL, R14
-	BZHIQ  R13, R14, R14
-	MOVQ   CX, DX
-	MOVQ   DI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R14, CX
-	MOVQ   CX, 16(SP)
-
-	// Fill bitreader to have enough for the remaining
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R12
-	MOVQ (R12), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
-
-sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R12
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R12), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_bmi2_fill_2_end:
-	// Update literal length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, SI, R13
-	MOVQ   AX, R14
-	LEAQ   (DX)(R13*1), CX
-	ROLQ   CL, R14
-	BZHIQ  R13, R14, R14
-	MOVQ   CX, DX
-	MOVQ   SI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R14, CX
-	MOVQ   CX, 24(SP)
-
-	// Fill bitreader for state updates
-	MOVQ    R12, (SP)
-	MOVQ    $0x00000808, CX
-	BEXTRQ  CX, R8, R12
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decodeSync_bmi2_skip_update
-	LEAQ    (SI)(DI*1), R13
-	ADDQ    R8, R13
-	MOVBQZX R13, R13
-	LEAQ    (DX)(R13*1), CX
-	MOVQ    AX, R14
-	MOVQ    CX, DX
-	ROLQ    CL, R14
-	BZHIQ   R13, R14, R14
-
-	// Update Offset State
-	BZHIQ R8, R14, CX
-	SHRXQ R8, R14, R14
-	SHRL  $0x10, R8
-	ADDQ  CX, R8
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Match Length State
-	BZHIQ DI, R14, CX
-	SHRXQ DI, R14, R14
-	SHRL  $0x10, DI
-	ADDQ  CX, DI
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Literal Length State
-	BZHIQ SI, R14, CX
-	SHRL  $0x10, SI
-	ADDQ  CX, SI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(SI*8), SI
-
-sequenceDecs_decodeSync_bmi2_skip_update:
-	// Adjust offset
-	MOVQ   s+0(FP), CX
-	MOVQ   8(SP), R13
-	CMPQ   R12, $0x01
-	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
-	MOVUPS 144(CX), X0
-	MOVQ   R13, 144(CX)
-	MOVUPS X0, 152(CX)
-	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
-
-sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
-	CMPQ 24(SP), $0x00000000
-	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
-	INCQ R13
-	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
-	MOVQ  144(CX), R13
-	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
-
-sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
-	MOVQ    R13, R12
-	XORQ    R14, R14
-	MOVQ    $-1, R15
-	CMPQ    R13, $0x03
-	CMOVQEQ R14, R12
-	CMOVQEQ R15, R14
-	ADDQ    144(CX)(R12*8), R14
-	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
-	MOVQ    $0x00000001, R14
-
-sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
-	CMPQ R13, $0x01
-	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
-	MOVQ 152(CX), R12
-	MOVQ R12, 160(CX)
-
-sequenceDecs_decodeSync_bmi2_adjust_skip:
-	MOVQ 144(CX), R12
-	MOVQ R12, 152(CX)
-	MOVQ R14, 144(CX)
-	MOVQ R14, R13
-
-sequenceDecs_decodeSync_bmi2_after_adjust:
-	MOVQ R13, 8(SP)
-
-	// Check values
-	MOVQ  16(SP), CX
-	MOVQ  24(SP), R12
-	LEAQ  (CX)(R12*1), R14
-	MOVQ  s+0(FP), R15
-	ADDQ  R14, 256(R15)
-	MOVQ  ctx+16(FP), R14
-	SUBQ  R12, 104(R14)
-	JS    error_not_enough_literals
-	CMPQ  CX, $0x00020002
-	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
-	MOVQ 24(SP), CX
-	MOVQ 8(SP), R12
-	MOVQ 16(SP), R13
-
-	// Check if we have enough space in s.out
-	LEAQ (CX)(R13*1), R14
-	ADDQ R9, R14
-	CMPQ R14, 32(SP)
-	JA   error_not_enough_space
-
-	// Copy literals
-	TESTQ CX, CX
-	JZ    check_offset
-	XORQ  R14, R14
-
-copy_1:
-	MOVUPS (R10)(R14*1), X0
-	MOVUPS X0, (R9)(R14*1)
-	ADDQ   $0x10, R14
-	CMPQ   R14, CX
-	JB     copy_1
-	ADDQ   CX, R10
-	ADDQ   CX, R9
-	ADDQ   CX, R11
-
-	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
-	MOVQ R11, CX
-	ADDQ 40(SP), CX
-	CMPQ R12, CX
-	JG   error_match_off_too_big
-	CMPQ R12, 56(SP)
-	JG   error_match_off_too_big
-
-	// Copy match from history
-	MOVQ R12, CX
-	SUBQ R11, CX
-	JLS  copy_match
-	MOVQ 48(SP), R14
-	SUBQ CX, R14
-	CMPQ R13, CX
-	JG   copy_all_from_history
-	MOVQ R13, CX
-	SUBQ $0x10, CX
-	JB   copy_4_small
-
-copy_4_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R9)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R9
-	SUBQ   $0x10, CX
-	JAE    copy_4_loop
-	LEAQ   16(R14)(CX*1), R14
-	LEAQ   16(R9)(CX*1), R9
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R9)
-	JMP    copy_4_end
-
-copy_4_small:
-	CMPQ R13, $0x03
-	JE   copy_4_move_3
-	CMPQ R13, $0x08
-	JB   copy_4_move_4through7
-	JMP  copy_4_move_8through16
-
-copy_4_move_3:
-	MOVW (R14), CX
-	MOVB 2(R14), R12
-	MOVW CX, (R9)
-	MOVB R12, 2(R9)
-	ADDQ R13, R14
-	ADDQ R13, R9
-	JMP  copy_4_end
-
-copy_4_move_4through7:
-	MOVL (R14), CX
-	MOVL -4(R14)(R13*1), R12
-	MOVL CX, (R9)
-	MOVL R12, -4(R9)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R9
-	JMP  copy_4_end
-
-copy_4_move_8through16:
-	MOVQ (R14), CX
-	MOVQ -8(R14)(R13*1), R12
-	MOVQ CX, (R9)
-	MOVQ R12, -8(R9)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R9
-
-copy_4_end:
-	ADDQ R13, R11
-	JMP  handle_loop
-	JMP loop_finished
-
-copy_all_from_history:
-	MOVQ CX, R15
-	SUBQ $0x10, R15
-	JB   copy_5_small
-
-copy_5_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R9)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R9
-	SUBQ   $0x10, R15
-	JAE    copy_5_loop
-	LEAQ   16(R14)(R15*1), R14
-	LEAQ   16(R9)(R15*1), R9
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R9)
-	JMP    copy_5_end
-
-copy_5_small:
-	CMPQ CX, $0x03
-	JE   copy_5_move_3
-	JB   copy_5_move_1or2
-	CMPQ CX, $0x08
-	JB   copy_5_move_4through7
-	JMP  copy_5_move_8through16
-
-copy_5_move_1or2:
-	MOVB (R14), R15
-	MOVB -1(R14)(CX*1), BP
-	MOVB R15, (R9)
-	MOVB BP, -1(R9)(CX*1)
-	ADDQ CX, R14
-	ADDQ CX, R9
-	JMP  copy_5_end
-
-copy_5_move_3:
-	MOVW (R14), R15
-	MOVB 2(R14), BP
-	MOVW R15, (R9)
-	MOVB BP, 2(R9)
-	ADDQ CX, R14
-	ADDQ CX, R9
-	JMP  copy_5_end
-
-copy_5_move_4through7:
-	MOVL (R14), R15
-	MOVL -4(R14)(CX*1), BP
-	MOVL R15, (R9)
-	MOVL BP, -4(R9)(CX*1)
-	ADDQ CX, R14
-	ADDQ CX, R9
-	JMP  copy_5_end
-
-copy_5_move_8through16:
-	MOVQ (R14), R15
-	MOVQ -8(R14)(CX*1), BP
-	MOVQ R15, (R9)
-	MOVQ BP, -8(R9)(CX*1)
-	ADDQ CX, R14
-	ADDQ CX, R9
-
-copy_5_end:
-	ADDQ CX, R11
-	SUBQ CX, R13
-
-	// Copy match from the current buffer
-copy_match:
-	MOVQ R9, CX
-	SUBQ R12, CX
-
-	// ml <= mo
-	CMPQ R13, R12
-	JA   copy_overlapping_match
-
-	// Copy non-overlapping match
-	ADDQ R13, R11
-	MOVQ R9, R12
-	ADDQ R13, R9
-
-copy_2:
-	MOVUPS (CX), X0
-	MOVUPS X0, (R12)
-	ADDQ   $0x10, CX
-	ADDQ   $0x10, R12
-	SUBQ   $0x10, R13
-	JHI    copy_2
-	JMP    handle_loop
-
-	// Copy overlapping match
-copy_overlapping_match:
-	ADDQ R13, R11
-
-copy_slow_3:
-	MOVB (CX), R12
-	MOVB R12, (R9)
-	INCQ CX
-	INCQ R9
-	DECQ R13
-	JNZ  copy_slow_3
-
-handle_loop:
-	MOVQ ctx+16(FP), CX
-	DECQ 96(CX)
-	JNS  sequenceDecs_decodeSync_bmi2_main_loop
-
-loop_finished:
-	MOVQ br+8(FP), CX
-	MOVQ AX, 24(CX)
-	MOVB DL, 40(CX)
-	MOVQ BX, 32(CX)
-
-	// Update the context
-	MOVQ ctx+16(FP), AX
-	MOVQ R11, 136(AX)
-	MOVQ 144(AX), CX
-	SUBQ CX, R10
-	MOVQ R10, 168(AX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
-	MOVQ 16(SP), AX
-	MOVQ ctx+16(FP), CX
-	MOVQ AX, 216(CX)
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-error_match_off_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 8(SP), CX
-	MOVQ CX, 224(AX)
-	MOVQ R11, 136(AX)
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-	// Return with not enough output space error
-error_not_enough_space:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ R11, 136(AX)
-	MOVQ $0x00000005, ret+24(FP)
-	RET
-
-// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
-	MOVQ    br+8(FP), CX
-	MOVQ    24(CX), DX
-	MOVBQZX 40(CX), BX
-	MOVQ    (CX), AX
-	MOVQ    32(CX), SI
-	ADDQ    SI, AX
-	MOVQ    AX, (SP)
-	MOVQ    ctx+16(FP), AX
-	MOVQ    72(AX), DI
-	MOVQ    80(AX), R8
-	MOVQ    88(AX), R9
-	XORQ    CX, CX
-	MOVQ    CX, 8(SP)
-	MOVQ    CX, 16(SP)
-	MOVQ    CX, 24(SP)
-	MOVQ    112(AX), R10
-	MOVQ    128(AX), CX
-	MOVQ    CX, 32(SP)
-	MOVQ    144(AX), R11
-	MOVQ    136(AX), R12
-	MOVQ    200(AX), CX
-	MOVQ    CX, 56(SP)
-	MOVQ    176(AX), CX
-	MOVQ    CX, 48(SP)
-	MOVQ    184(AX), AX
-	MOVQ    AX, 40(SP)
-	MOVQ    40(SP), AX
-	ADDQ    AX, 48(SP)
-
-	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
-	ADDQ R10, 32(SP)
-
-	// outBase += outPosition
-	ADDQ R12, R10
-
-sequenceDecs_decodeSync_safe_amd64_main_loop:
-	MOVQ (SP), R13
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R13
-	MOVQ (R13), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
-
-sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R13), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
-
-sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_safe_amd64_fill_end:
-	// Update offset
-	MOVQ  R9, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R14
-	SHLQ  CL, R14
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
-	NEGQ  CX
-	SHRQ  CL, R14
-	ADDQ  R14, AX
-
-sequenceDecs_decodeSync_safe_amd64_of_update_zero:
-	MOVQ AX, 8(SP)
-
-	// Update match length
-	MOVQ  R8, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R14
-	SHLQ  CL, R14
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
-	NEGQ  CX
-	SHRQ  CL, R14
-	ADDQ  R14, AX
-
-sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
-	MOVQ AX, 16(SP)
-
-	// Fill bitreader to have enough for the remaining
-	CMPQ SI, $0x08
-	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
-	MOVQ BX, AX
-	SHRQ $0x03, AX
-	SUBQ AX, R13
-	MOVQ (R13), DX
-	SUBQ AX, SI
-	ANDQ $0x07, BX
-	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
-
-sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
-	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
-	CMPQ    BX, $0x07
-	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
-	SHLQ    $0x08, DX
-	SUBQ    $0x01, R13
-	SUBQ    $0x01, SI
-	SUBQ    $0x08, BX
-	MOVBQZX (R13), AX
-	ORQ     AX, DX
-	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
-	CMPQ BX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_safe_amd64_fill_2_end:
-	// Update literal length
-	MOVQ  DI, AX
-	MOVQ  BX, CX
-	MOVQ  DX, R14
-	SHLQ  CL, R14
-	MOVB  AH, CL
-	SHRQ  $0x20, AX
-	TESTQ CX, CX
-	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
-	ADDQ  CX, BX
-	CMPQ  BX, $0x40
-	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
-	CMPQ  CX, $0x40
-	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
-	NEGQ  CX
-	SHRQ  CL, R14
-	ADDQ  R14, AX
-
-sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
-	MOVQ AX, 24(SP)
-
-	// Fill bitreader for state updates
-	MOVQ    R13, (SP)
-	MOVQ    R9, AX
-	SHRQ    $0x08, AX
-	MOVBQZX AL, AX
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
-
-	// Update Literal Length State
-	MOVBQZX DI, R13
-	SHRL    $0x10, DI
-	LEAQ    (BX)(R13*1), CX
-	MOVQ    DX, R14
-	MOVQ    CX, BX
-	ROLQ    CL, R14
-	MOVL    $0x00000001, R15
-	MOVB    R13, CL
-	SHLL    CL, R15
-	DECL    R15
-	ANDQ    R15, R14
-	ADDQ    R14, DI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Match Length State
-	MOVBQZX R8, R13
-	SHRL    $0x10, R8
-	LEAQ    (BX)(R13*1), CX
-	MOVQ    DX, R14
-	MOVQ    CX, BX
-	ROLQ    CL, R14
-	MOVL    $0x00000001, R15
-	MOVB    R13, CL
-	SHLL    CL, R15
-	DECL    R15
-	ANDQ    R15, R14
-	ADDQ    R14, R8
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Offset State
-	MOVBQZX R9, R13
-	SHRL    $0x10, R9
-	LEAQ    (BX)(R13*1), CX
-	MOVQ    DX, R14
-	MOVQ    CX, BX
-	ROLQ    CL, R14
-	MOVL    $0x00000001, R15
-	MOVB    R13, CL
-	SHLL    CL, R15
-	DECL    R15
-	ANDQ    R15, R14
-	ADDQ    R14, R9
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R9*8), R9
-
-sequenceDecs_decodeSync_safe_amd64_skip_update:
-	// Adjust offset
-	MOVQ   s+0(FP), CX
-	MOVQ   8(SP), R13
-	CMPQ   AX, $0x01
-	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
-	MOVUPS 144(CX), X0
-	MOVQ   R13, 144(CX)
-	MOVUPS X0, 152(CX)
-	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
-
-sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
-	CMPQ 24(SP), $0x00000000
-	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
-	INCQ R13
-	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
-	MOVQ  144(CX), R13
-	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
-
-sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
-	MOVQ    R13, AX
-	XORQ    R14, R14
-	MOVQ    $-1, R15
-	CMPQ    R13, $0x03
-	CMOVQEQ R14, AX
-	CMOVQEQ R15, R14
-	ADDQ    144(CX)(AX*8), R14
-	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
-	MOVQ    $0x00000001, R14
-
-sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
-	CMPQ R13, $0x01
-	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
-	MOVQ 152(CX), AX
-	MOVQ AX, 160(CX)
-
-sequenceDecs_decodeSync_safe_amd64_adjust_skip:
-	MOVQ 144(CX), AX
-	MOVQ AX, 152(CX)
-	MOVQ R14, 144(CX)
-	MOVQ R14, R13
-
-sequenceDecs_decodeSync_safe_amd64_after_adjust:
-	MOVQ R13, 8(SP)
-
-	// Check values
-	MOVQ  16(SP), AX
-	MOVQ  24(SP), CX
-	LEAQ  (AX)(CX*1), R14
-	MOVQ  s+0(FP), R15
-	ADDQ  R14, 256(R15)
-	MOVQ  ctx+16(FP), R14
-	SUBQ  CX, 104(R14)
-	JS    error_not_enough_literals
-	CMPQ  AX, $0x00020002
-	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
-	TESTQ AX, AX
-	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
-	MOVQ 24(SP), AX
-	MOVQ 8(SP), CX
-	MOVQ 16(SP), R13
-
-	// Check if we have enough space in s.out
-	LEAQ (AX)(R13*1), R14
-	ADDQ R10, R14
-	CMPQ R14, 32(SP)
-	JA   error_not_enough_space
-
-	// Copy literals
-	TESTQ AX, AX
-	JZ    check_offset
-	MOVQ  AX, R14
-	SUBQ  $0x10, R14
-	JB    copy_1_small
-
-copy_1_loop:
-	MOVUPS (R11), X0
-	MOVUPS X0, (R10)
-	ADDQ   $0x10, R11
-	ADDQ   $0x10, R10
-	SUBQ   $0x10, R14
-	JAE    copy_1_loop
-	LEAQ   16(R11)(R14*1), R11
-	LEAQ   16(R10)(R14*1), R10
-	MOVUPS -16(R11), X0
-	MOVUPS X0, -16(R10)
-	JMP    copy_1_end
-
-copy_1_small:
-	CMPQ AX, $0x03
-	JE   copy_1_move_3
-	JB   copy_1_move_1or2
-	CMPQ AX, $0x08
-	JB   copy_1_move_4through7
-	JMP  copy_1_move_8through16
-
-copy_1_move_1or2:
-	MOVB (R11), R14
-	MOVB -1(R11)(AX*1), R15
-	MOVB R14, (R10)
-	MOVB R15, -1(R10)(AX*1)
-	ADDQ AX, R11
-	ADDQ AX, R10
-	JMP  copy_1_end
-
-copy_1_move_3:
-	MOVW (R11), R14
-	MOVB 2(R11), R15
-	MOVW R14, (R10)
-	MOVB R15, 2(R10)
-	ADDQ AX, R11
-	ADDQ AX, R10
-	JMP  copy_1_end
-
-copy_1_move_4through7:
-	MOVL (R11), R14
-	MOVL -4(R11)(AX*1), R15
-	MOVL R14, (R10)
-	MOVL R15, -4(R10)(AX*1)
-	ADDQ AX, R11
-	ADDQ AX, R10
-	JMP  copy_1_end
-
-copy_1_move_8through16:
-	MOVQ (R11), R14
-	MOVQ -8(R11)(AX*1), R15
-	MOVQ R14, (R10)
-	MOVQ R15, -8(R10)(AX*1)
-	ADDQ AX, R11
-	ADDQ AX, R10
-
-copy_1_end:
-	ADDQ AX, R12
-
-	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
-	MOVQ R12, AX
-	ADDQ 40(SP), AX
-	CMPQ CX, AX
-	JG   error_match_off_too_big
-	CMPQ CX, 56(SP)
-	JG   error_match_off_too_big
-
-	// Copy match from history
-	MOVQ CX, AX
-	SUBQ R12, AX
-	JLS  copy_match
-	MOVQ 48(SP), R14
-	SUBQ AX, R14
-	CMPQ R13, AX
-	JG   copy_all_from_history
-	MOVQ R13, AX
-	SUBQ $0x10, AX
-	JB   copy_4_small
-
-copy_4_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R10)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R10
-	SUBQ   $0x10, AX
-	JAE    copy_4_loop
-	LEAQ   16(R14)(AX*1), R14
-	LEAQ   16(R10)(AX*1), R10
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R10)
-	JMP    copy_4_end
-
-copy_4_small:
-	CMPQ R13, $0x03
-	JE   copy_4_move_3
-	CMPQ R13, $0x08
-	JB   copy_4_move_4through7
-	JMP  copy_4_move_8through16
-
-copy_4_move_3:
-	MOVW (R14), AX
-	MOVB 2(R14), CL
-	MOVW AX, (R10)
-	MOVB CL, 2(R10)
-	ADDQ R13, R14
-	ADDQ R13, R10
-	JMP  copy_4_end
-
-copy_4_move_4through7:
-	MOVL (R14), AX
-	MOVL -4(R14)(R13*1), CX
-	MOVL AX, (R10)
-	MOVL CX, -4(R10)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R10
-	JMP  copy_4_end
-
-copy_4_move_8through16:
-	MOVQ (R14), AX
-	MOVQ -8(R14)(R13*1), CX
-	MOVQ AX, (R10)
-	MOVQ CX, -8(R10)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R10
-
-copy_4_end:
-	ADDQ R13, R12
-	JMP  handle_loop
-	JMP loop_finished
-
-copy_all_from_history:
-	MOVQ AX, R15
-	SUBQ $0x10, R15
-	JB   copy_5_small
-
-copy_5_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R10)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R10
-	SUBQ   $0x10, R15
-	JAE    copy_5_loop
-	LEAQ   16(R14)(R15*1), R14
-	LEAQ   16(R10)(R15*1), R10
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R10)
-	JMP    copy_5_end
-
-copy_5_small:
-	CMPQ AX, $0x03
-	JE   copy_5_move_3
-	JB   copy_5_move_1or2
-	CMPQ AX, $0x08
-	JB   copy_5_move_4through7
-	JMP  copy_5_move_8through16
-
-copy_5_move_1or2:
-	MOVB (R14), R15
-	MOVB -1(R14)(AX*1), BP
-	MOVB R15, (R10)
-	MOVB BP, -1(R10)(AX*1)
-	ADDQ AX, R14
-	ADDQ AX, R10
-	JMP  copy_5_end
-
-copy_5_move_3:
-	MOVW (R14), R15
-	MOVB 2(R14), BP
-	MOVW R15, (R10)
-	MOVB BP, 2(R10)
-	ADDQ AX, R14
-	ADDQ AX, R10
-	JMP  copy_5_end
-
-copy_5_move_4through7:
-	MOVL (R14), R15
-	MOVL -4(R14)(AX*1), BP
-	MOVL R15, (R10)
-	MOVL BP, -4(R10)(AX*1)
-	ADDQ AX, R14
-	ADDQ AX, R10
-	JMP  copy_5_end
-
-copy_5_move_8through16:
-	MOVQ (R14), R15
-	MOVQ -8(R14)(AX*1), BP
-	MOVQ R15, (R10)
-	MOVQ BP, -8(R10)(AX*1)
-	ADDQ AX, R14
-	ADDQ AX, R10
-
-copy_5_end:
-	ADDQ AX, R12
-	SUBQ AX, R13
-
-	// Copy match from the current buffer
-copy_match:
-	MOVQ R10, AX
-	SUBQ CX, AX
-
-	// ml <= mo
-	CMPQ R13, CX
-	JA   copy_overlapping_match
-
-	// Copy non-overlapping match
-	ADDQ R13, R12
-	MOVQ R13, CX
-	SUBQ $0x10, CX
-	JB   copy_2_small
-
-copy_2_loop:
-	MOVUPS (AX), X0
-	MOVUPS X0, (R10)
-	ADDQ   $0x10, AX
-	ADDQ   $0x10, R10
-	SUBQ   $0x10, CX
-	JAE    copy_2_loop
-	LEAQ   16(AX)(CX*1), AX
-	LEAQ   16(R10)(CX*1), R10
-	MOVUPS -16(AX), X0
-	MOVUPS X0, -16(R10)
-	JMP    copy_2_end
-
-copy_2_small:
-	CMPQ R13, $0x03
-	JE   copy_2_move_3
-	JB   copy_2_move_1or2
-	CMPQ R13, $0x08
-	JB   copy_2_move_4through7
-	JMP  copy_2_move_8through16
-
-copy_2_move_1or2:
-	MOVB (AX), CL
-	MOVB -1(AX)(R13*1), R14
-	MOVB CL, (R10)
-	MOVB R14, -1(R10)(R13*1)
-	ADDQ R13, AX
-	ADDQ R13, R10
-	JMP  copy_2_end
-
-copy_2_move_3:
-	MOVW (AX), CX
-	MOVB 2(AX), R14
-	MOVW CX, (R10)
-	MOVB R14, 2(R10)
-	ADDQ R13, AX
-	ADDQ R13, R10
-	JMP  copy_2_end
-
-copy_2_move_4through7:
-	MOVL (AX), CX
-	MOVL -4(AX)(R13*1), R14
-	MOVL CX, (R10)
-	MOVL R14, -4(R10)(R13*1)
-	ADDQ R13, AX
-	ADDQ R13, R10
-	JMP  copy_2_end
-
-copy_2_move_8through16:
-	MOVQ (AX), CX
-	MOVQ -8(AX)(R13*1), R14
-	MOVQ CX, (R10)
-	MOVQ R14, -8(R10)(R13*1)
-	ADDQ R13, AX
-	ADDQ R13, R10
-
-copy_2_end:
-	JMP handle_loop
-
-	// Copy overlapping match
-copy_overlapping_match:
-	ADDQ R13, R12
-
-copy_slow_3:
-	MOVB (AX), CL
-	MOVB CL, (R10)
-	INCQ AX
-	INCQ R10
-	DECQ R13
-	JNZ  copy_slow_3
-
-handle_loop:
-	MOVQ ctx+16(FP), AX
-	DECQ 96(AX)
-	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
-
-loop_finished:
-	MOVQ br+8(FP), AX
-	MOVQ DX, 24(AX)
-	MOVB BL, 40(AX)
-	MOVQ SI, 32(AX)
-
-	// Update the context
-	MOVQ ctx+16(FP), AX
-	MOVQ R12, 136(AX)
-	MOVQ 144(AX), CX
-	SUBQ CX, R11
-	MOVQ R11, 168(AX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
-	MOVQ 16(SP), AX
-	MOVQ ctx+16(FP), CX
-	MOVQ AX, 216(CX)
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-error_match_off_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 8(SP), CX
-	MOVQ CX, 224(AX)
-	MOVQ R12, 136(AX)
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-	// Return with not enough output space error
-error_not_enough_space:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ R12, 136(AX)
-	MOVQ $0x00000005, ret+24(FP)
-	RET
-
-// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: BMI, BMI2, CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
-	MOVQ    br+8(FP), BX
-	MOVQ    24(BX), AX
-	MOVBQZX 40(BX), DX
-	MOVQ    (BX), CX
-	MOVQ    32(BX), BX
-	ADDQ    BX, CX
-	MOVQ    CX, (SP)
-	MOVQ    ctx+16(FP), CX
-	MOVQ    72(CX), SI
-	MOVQ    80(CX), DI
-	MOVQ    88(CX), R8
-	XORQ    R9, R9
-	MOVQ    R9, 8(SP)
-	MOVQ    R9, 16(SP)
-	MOVQ    R9, 24(SP)
-	MOVQ    112(CX), R9
-	MOVQ    128(CX), R10
-	MOVQ    R10, 32(SP)
-	MOVQ    144(CX), R10
-	MOVQ    136(CX), R11
-	MOVQ    200(CX), R12
-	MOVQ    R12, 56(SP)
-	MOVQ    176(CX), R12
-	MOVQ    R12, 48(SP)
-	MOVQ    184(CX), CX
-	MOVQ    CX, 40(SP)
-	MOVQ    40(SP), CX
-	ADDQ    CX, 48(SP)
-
-	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
-	ADDQ R9, 32(SP)
-
-	// outBase += outPosition
-	ADDQ R11, R9
-
-sequenceDecs_decodeSync_safe_bmi2_main_loop:
-	MOVQ (SP), R12
-
-	// Fill bitreader to have enough for the offset and match length.
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R12
-	MOVQ (R12), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
-
-sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R12
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R12), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
-
-sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_safe_bmi2_fill_end:
-	// Update offset
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, R8, R13
-	MOVQ   AX, R14
-	LEAQ   (DX)(R13*1), CX
-	ROLQ   CL, R14
-	BZHIQ  R13, R14, R14
-	MOVQ   CX, DX
-	MOVQ   R8, CX
-	SHRQ   $0x20, CX
-	ADDQ   R14, CX
-	MOVQ   CX, 8(SP)
-
-	// Update match length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, DI, R13
-	MOVQ   AX, R14
-	LEAQ   (DX)(R13*1), CX
-	ROLQ   CL, R14
-	BZHIQ  R13, R14, R14
-	MOVQ   CX, DX
-	MOVQ   DI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R14, CX
-	MOVQ   CX, 16(SP)
-
-	// Fill bitreader to have enough for the remaining
-	CMPQ BX, $0x08
-	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
-	MOVQ DX, CX
-	SHRQ $0x03, CX
-	SUBQ CX, R12
-	MOVQ (R12), AX
-	SUBQ CX, BX
-	ANDQ $0x07, DX
-	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
-
-sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
-	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
-	CMPQ    DX, $0x07
-	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
-	SHLQ    $0x08, AX
-	SUBQ    $0x01, R12
-	SUBQ    $0x01, BX
-	SUBQ    $0x08, DX
-	MOVBQZX (R12), CX
-	ORQ     CX, AX
-	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
-	CMPQ DX, $0x40
-	JA   error_overread
-
-sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
-	// Update literal length
-	MOVQ   $0x00000808, CX
-	BEXTRQ CX, SI, R13
-	MOVQ   AX, R14
-	LEAQ   (DX)(R13*1), CX
-	ROLQ   CL, R14
-	BZHIQ  R13, R14, R14
-	MOVQ   CX, DX
-	MOVQ   SI, CX
-	SHRQ   $0x20, CX
-	ADDQ   R14, CX
-	MOVQ   CX, 24(SP)
-
-	// Fill bitreader for state updates
-	MOVQ    R12, (SP)
-	MOVQ    $0x00000808, CX
-	BEXTRQ  CX, R8, R12
-	MOVQ    ctx+16(FP), CX
-	CMPQ    96(CX), $0x00
-	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
-	LEAQ    (SI)(DI*1), R13
-	ADDQ    R8, R13
-	MOVBQZX R13, R13
-	LEAQ    (DX)(R13*1), CX
-	MOVQ    AX, R14
-	MOVQ    CX, DX
-	ROLQ    CL, R14
-	BZHIQ   R13, R14, R14
-
-	// Update Offset State
-	BZHIQ R8, R14, CX
-	SHRXQ R8, R14, R14
-	SHRL  $0x10, R8
-	ADDQ  CX, R8
-
-	// Load ctx.ofTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 48(CX), CX
-	MOVQ (CX)(R8*8), R8
-
-	// Update Match Length State
-	BZHIQ DI, R14, CX
-	SHRXQ DI, R14, R14
-	SHRL  $0x10, DI
-	ADDQ  CX, DI
-
-	// Load ctx.mlTable
-	MOVQ ctx+16(FP), CX
-	MOVQ 24(CX), CX
-	MOVQ (CX)(DI*8), DI
-
-	// Update Literal Length State
-	BZHIQ SI, R14, CX
-	SHRL  $0x10, SI
-	ADDQ  CX, SI
-
-	// Load ctx.llTable
-	MOVQ ctx+16(FP), CX
-	MOVQ (CX), CX
-	MOVQ (CX)(SI*8), SI
-
-sequenceDecs_decodeSync_safe_bmi2_skip_update:
-	// Adjust offset
-	MOVQ   s+0(FP), CX
-	MOVQ   8(SP), R13
-	CMPQ   R12, $0x01
-	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
-	MOVUPS 144(CX), X0
-	MOVQ   R13, 144(CX)
-	MOVUPS X0, 152(CX)
-	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
-	CMPQ 24(SP), $0x00000000
-	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
-	INCQ R13
-	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
-	MOVQ  144(CX), R13
-	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
-	MOVQ    R13, R12
-	XORQ    R14, R14
-	MOVQ    $-1, R15
-	CMPQ    R13, $0x03
-	CMOVQEQ R14, R12
-	CMOVQEQ R15, R14
-	ADDQ    144(CX)(R12*8), R14
-	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
-	MOVQ    $0x00000001, R14
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
-	CMPQ R13, $0x01
-	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
-	MOVQ 152(CX), R12
-	MOVQ R12, 160(CX)
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
-	MOVQ 144(CX), R12
-	MOVQ R12, 152(CX)
-	MOVQ R14, 144(CX)
-	MOVQ R14, R13
-
-sequenceDecs_decodeSync_safe_bmi2_after_adjust:
-	MOVQ R13, 8(SP)
-
-	// Check values
-	MOVQ  16(SP), CX
-	MOVQ  24(SP), R12
-	LEAQ  (CX)(R12*1), R14
-	MOVQ  s+0(FP), R15
-	ADDQ  R14, 256(R15)
-	MOVQ  ctx+16(FP), R14
-	SUBQ  R12, 104(R14)
-	JS    error_not_enough_literals
-	CMPQ  CX, $0x00020002
-	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
-	TESTQ R13, R13
-	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
-	TESTQ CX, CX
-	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
-	MOVQ 24(SP), CX
-	MOVQ 8(SP), R12
-	MOVQ 16(SP), R13
-
-	// Check if we have enough space in s.out
-	LEAQ (CX)(R13*1), R14
-	ADDQ R9, R14
-	CMPQ R14, 32(SP)
-	JA   error_not_enough_space
-
-	// Copy literals
-	TESTQ CX, CX
-	JZ    check_offset
-	MOVQ  CX, R14
-	SUBQ  $0x10, R14
-	JB    copy_1_small
-
-copy_1_loop:
-	MOVUPS (R10), X0
-	MOVUPS X0, (R9)
-	ADDQ   $0x10, R10
-	ADDQ   $0x10, R9
-	SUBQ   $0x10, R14
-	JAE    copy_1_loop
-	LEAQ   16(R10)(R14*1), R10
-	LEAQ   16(R9)(R14*1), R9
-	MOVUPS -16(R10), X0
-	MOVUPS X0, -16(R9)
-	JMP    copy_1_end
-
-copy_1_small:
-	CMPQ CX, $0x03
-	JE   copy_1_move_3
-	JB   copy_1_move_1or2
-	CMPQ CX, $0x08
-	JB   copy_1_move_4through7
-	JMP  copy_1_move_8through16
-
-copy_1_move_1or2:
-	MOVB (R10), R14
-	MOVB -1(R10)(CX*1), R15
-	MOVB R14, (R9)
-	MOVB R15, -1(R9)(CX*1)
-	ADDQ CX, R10
-	ADDQ CX, R9
-	JMP  copy_1_end
-
-copy_1_move_3:
-	MOVW (R10), R14
-	MOVB 2(R10), R15
-	MOVW R14, (R9)
-	MOVB R15, 2(R9)
-	ADDQ CX, R10
-	ADDQ CX, R9
-	JMP  copy_1_end
-
-copy_1_move_4through7:
-	MOVL (R10), R14
-	MOVL -4(R10)(CX*1), R15
-	MOVL R14, (R9)
-	MOVL R15, -4(R9)(CX*1)
-	ADDQ CX, R10
-	ADDQ CX, R9
-	JMP  copy_1_end
-
-copy_1_move_8through16:
-	MOVQ (R10), R14
-	MOVQ -8(R10)(CX*1), R15
-	MOVQ R14, (R9)
-	MOVQ R15, -8(R9)(CX*1)
-	ADDQ CX, R10
-	ADDQ CX, R9
-
-copy_1_end:
-	ADDQ CX, R11
-
-	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
-	MOVQ R11, CX
-	ADDQ 40(SP), CX
-	CMPQ R12, CX
-	JG   error_match_off_too_big
-	CMPQ R12, 56(SP)
-	JG   error_match_off_too_big
-
-	// Copy match from history
-	MOVQ R12, CX
-	SUBQ R11, CX
-	JLS  copy_match
-	MOVQ 48(SP), R14
-	SUBQ CX, R14
-	CMPQ R13, CX
-	JG   copy_all_from_history
-	MOVQ R13, CX
-	SUBQ $0x10, CX
-	JB   copy_4_small
-
-copy_4_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R9)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R9
-	SUBQ   $0x10, CX
-	JAE    copy_4_loop
-	LEAQ   16(R14)(CX*1), R14
-	LEAQ   16(R9)(CX*1), R9
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R9)
-	JMP    copy_4_end
-
-copy_4_small:
-	CMPQ R13, $0x03
-	JE   copy_4_move_3
-	CMPQ R13, $0x08
-	JB   copy_4_move_4through7
-	JMP  copy_4_move_8through16
-
-copy_4_move_3:
-	MOVW (R14), CX
-	MOVB 2(R14), R12
-	MOVW CX, (R9)
-	MOVB R12, 2(R9)
-	ADDQ R13, R14
-	ADDQ R13, R9
-	JMP  copy_4_end
-
-copy_4_move_4through7:
-	MOVL (R14), CX
-	MOVL -4(R14)(R13*1), R12
-	MOVL CX, (R9)
-	MOVL R12, -4(R9)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R9
-	JMP  copy_4_end
-
-copy_4_move_8through16:
-	MOVQ (R14), CX
-	MOVQ -8(R14)(R13*1), R12
-	MOVQ CX, (R9)
-	MOVQ R12, -8(R9)(R13*1)
-	ADDQ R13, R14
-	ADDQ R13, R9
-
-copy_4_end:
-	ADDQ R13, R11
-	JMP  handle_loop
-	JMP loop_finished
-
-copy_all_from_history:
-	MOVQ CX, R15
-	SUBQ $0x10, R15
-	JB   copy_5_small
-
-copy_5_loop:
-	MOVUPS (R14), X0
-	MOVUPS X0, (R9)
-	ADDQ   $0x10, R14
-	ADDQ   $0x10, R9
-	SUBQ   $0x10, R15
-	JAE    copy_5_loop
-	LEAQ   16(R14)(R15*1), R14
-	LEAQ   16(R9)(R15*1), R9
-	MOVUPS -16(R14), X0
-	MOVUPS X0, -16(R9)
-	JMP    copy_5_end
-
-copy_5_small:
-	CMPQ CX, $0x03
-	JE   copy_5_move_3
-	JB   copy_5_move_1or2
-	CMPQ CX, $0x08
-	JB   copy_5_move_4through7
-	JMP  copy_5_move_8through16
-
-copy_5_move_1or2:
-	MOVB (R14), R15
-	MOVB -1(R14)(CX*1), BP
-	MOVB R15, (R9)
-	MOVB BP, -1(R9)(CX*1)
-	ADDQ CX, R14
-	ADDQ CX, R9
-	JMP  copy_5_end
-
-copy_5_move_3:
-	MOVW (R14), R15
-	MOVB 2(R14), BP
-	MOVW R15, (R9)
-	MOVB BP, 2(R9)
-	ADDQ CX, R14
-	ADDQ CX, R9
-	JMP  copy_5_end
-
-copy_5_move_4through7:
-	MOVL (R14), R15
-	MOVL -4(R14)(CX*1), BP
-	MOVL R15, (R9)
-	MOVL BP, -4(R9)(CX*1)
-	ADDQ CX, R14
-	ADDQ CX, R9
-	JMP  copy_5_end
-
-copy_5_move_8through16:
-	MOVQ (R14), R15
-	MOVQ -8(R14)(CX*1), BP
-	MOVQ R15, (R9)
-	MOVQ BP, -8(R9)(CX*1)
-	ADDQ CX, R14
-	ADDQ CX, R9
-
-copy_5_end:
-	ADDQ CX, R11
-	SUBQ CX, R13
-
-	// Copy match from the current buffer
-copy_match:
-	MOVQ R9, CX
-	SUBQ R12, CX
-
-	// ml <= mo
-	CMPQ R13, R12
-	JA   copy_overlapping_match
-
-	// Copy non-overlapping match
-	ADDQ R13, R11
-	MOVQ R13, R12
-	SUBQ $0x10, R12
-	JB   copy_2_small
-
-copy_2_loop:
-	MOVUPS (CX), X0
-	MOVUPS X0, (R9)
-	ADDQ   $0x10, CX
-	ADDQ   $0x10, R9
-	SUBQ   $0x10, R12
-	JAE    copy_2_loop
-	LEAQ   16(CX)(R12*1), CX
-	LEAQ   16(R9)(R12*1), R9
-	MOVUPS -16(CX), X0
-	MOVUPS X0, -16(R9)
-	JMP    copy_2_end
-
-copy_2_small:
-	CMPQ R13, $0x03
-	JE   copy_2_move_3
-	JB   copy_2_move_1or2
-	CMPQ R13, $0x08
-	JB   copy_2_move_4through7
-	JMP  copy_2_move_8through16
-
-copy_2_move_1or2:
-	MOVB (CX), R12
-	MOVB -1(CX)(R13*1), R14
-	MOVB R12, (R9)
-	MOVB R14, -1(R9)(R13*1)
-	ADDQ R13, CX
-	ADDQ R13, R9
-	JMP  copy_2_end
-
-copy_2_move_3:
-	MOVW (CX), R12
-	MOVB 2(CX), R14
-	MOVW R12, (R9)
-	MOVB R14, 2(R9)
-	ADDQ R13, CX
-	ADDQ R13, R9
-	JMP  copy_2_end
-
-copy_2_move_4through7:
-	MOVL (CX), R12
-	MOVL -4(CX)(R13*1), R14
-	MOVL R12, (R9)
-	MOVL R14, -4(R9)(R13*1)
-	ADDQ R13, CX
-	ADDQ R13, R9
-	JMP  copy_2_end
-
-copy_2_move_8through16:
-	MOVQ (CX), R12
-	MOVQ -8(CX)(R13*1), R14
-	MOVQ R12, (R9)
-	MOVQ R14, -8(R9)(R13*1)
-	ADDQ R13, CX
-	ADDQ R13, R9
-
-copy_2_end:
-	JMP handle_loop
-
-	// Copy overlapping match
-copy_overlapping_match:
-	ADDQ R13, R11
-
-copy_slow_3:
-	MOVB (CX), R12
-	MOVB R12, (R9)
-	INCQ CX
-	INCQ R9
-	DECQ R13
-	JNZ  copy_slow_3
-
-handle_loop:
-	MOVQ ctx+16(FP), CX
-	DECQ 96(CX)
-	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
-
-loop_finished:
-	MOVQ br+8(FP), CX
-	MOVQ AX, 24(CX)
-	MOVB DL, 40(CX)
-	MOVQ BX, 32(CX)
-
-	// Update the context
-	MOVQ ctx+16(FP), AX
-	MOVQ R11, 136(AX)
-	MOVQ 144(AX), CX
-	SUBQ CX, R10
-	MOVQ R10, 168(AX)
-
-	// Return success
-	MOVQ $0x00000000, ret+24(FP)
-	RET
-
-	// Return with match length error
-sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
-	MOVQ 16(SP), AX
-	MOVQ ctx+16(FP), CX
-	MOVQ AX, 216(CX)
-	MOVQ $0x00000001, ret+24(FP)
-	RET
-
-	// Return with match too long error
-sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ $0x00000002, ret+24(FP)
-	RET
-
-	// Return with match offset too long error
-error_match_off_too_big:
-	MOVQ ctx+16(FP), AX
-	MOVQ 8(SP), CX
-	MOVQ CX, 224(AX)
-	MOVQ R11, 136(AX)
-	MOVQ $0x00000003, ret+24(FP)
-	RET
-
-	// Return with not enough literals error
-error_not_enough_literals:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ $0x00000004, ret+24(FP)
-	RET
-
-	// Return with overread error
-error_overread:
-	MOVQ $0x00000006, ret+24(FP)
-	RET
-
-	// Return with not enough output space error
-error_not_enough_space:
-	MOVQ ctx+16(FP), AX
-	MOVQ 24(SP), CX
-	MOVQ CX, 208(AX)
-	MOVQ 16(SP), CX
-	MOVQ CX, 216(AX)
-	MOVQ R11, 136(AX)
-	MOVQ $0x00000005, ret+24(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
deleted file mode 100644
index 7cec2197c..000000000
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
+++ /dev/null
@@ -1,237 +0,0 @@
-//go:build !amd64 || appengine || !gc || noasm
-// +build !amd64 appengine !gc noasm
-
-package zstd
-
-import (
-	"fmt"
-	"io"
-)
-
-// decode sequences from the stream with the provided history but without dictionary.
-func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
-	return false, nil
-}
-
-// decode sequences from the stream without the provided history.
-func (s *sequenceDecs) decode(seqs []seqVals) error {
-	br := s.br
-
-	// Grab full sizes tables, to avoid bounds checks.
-	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
-	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-	s.seqSize = 0
-	litRemain := len(s.literals)
-
-	maxBlockSize := maxCompressedBlockSize
-	if s.windowSize < maxBlockSize {
-		maxBlockSize = s.windowSize
-	}
-	for i := range seqs {
-		var ll, mo, ml int
-		if br.cursor > 4+((maxOffsetBits+16+16)>>3) {
-			// inlined function:
-			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
-
-			// Final will not read from stream.
-			var llB, mlB, moB uint8
-			ll, llB = llState.final()
-			ml, mlB = mlState.final()
-			mo, moB = ofState.final()
-
-			// extra bits are stored in reverse order.
-			br.fillFast()
-			mo += br.getBits(moB)
-			if s.maxBits > 32 {
-				br.fillFast()
-			}
-			ml += br.getBits(mlB)
-			ll += br.getBits(llB)
-
-			if moB > 1 {
-				s.prevOffset[2] = s.prevOffset[1]
-				s.prevOffset[1] = s.prevOffset[0]
-				s.prevOffset[0] = mo
-			} else {
-				// mo = s.adjustOffset(mo, ll, moB)
-				// Inlined for rather big speedup
-				if ll == 0 {
-					// There is an exception though, when current sequence's literals_length = 0.
-					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
-					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
-					mo++
-				}
-
-				if mo == 0 {
-					mo = s.prevOffset[0]
-				} else {
-					var temp int
-					if mo == 3 {
-						temp = s.prevOffset[0] - 1
-					} else {
-						temp = s.prevOffset[mo]
-					}
-
-					if temp == 0 {
-						// 0 is not valid; input is corrupted; force offset to 1
-						println("WARNING: temp was 0")
-						temp = 1
-					}
-
-					if mo != 1 {
-						s.prevOffset[2] = s.prevOffset[1]
-					}
-					s.prevOffset[1] = s.prevOffset[0]
-					s.prevOffset[0] = temp
-					mo = temp
-				}
-			}
-			br.fillFast()
-		} else {
-			if br.overread() {
-				if debugDecoder {
-					printf("reading sequence %d, exceeded available data\n", i)
-				}
-				return io.ErrUnexpectedEOF
-			}
-			ll, mo, ml = s.next(br, llState, mlState, ofState)
-			br.fill()
-		}
-
-		if debugSequences {
-			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
-		}
-		// Evaluate.
-		// We might be doing this async, so do it early.
-		if mo == 0 && ml > 0 {
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
-		}
-		if ml > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
-		}
-		s.seqSize += ll + ml
-		if s.seqSize > maxBlockSize {
-			return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-		}
-		litRemain -= ll
-		if litRemain < 0 {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
-		}
-		seqs[i] = seqVals{
-			ll: ll,
-			ml: ml,
-			mo: mo,
-		}
-		if i == len(seqs)-1 {
-			// This is the last sequence, so we shouldn't update state.
-			break
-		}
-
-		// Manually inlined, ~ 5-20% faster
-		// Update all 3 states at once. Approx 20% faster.
-		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
-		if nBits == 0 {
-			llState = llTable[llState.newState()&maxTableMask]
-			mlState = mlTable[mlState.newState()&maxTableMask]
-			ofState = ofTable[ofState.newState()&maxTableMask]
-		} else {
-			bits := br.get32BitsFast(nBits)
-			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
-			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits >> (ofState.nbBits() & 31))
-			lowBits &= bitMask[mlState.nbBits()&15]
-			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
-			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
-		}
-	}
-	s.seqSize += litRemain
-	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
-	}
-	err := br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
-	return err
-}
-
-// executeSimple handles cases when a dictionary is not used.
-func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
-	// Ensure we have enough output size...
-	if len(s.out)+s.seqSize > cap(s.out) {
-		addBytes := s.seqSize + len(s.out)
-		s.out = append(s.out, make([]byte, addBytes)...)
-		s.out = s.out[:len(s.out)-addBytes]
-	}
-
-	if debugDecoder {
-		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
-	}
-
-	var t = len(s.out)
-	out := s.out[:t+s.seqSize]
-
-	for _, seq := range seqs {
-		// Add literals
-		copy(out[t:], s.literals[:seq.ll])
-		t += seq.ll
-		s.literals = s.literals[seq.ll:]
-
-		// Malformed input
-		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
-			return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
-		}
-
-		// Copy from history.
-		if v := seq.mo - t; v > 0 {
-			// v is the start position in history from end.
-			start := len(hist) - v
-			if seq.ml > v {
-				// Some goes into the current block.
-				// Copy remainder of history
-				copy(out[t:], hist[start:])
-				t += v
-				seq.ml -= v
-			} else {
-				copy(out[t:], hist[start:start+seq.ml])
-				t += seq.ml
-				continue
-			}
-		}
-
-		// We must be in the current buffer now
-		if seq.ml > 0 {
-			start := t - seq.mo
-			if seq.ml <= t-start {
-				// No overlap
-				copy(out[t:], out[start:start+seq.ml])
-				t += seq.ml
-			} else {
-				// Overlapping copy
-				// Extend destination slice and copy one byte at the time.
-				src := out[start : start+seq.ml]
-				dst := out[t:]
-				dst = dst[:len(src)]
-				t += len(src)
-				// Destination is the space we just added.
-				for i := range src {
-					dst[i] = src[i]
-				}
-			}
-		}
-	}
-	// Add final literals
-	copy(out[t:], s.literals)
-	if debugDecoder {
-		t += len(s.literals)
-		if t != len(out) {
-			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
-		}
-	}
-	s.out = out
-
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go
deleted file mode 100644
index 65045eabd..000000000
--- a/vendor/github.com/klauspost/compress/zstd/seqenc.go
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import "math/bits"
-
-type seqCoders struct {
-	llEnc, ofEnc, mlEnc    *fseEncoder
-	llPrev, ofPrev, mlPrev *fseEncoder
-}
-
-// swap coders with another (block).
-func (s *seqCoders) swap(other *seqCoders) {
-	*s, *other = *other, *s
-}
-
-// setPrev will update the previous encoders to the actually used ones
-// and make sure a fresh one is in the main slot.
-func (s *seqCoders) setPrev(ll, ml, of *fseEncoder) {
-	compareSwap := func(used *fseEncoder, current, prev **fseEncoder) {
-		// We used the new one, more current to history and reuse the previous history
-		if *current == used {
-			*prev, *current = *current, *prev
-			c := *current
-			p := *prev
-			c.reUsed = false
-			p.reUsed = true
-			return
-		}
-		if used == *prev {
-			return
-		}
-		// Ensure we cannot reuse by accident
-		prevEnc := *prev
-		prevEnc.symbolLen = 0
-	}
-	compareSwap(ll, &s.llEnc, &s.llPrev)
-	compareSwap(ml, &s.mlEnc, &s.mlPrev)
-	compareSwap(of, &s.ofEnc, &s.ofPrev)
-}
-
-func highBit(val uint32) (n uint32) {
-	return uint32(bits.Len32(val) - 1)
-}
-
-var llCodeTable = [64]byte{0, 1, 2, 3, 4, 5, 6, 7,
-	8, 9, 10, 11, 12, 13, 14, 15,
-	16, 16, 17, 17, 18, 18, 19, 19,
-	20, 20, 20, 20, 21, 21, 21, 21,
-	22, 22, 22, 22, 22, 22, 22, 22,
-	23, 23, 23, 23, 23, 23, 23, 23,
-	24, 24, 24, 24, 24, 24, 24, 24,
-	24, 24, 24, 24, 24, 24, 24, 24}
-
-// Up to 6 bits
-const maxLLCode = 35
-
-// llBitsTable translates from ll code to number of bits.
-var llBitsTable = [maxLLCode + 1]byte{
-	0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	1, 1, 1, 1, 2, 2, 3, 3,
-	4, 6, 7, 8, 9, 10, 11, 12,
-	13, 14, 15, 16}
-
-// llCode returns the code that represents the literal length requested.
-func llCode(litLength uint32) uint8 {
-	const llDeltaCode = 19
-	if litLength <= 63 {
-		return llCodeTable[litLength&63]
-	}
-	return uint8(highBit(litLength)) + llDeltaCode
-}
-
-var mlCodeTable = [128]byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-	32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
-	38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
-	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
-	41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
-	42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
-	42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42}
-
-// Up to 6 bits
-const maxMLCode = 52
-
-// mlBitsTable translates from ml code to number of bits.
-var mlBitsTable = [maxMLCode + 1]byte{
-	0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	1, 1, 1, 1, 2, 2, 3, 3,
-	4, 4, 5, 7, 8, 9, 10, 11,
-	12, 13, 14, 15, 16}
-
-// note : mlBase = matchLength - MINMATCH;
-// because it's the format it's stored in seqStore->sequences
-func mlCode(mlBase uint32) uint8 {
-	const mlDeltaCode = 36
-	if mlBase <= 127 {
-		return mlCodeTable[mlBase&127]
-	}
-	return uint8(highBit(mlBase)) + mlDeltaCode
-}
-
-func ofCode(offset uint32) uint8 {
-	// A valid offset will always be > 0.
-	return uint8(bits.Len32(offset) - 1)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
deleted file mode 100644
index a17381b8f..000000000
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ /dev/null
@@ -1,434 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-import (
-	"encoding/binary"
-	"errors"
-	"hash/crc32"
-	"io"
-
-	"github.com/klauspost/compress/huff0"
-	snappy "github.com/klauspost/compress/internal/snapref"
-)
-
-const (
-	snappyTagLiteral = 0x00
-	snappyTagCopy1   = 0x01
-	snappyTagCopy2   = 0x02
-	snappyTagCopy4   = 0x03
-)
-
-const (
-	snappyChecksumSize = 4
-	snappyMagicBody    = "sNaPpY"
-
-	// snappyMaxBlockSize is the maximum size of the input to encodeBlock. It is not
-	// part of the wire format per se, but some parts of the encoder assume
-	// that an offset fits into a uint16.
-	//
-	// Also, for the framing format (Writer type instead of Encode function),
-	// https://github.com/google/snappy/blob/master/framing_format.txt says
-	// that "the uncompressed data in a chunk must be no longer than 65536
-	// bytes".
-	snappyMaxBlockSize = 65536
-
-	// snappyMaxEncodedLenOfMaxBlockSize equals MaxEncodedLen(snappyMaxBlockSize), but is
-	// hard coded to be a const instead of a variable, so that obufLen can also
-	// be a const. Their equivalence is confirmed by
-	// TestMaxEncodedLenOfMaxBlockSize.
-	snappyMaxEncodedLenOfMaxBlockSize = 76490
-)
-
-const (
-	chunkTypeCompressedData   = 0x00
-	chunkTypeUncompressedData = 0x01
-	chunkTypePadding          = 0xfe
-	chunkTypeStreamIdentifier = 0xff
-)
-
-var (
-	// ErrSnappyCorrupt reports that the input is invalid.
-	ErrSnappyCorrupt = errors.New("snappy: corrupt input")
-	// ErrSnappyTooLarge reports that the uncompressed length is too large.
-	ErrSnappyTooLarge = errors.New("snappy: decoded block is too large")
-	// ErrSnappyUnsupported reports that the input isn't supported.
-	ErrSnappyUnsupported = errors.New("snappy: unsupported input")
-
-	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
-)
-
-// SnappyConverter can read SnappyConverter-compressed streams and convert them to zstd.
-// Conversion is done by converting the stream directly from Snappy without intermediate
-// full decoding.
-// Therefore the compression ratio is much less than what can be done by a full decompression
-// and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
-// any errors being generated.
-// No CRC value is being generated and not all CRC values of the Snappy stream are checked.
-// However, it provides really fast recompression of Snappy streams.
-// The converter can be reused to avoid allocations, even after errors.
-type SnappyConverter struct {
-	r     io.Reader
-	err   error
-	buf   []byte
-	block *blockEnc
-}
-
-// Convert the Snappy stream supplied in 'in' and write the zStandard stream to 'w'.
-// If any error is detected on the Snappy stream it is returned.
-// The number of bytes written is returned.
-func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
-	initPredefined()
-	r.err = nil
-	r.r = in
-	if r.block == nil {
-		r.block = &blockEnc{}
-		r.block.init()
-	}
-	r.block.initNewEncode()
-	if len(r.buf) != snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize {
-		r.buf = make([]byte, snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize)
-	}
-	r.block.litEnc.Reuse = huff0.ReusePolicyNone
-	var written int64
-	var readHeader bool
-	{
-		header := frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
-
-		var n int
-		n, r.err = w.Write(header)
-		if r.err != nil {
-			return written, r.err
-		}
-		written += int64(n)
-	}
-
-	for {
-		if !r.readFull(r.buf[:4], true) {
-			// Add empty last block
-			r.block.reset(nil)
-			r.block.last = true
-			err := r.block.encodeLits(r.block.literals, false)
-			if err != nil {
-				return written, err
-			}
-			n, err := w.Write(r.block.output)
-			if err != nil {
-				return written, err
-			}
-			written += int64(n)
-
-			return written, r.err
-		}
-		chunkType := r.buf[0]
-		if !readHeader {
-			if chunkType != chunkTypeStreamIdentifier {
-				println("chunkType != chunkTypeStreamIdentifier", chunkType)
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			readHeader = true
-		}
-		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
-		if chunkLen > len(r.buf) {
-			println("chunkLen > len(r.buf)", chunkType)
-			r.err = ErrSnappyUnsupported
-			return written, r.err
-		}
-
-		// The chunk types are specified at
-		// https://github.com/google/snappy/blob/master/framing_format.txt
-		switch chunkType {
-		case chunkTypeCompressedData:
-			// Section 4.2. Compressed data (chunk type 0x00).
-			if chunkLen < snappyChecksumSize {
-				println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			buf := r.buf[:chunkLen]
-			if !r.readFull(buf, false) {
-				return written, r.err
-			}
-			//checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
-			buf = buf[snappyChecksumSize:]
-
-			n, hdr, err := snappyDecodedLen(buf)
-			if err != nil {
-				r.err = err
-				return written, r.err
-			}
-			buf = buf[hdr:]
-			if n > snappyMaxBlockSize {
-				println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			r.block.reset(nil)
-			r.block.pushOffsets()
-			if err := decodeSnappy(r.block, buf); err != nil {
-				r.err = err
-				return written, r.err
-			}
-			if r.block.size+r.block.extraLits != n {
-				printf("invalid size, want %d, got %d\n", n, r.block.size+r.block.extraLits)
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			err = r.block.encode(nil, false, false)
-			switch err {
-			case errIncompressible:
-				r.block.popOffsets()
-				r.block.reset(nil)
-				r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
-				if err != nil {
-					return written, err
-				}
-				err = r.block.encodeLits(r.block.literals, false)
-				if err != nil {
-					return written, err
-				}
-			case nil:
-			default:
-				return written, err
-			}
-
-			n, r.err = w.Write(r.block.output)
-			if r.err != nil {
-				return written, r.err
-			}
-			written += int64(n)
-			continue
-		case chunkTypeUncompressedData:
-			if debugEncoder {
-				println("Uncompressed, chunklen", chunkLen)
-			}
-			// Section 4.3. Uncompressed data (chunk type 0x01).
-			if chunkLen < snappyChecksumSize {
-				println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			r.block.reset(nil)
-			buf := r.buf[:snappyChecksumSize]
-			if !r.readFull(buf, false) {
-				return written, r.err
-			}
-			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
-			// Read directly into r.decoded instead of via r.buf.
-			n := chunkLen - snappyChecksumSize
-			if n > snappyMaxBlockSize {
-				println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			r.block.literals = r.block.literals[:n]
-			if !r.readFull(r.block.literals, false) {
-				return written, r.err
-			}
-			if snappyCRC(r.block.literals) != checksum {
-				println("literals crc mismatch")
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			err := r.block.encodeLits(r.block.literals, false)
-			if err != nil {
-				return written, err
-			}
-			n, r.err = w.Write(r.block.output)
-			if r.err != nil {
-				return written, r.err
-			}
-			written += int64(n)
-			continue
-
-		case chunkTypeStreamIdentifier:
-			if debugEncoder {
-				println("stream id", chunkLen, len(snappyMagicBody))
-			}
-			// Section 4.1. Stream identifier (chunk type 0xff).
-			if chunkLen != len(snappyMagicBody) {
-				println("chunkLen != len(snappyMagicBody)", chunkLen, len(snappyMagicBody))
-				r.err = ErrSnappyCorrupt
-				return written, r.err
-			}
-			if !r.readFull(r.buf[:len(snappyMagicBody)], false) {
-				return written, r.err
-			}
-			for i := 0; i < len(snappyMagicBody); i++ {
-				if r.buf[i] != snappyMagicBody[i] {
-					println("r.buf[i] != snappyMagicBody[i]", r.buf[i], snappyMagicBody[i], i)
-					r.err = ErrSnappyCorrupt
-					return written, r.err
-				}
-			}
-			continue
-		}
-
-		if chunkType <= 0x7f {
-			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
-			println("chunkType <= 0x7f")
-			r.err = ErrSnappyUnsupported
-			return written, r.err
-		}
-		// Section 4.4 Padding (chunk type 0xfe).
-		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
-		if !r.readFull(r.buf[:chunkLen], false) {
-			return written, r.err
-		}
-	}
-}
-
-// decodeSnappy writes the decoding of src to dst. It assumes that the varint-encoded
-// length of the decompressed bytes has already been read.
-func decodeSnappy(blk *blockEnc, src []byte) error {
-	//decodeRef(make([]byte, snappyMaxBlockSize), src)
-	var s, length int
-	lits := blk.extraLits
-	var offset uint32
-	for s < len(src) {
-		switch src[s] & 0x03 {
-		case snappyTagLiteral:
-			x := uint32(src[s] >> 2)
-			switch {
-			case x < 60:
-				s++
-			case x == 60:
-				s += 2
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					println("uint(s) > uint(len(src)", s, src)
-					return ErrSnappyCorrupt
-				}
-				x = uint32(src[s-1])
-			case x == 61:
-				s += 3
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					println("uint(s) > uint(len(src)", s, src)
-					return ErrSnappyCorrupt
-				}
-				x = uint32(src[s-2]) | uint32(src[s-1])<<8
-			case x == 62:
-				s += 4
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					println("uint(s) > uint(len(src)", s, src)
-					return ErrSnappyCorrupt
-				}
-				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-			case x == 63:
-				s += 5
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					println("uint(s) > uint(len(src)", s, src)
-					return ErrSnappyCorrupt
-				}
-				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-			}
-			if x > snappyMaxBlockSize {
-				println("x > snappyMaxBlockSize", x, snappyMaxBlockSize)
-				return ErrSnappyCorrupt
-			}
-			length = int(x) + 1
-			if length <= 0 {
-				println("length <= 0 ", length)
-
-				return errUnsupportedLiteralLength
-			}
-			//if length > snappyMaxBlockSize-d || uint32(length) > len(src)-s {
-			//	return ErrSnappyCorrupt
-			//}
-
-			blk.literals = append(blk.literals, src[s:s+length]...)
-			//println(length, "litLen")
-			lits += length
-			s += length
-			continue
-
-		case snappyTagCopy1:
-			s += 2
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				println("uint(s) > uint(len(src)", s, len(src))
-				return ErrSnappyCorrupt
-			}
-			length = 4 + int(src[s-2])>>2&0x7
-			offset = uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])
-
-		case snappyTagCopy2:
-			s += 3
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				println("uint(s) > uint(len(src)", s, len(src))
-				return ErrSnappyCorrupt
-			}
-			length = 1 + int(src[s-3])>>2
-			offset = uint32(src[s-2]) | uint32(src[s-1])<<8
-
-		case snappyTagCopy4:
-			s += 5
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				println("uint(s) > uint(len(src)", s, len(src))
-				return ErrSnappyCorrupt
-			}
-			length = 1 + int(src[s-5])>>2
-			offset = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-		}
-
-		if offset <= 0 || blk.size+lits < int(offset) /*|| length > len(blk)-d */ {
-			println("offset <= 0 || blk.size+lits < int(offset)", offset, blk.size+lits, int(offset), blk.size, lits)
-
-			return ErrSnappyCorrupt
-		}
-
-		// Check if offset is one of the recent offsets.
-		// Adjusts the output offset accordingly.
-		// Gives a tiny bit of compression, typically around 1%.
-		if false {
-			offset = blk.matchOffset(offset, uint32(lits))
-		} else {
-			offset += 3
-		}
-
-		blk.sequences = append(blk.sequences, seq{
-			litLen:   uint32(lits),
-			offset:   offset,
-			matchLen: uint32(length) - zstdMinMatch,
-		})
-		blk.size += length + lits
-		lits = 0
-	}
-	blk.extraLits = lits
-	return nil
-}
-
-func (r *SnappyConverter) readFull(p []byte, allowEOF bool) (ok bool) {
-	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
-		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
-			r.err = ErrSnappyCorrupt
-		}
-		return false
-	}
-	return true
-}
-
-var crcTable = crc32.MakeTable(crc32.Castagnoli)
-
-// crc implements the checksum specified in section 3 of
-// https://github.com/google/snappy/blob/master/framing_format.txt
-func snappyCRC(b []byte) uint32 {
-	c := crc32.Update(0, crcTable, b)
-	return c>>15 | c<<17 + 0xa282ead8
-}
-
-// snappyDecodedLen returns the length of the decoded block and the number of bytes
-// that the length header occupied.
-func snappyDecodedLen(src []byte) (blockLen, headerLen int, err error) {
-	v, n := binary.Uvarint(src)
-	if n <= 0 || v > 0xffffffff {
-		return 0, 0, ErrSnappyCorrupt
-	}
-
-	const wordSize = 32 << (^uint(0) >> 32 & 1)
-	if wordSize == 32 && v > 0x7fffffff {
-		return 0, 0, ErrSnappyTooLarge
-	}
-	return int(v), n, nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
deleted file mode 100644
index 29c15c8c4..000000000
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-
-package zstd
-
-import (
-	"errors"
-	"io"
-	"sync"
-)
-
-// ZipMethodWinZip is the method for Zstandard compressed data inside Zip files for WinZip.
-// See https://www.winzip.com/win/en/comp_info.html
-const ZipMethodWinZip = 93
-
-// ZipMethodPKWare is the original method number used by PKWARE to indicate Zstandard compression.
-// Deprecated: This has been deprecated by PKWARE, use ZipMethodWinZip instead for compression.
-// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
-const ZipMethodPKWare = 20
-
-// zipReaderPool is the default reader pool.
-var zipReaderPool = sync.Pool{New: func() interface{} {
-	z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
-	if err != nil {
-		panic(err)
-	}
-	return z
-}}
-
-// newZipReader creates a pooled zip decompressor.
-func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
-	pool := &zipReaderPool
-	if len(opts) > 0 {
-		opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
-		// Force concurrency 1
-		opts = append(opts, WithDecoderConcurrency(1))
-		// Create our own pool
-		pool = &sync.Pool{}
-	}
-	return func(r io.Reader) io.ReadCloser {
-		dec, ok := pool.Get().(*Decoder)
-		if ok {
-			dec.Reset(r)
-		} else {
-			d, err := NewReader(r, opts...)
-			if err != nil {
-				panic(err)
-			}
-			dec = d
-		}
-		return &pooledZipReader{dec: dec, pool: pool}
-	}
-}
-
-type pooledZipReader struct {
-	mu   sync.Mutex // guards Close and Read
-	pool *sync.Pool
-	dec  *Decoder
-}
-
-func (r *pooledZipReader) Read(p []byte) (n int, err error) {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	if r.dec == nil {
-		return 0, errors.New("read after close or EOF")
-	}
-	dec, err := r.dec.Read(p)
-	if err == io.EOF {
-		r.dec.Reset(nil)
-		r.pool.Put(r.dec)
-		r.dec = nil
-	}
-	return dec, err
-}
-
-func (r *pooledZipReader) Close() error {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	var err error
-	if r.dec != nil {
-		err = r.dec.Reset(nil)
-		r.pool.Put(r.dec)
-		r.dec = nil
-	}
-	return err
-}
-
-type pooledZipWriter struct {
-	mu   sync.Mutex // guards Close and Read
-	enc  *Encoder
-	pool *sync.Pool
-}
-
-func (w *pooledZipWriter) Write(p []byte) (n int, err error) {
-	w.mu.Lock()
-	defer w.mu.Unlock()
-	if w.enc == nil {
-		return 0, errors.New("Write after Close")
-	}
-	return w.enc.Write(p)
-}
-
-func (w *pooledZipWriter) Close() error {
-	w.mu.Lock()
-	defer w.mu.Unlock()
-	var err error
-	if w.enc != nil {
-		err = w.enc.Close()
-		w.pool.Put(w.enc)
-		w.enc = nil
-	}
-	return err
-}
-
-// ZipCompressor returns a compressor that can be registered with zip libraries.
-// The provided encoder options will be used on all encodes.
-func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
-	var pool sync.Pool
-	return func(w io.Writer) (io.WriteCloser, error) {
-		enc, ok := pool.Get().(*Encoder)
-		if ok {
-			enc.Reset(w)
-		} else {
-			var err error
-			enc, err = NewWriter(w, opts...)
-			if err != nil {
-				return nil, err
-			}
-		}
-		return &pooledZipWriter{enc: enc, pool: &pool}, nil
-	}
-}
-
-// ZipDecompressor returns a decompressor that can be registered with zip libraries.
-// See ZipCompressor for example.
-// Options can be specified. WithDecoderConcurrency(1) is forced,
-// and by default a 128MB maximum decompression window is specified.
-// The window size can be overridden if required.
-func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
-	return newZipReader(opts...)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
deleted file mode 100644
index 6252b46ae..000000000
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ /dev/null
@@ -1,126 +0,0 @@
-// Package zstd provides decompression of zstandard files.
-//
-// For advanced usage and examples, go to the README: https://github.com/klauspost/compress/tree/master/zstd#zstd
-package zstd
-
-import (
-	"bytes"
-	"errors"
-	"log"
-	"math"
-
-	"github.com/klauspost/compress/internal/le"
-)
-
-// enable debug printing
-const debug = false
-
-// enable encoding debug printing
-const debugEncoder = debug
-
-// enable decoding debug printing
-const debugDecoder = debug
-
-// Enable extra assertions.
-const debugAsserts = debug || false
-
-// print sequence details
-const debugSequences = false
-
-// print detailed matching information
-const debugMatches = false
-
-// force encoder to use predefined tables.
-const forcePreDef = false
-
-// zstdMinMatch is the minimum zstd match length.
-const zstdMinMatch = 3
-
-// fcsUnknown is used for unknown frame content size.
-const fcsUnknown = math.MaxUint64
-
-var (
-	// ErrReservedBlockType is returned when a reserved block type is found.
-	// Typically this indicates wrong or corrupted input.
-	ErrReservedBlockType = errors.New("invalid input: reserved block type encountered")
-
-	// ErrCompressedSizeTooBig is returned when a block is bigger than allowed.
-	// Typically this indicates wrong or corrupted input.
-	ErrCompressedSizeTooBig = errors.New("invalid input: compressed size too big")
-
-	// ErrBlockTooSmall is returned when a block is too small to be decoded.
-	// Typically returned on invalid input.
-	ErrBlockTooSmall = errors.New("block too small")
-
-	// ErrUnexpectedBlockSize is returned when a block has unexpected size.
-	// Typically returned on invalid input.
-	ErrUnexpectedBlockSize = errors.New("unexpected block size")
-
-	// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
-	// Typically this indicates wrong or corrupted input.
-	ErrMagicMismatch = errors.New("invalid input: magic number mismatch")
-
-	// ErrWindowSizeExceeded is returned when a reference exceeds the valid window size.
-	// Typically this indicates wrong or corrupted input.
-	ErrWindowSizeExceeded = errors.New("window size exceeded")
-
-	// ErrWindowSizeTooSmall is returned when no window size is specified.
-	// Typically this indicates wrong or corrupted input.
-	ErrWindowSizeTooSmall = errors.New("invalid input: window size was too small")
-
-	// ErrDecoderSizeExceeded is returned if decompressed size exceeds the configured limit.
-	ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
-
-	// ErrUnknownDictionary is returned if the dictionary ID is unknown.
-	ErrUnknownDictionary = errors.New("unknown dictionary")
-
-	// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
-	// This is only returned if SingleSegment is specified on the frame.
-	ErrFrameSizeExceeded = errors.New("frame size exceeded")
-
-	// ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size.
-	// This is only returned if SingleSegment is specified on the frame.
-	ErrFrameSizeMismatch = errors.New("frame size does not match size on stream")
-
-	// ErrCRCMismatch is returned if CRC mismatches.
-	ErrCRCMismatch = errors.New("CRC check failed")
-
-	// ErrDecoderClosed will be returned if the Decoder was used after
-	// Close has been called.
-	ErrDecoderClosed = errors.New("decoder used after Close")
-
-	// ErrEncoderClosed will be returned if the Encoder was used after
-	// Close has been called.
-	ErrEncoderClosed = errors.New("encoder used after Close")
-
-	// ErrDecoderNilInput is returned when a nil Reader was provided
-	// and an operation other than Reset/DecodeAll/Close was attempted.
-	ErrDecoderNilInput = errors.New("nil input provided as reader")
-)
-
-func println(a ...interface{}) {
-	if debug || debugDecoder || debugEncoder {
-		log.Println(a...)
-	}
-}
-
-func printf(format string, a ...interface{}) {
-	if debug || debugDecoder || debugEncoder {
-		log.Printf(format, a...)
-	}
-}
-
-func load3232(b []byte, i int32) uint32 {
-	return le.Load32(b, i)
-}
-
-func load6432(b []byte, i int32) uint64 {
-	return le.Load64(b, i)
-}
-
-type byter interface {
-	Bytes() []byte
-	Len() int
-}
-
-var _ byter = &bytes.Buffer{}
diff --git a/vendor/github.com/prometheus/client_golang/prometheus/collectorfunc.go b/vendor/github.com/prometheus/client_golang/prometheus/collectorfunc.go
new file mode 100644
index 000000000..9a71a15db
--- /dev/null
+++ b/vendor/github.com/prometheus/client_golang/prometheus/collectorfunc.go
@@ -0,0 +1,30 @@
+// Copyright 2025 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package prometheus
+
+// CollectorFunc is a convenient way to implement a Prometheus Collector
+// without interface boilerplate.
+// This implementation is based on DescribeByCollect method.
+// familiarize yourself to it before using.
+type CollectorFunc func(chan<- Metric)
+
+// Collect calls the defined CollectorFunc function with the provided Metrics channel
+func (f CollectorFunc) Collect(ch chan<- Metric) {
+	f(ch)
+}
+
+// Describe sends the descriptor information using DescribeByCollect
+func (f CollectorFunc) Describe(ch chan<- *Desc) {
+	DescribeByCollect(f, ch)
+}
diff --git a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go
index 28eed2672..763d99e36 100644
--- a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go
+++ b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go
@@ -41,11 +41,11 @@ import (
 	"sync"
 	"time"
 
-	"github.com/klauspost/compress/zstd"
 	"github.com/prometheus/common/expfmt"
 
 	"github.com/prometheus/client_golang/internal/github.com/golang/gddo/httputil"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp/internal"
 )
 
 const (
@@ -65,7 +65,13 @@ const (
 	Zstd     Compression = "zstd"
 )
 
-var defaultCompressionFormats = []Compression{Identity, Gzip, Zstd}
+func defaultCompressionFormats() []Compression {
+	if internal.NewZstdWriter != nil {
+		return []Compression{Identity, Gzip, Zstd}
+	} else {
+		return []Compression{Identity, Gzip}
+	}
+}
 
 var gzipPool = sync.Pool{
 	New: func() interface{} {
@@ -138,7 +144,7 @@ func HandlerForTransactional(reg prometheus.TransactionalGatherer, opts HandlerO
 	// Select compression formats to offer based on default or user choice.
 	var compressions []string
 	if !opts.DisableCompression {
-		offers := defaultCompressionFormats
+		offers := defaultCompressionFormats()
 		if len(opts.OfferedCompressions) > 0 {
 			offers = opts.OfferedCompressions
 		}
@@ -466,14 +472,12 @@ func negotiateEncodingWriter(r *http.Request, rw io.Writer, compressions []strin
 
 	switch selected {
 	case "zstd":
-		// TODO(mrueg): Replace klauspost/compress with stdlib implementation once https://github.com/golang/go/issues/62513 is implemented.
-		z, err := zstd.NewWriter(rw, zstd.WithEncoderLevel(zstd.SpeedFastest))
-		if err != nil {
-			return nil, "", func() {}, err
+		if internal.NewZstdWriter == nil {
+			// The content encoding was not implemented yet.
+			return nil, "", func() {}, fmt.Errorf("content compression format not recognized: %s. Valid formats are: %s", selected, defaultCompressionFormats())
 		}
-
-		z.Reset(rw)
-		return z, selected, func() { _ = z.Close() }, nil
+		writer, closeWriter, err := internal.NewZstdWriter(rw)
+		return writer, selected, closeWriter, err
 	case "gzip":
 		gz := gzipPool.Get().(*gzip.Writer)
 		gz.Reset(rw)
@@ -483,6 +487,6 @@ func negotiateEncodingWriter(r *http.Request, rw io.Writer, compressions []strin
 		return rw, selected, func() {}, nil
 	default:
 		// The content encoding was not implemented yet.
-		return nil, "", func() {}, fmt.Errorf("content compression format not recognized: %s. Valid formats are: %s", selected, defaultCompressionFormats)
+		return nil, "", func() {}, fmt.Errorf("content compression format not recognized: %s. Valid formats are: %s", selected, defaultCompressionFormats())
 	}
 }
diff --git a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/internal/compression.go b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/internal/compression.go
new file mode 100644
index 000000000..c5039590f
--- /dev/null
+++ b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/internal/compression.go
@@ -0,0 +1,21 @@
+// Copyright 2025 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package internal
+
+import (
+	"io"
+)
+
+// NewZstdWriter enables zstd write support if non-nil.
+var NewZstdWriter func(rw io.Writer) (_ io.Writer, closeWriter func(), _ error)
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 1ba6fe441..e3a5321aa 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -627,16 +627,9 @@ github.com/json-iterator/go
 github.com/k3a/html2text
 # github.com/klauspost/compress v1.18.0
 ## explicit; go 1.22
-github.com/klauspost/compress
-github.com/klauspost/compress/fse
-github.com/klauspost/compress/huff0
-github.com/klauspost/compress/internal/cpuinfo
 github.com/klauspost/compress/internal/le
 github.com/klauspost/compress/internal/race
-github.com/klauspost/compress/internal/snapref
 github.com/klauspost/compress/s2
-github.com/klauspost/compress/zstd
-github.com/klauspost/compress/zstd/internal/xxhash
 # github.com/klauspost/cpuid/v2 v2.2.10
 ## explicit; go 1.22
 github.com/klauspost/cpuid/v2
@@ -745,13 +738,14 @@ github.com/pquerna/otp
 github.com/pquerna/otp/hotp
 github.com/pquerna/otp/internal
 github.com/pquerna/otp/totp
-# github.com/prometheus/client_golang v1.21.1
-## explicit; go 1.21
+# github.com/prometheus/client_golang v1.22.0
+## explicit; go 1.22
 github.com/prometheus/client_golang/internal/github.com/golang/gddo/httputil
 github.com/prometheus/client_golang/internal/github.com/golang/gddo/httputil/header
 github.com/prometheus/client_golang/prometheus
 github.com/prometheus/client_golang/prometheus/internal
 github.com/prometheus/client_golang/prometheus/promhttp
+github.com/prometheus/client_golang/prometheus/promhttp/internal
 # github.com/prometheus/client_model v0.6.1
 ## explicit; go 1.19
 github.com/prometheus/client_model/go