diff --git a/go.mod b/go.mod
index 69b625f4116..0a83866a571 100644
--- a/go.mod
+++ b/go.mod
@@ -22,7 +22,7 @@ require (
 	github.com/containers/image/v5 v5.17.0
 	github.com/containers/ocicrypt v1.1.5
 	github.com/containers/podman/v3 v3.2.0-rc1.0.20211005134800-8bcc086b1b9d
-	github.com/containers/storage v1.37.0
+	github.com/containers/storage v1.41.0
 	github.com/coreos/go-systemd/v22 v22.3.2
 	github.com/cpuguy83/go-md2man v1.0.10
 	github.com/creack/pty v1.1.18
@@ -104,7 +104,7 @@ require (
 	github.com/cilium/ebpf v0.7.0 // indirect
 	github.com/containerd/console v1.0.3 // indirect
 	github.com/containerd/go-runc v1.0.0 // indirect
-	github.com/containerd/stargz-snapshotter/estargz v0.10.1 // indirect
+	github.com/containerd/stargz-snapshotter/estargz v0.11.4 // indirect
 	github.com/containers/libtrust v0.0.0-20190913040956-14b96171aa3b // indirect
 	github.com/containers/psgo v1.7.1 // indirect
 	github.com/coreos/go-systemd v0.0.0-20190620071333-e64a0ec8b42a // indirect
@@ -158,7 +158,7 @@ require (
 	github.com/jmespath/go-jmespath v0.4.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/kevinburke/ssh_config v1.1.0 // indirect
-	github.com/klauspost/compress v1.14.2 // indirect
+	github.com/klauspost/compress v1.15.4 // indirect
 	github.com/klauspost/pgzip v1.2.5 // indirect
 	github.com/lithammer/dedent v1.1.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
@@ -175,7 +175,7 @@ require (
 	github.com/mmarkdown/mmark v2.0.40+incompatible // indirect
 	github.com/moby/spdystream v0.2.0 // indirect
 	github.com/moby/sys/mount v0.2.0 // indirect
-	github.com/moby/sys/mountinfo v0.6.0 // indirect
+	github.com/moby/sys/mountinfo v0.6.1 // indirect
 	github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
diff --git a/go.sum b/go.sum
index f91d23d5f96..20919f1bde7 100644
--- a/go.sum
+++ b/go.sum
@@ -377,8 +377,8 @@ github.com/containerd/stargz-snapshotter/estargz v0.0.0-20201217071531-2b97b5837
 github.com/containerd/stargz-snapshotter/estargz v0.4.1/go.mod h1:x7Q9dg9QYb4+ELgxmo4gBUeJB0tl5dqH1Sdz0nJU1QM=
 github.com/containerd/stargz-snapshotter/estargz v0.8.0/go.mod h1:mwIwuwb+D8FX2t45Trwi0hmWmZm5VW7zPP/rekwhWQU=
 github.com/containerd/stargz-snapshotter/estargz v0.9.0/go.mod h1:aE5PCyhFMwR8sbrErO5eM2GcvkyXTTJremG883D4qF0=
-github.com/containerd/stargz-snapshotter/estargz v0.10.1 h1:hd1EoVjI2Ax8Cr64tdYqnJ4i4pZU49FkEf5kU8KxQng=
-github.com/containerd/stargz-snapshotter/estargz v0.10.1/go.mod h1:aE5PCyhFMwR8sbrErO5eM2GcvkyXTTJremG883D4qF0=
+github.com/containerd/stargz-snapshotter/estargz v0.11.4 h1:LjrYUZpyOhiSaU7hHrdR82/RBoxfGWSaC0VeSSMXqnk=
+github.com/containerd/stargz-snapshotter/estargz v0.11.4/go.mod h1:7vRJIcImfY8bpifnMjt+HTJoQxASq7T28MYbP15/Nf0=
 github.com/containerd/ttrpc v0.0.0-20190828154514-0e0f228740de/go.mod h1:PvCDdDGpgqzQIzDW1TphrGLssLDZp2GuS+X5DkEJB8o=
 github.com/containerd/ttrpc v0.0.0-20190828172938-92c8520ef9f8/go.mod h1:PvCDdDGpgqzQIzDW1TphrGLssLDZp2GuS+X5DkEJB8o=
 github.com/containerd/ttrpc v0.0.0-20191028202541-4f1b8fe65a5c/go.mod h1:LPm1u0xBw8r8NOKoOdNMeVHSawSsltak+Ihv+etqsE8=
@@ -437,8 +437,9 @@ github.com/containers/storage v1.23.5/go.mod h1:ha26Q6ngehFNhf3AWoXldvAvwI4jFe3E
 github.com/containers/storage v1.24.8/go.mod h1:YC+2pY8SkfEAcZkwycxYbpK8EiRbx5soPPwz9dxe4IQ=
 github.com/containers/storage v1.35.0/go.mod h1:qzYhasQP2/V9D9XdO+vRwkHBhsBO0oznMLzzRDQ8s20=
 github.com/containers/storage v1.36.0/go.mod h1:vbd3SKVQNHdmU5qQI6hTEcKPxnZkGqydG4f6uwrI5a8=
-github.com/containers/storage v1.37.0 h1:HVhDsur6sx889ZIZ1d1kEiOzv3gsr5q0diX2VZmOdSg=
 github.com/containers/storage v1.37.0/go.mod h1:kqeJeS0b7DO2ZT1nVWs0XufrmPFbgV3c+Q/45RlH6r4=
+github.com/containers/storage v1.41.0 h1:IsoAJ1q3s/jfHB7eoiRhvTRTcuV+ywY3CkbbgKtT5Bo=
+github.com/containers/storage v1.41.0/go.mod h1:Pb0l5Sm/89kolX3o2KolKQ5cCHk5vPNpJrhNaLcdS5s=
 github.com/coredns/caddy v1.1.0/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4=
 github.com/coredns/corefile-migration v1.0.14/go.mod h1:XnhgULOEouimnzgn0t4WPuFDN2/PJQcTxdWKC5eXNGE=
 github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk=
@@ -1042,8 +1043,9 @@ github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdY
 github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
 github.com/klauspost/compress v1.13.5/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
 github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
-github.com/klauspost/compress v1.14.2 h1:S0OHlFk/Gbon/yauFJ4FfJJF5V0fc5HbBTJazi28pRw=
-github.com/klauspost/compress v1.14.2/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/klauspost/compress v1.15.4 h1:1kn4/7MepF/CHmYub99/nNX8az0IJjfSOU/jbnTVfqQ=
+github.com/klauspost/compress v1.15.4/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
 github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/pgzip v1.2.4/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
@@ -1200,8 +1202,9 @@ github.com/moby/sys/mountinfo v0.1.3/go.mod h1:w2t2Avltqx8vE7gX5l+QiBKxODu2TX0+S
 github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A=
 github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A=
 github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
-github.com/moby/sys/mountinfo v0.6.0 h1:gUDhXQx58YNrpHlK4nSL+7y2pxFZkUcXqzFDKWdC0Oo=
 github.com/moby/sys/mountinfo v0.6.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
+github.com/moby/sys/mountinfo v0.6.1 h1:+H/KnGEAGRpTrEAqNVQ2AM3SiwMgJUt/TXj+Z8cmCIc=
+github.com/moby/sys/mountinfo v0.6.1/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
 github.com/moby/sys/symlink v0.1.0/go.mod h1:GGDODQmbFOjFsXvfLVn3+ZRxkch54RkSiGqsZeMYowQ=
 github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc=
 github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 h1:dcztxKSvZ4Id8iPpHERQBbIJfabdt4wUm5qy3wOL2Zc=
@@ -1576,6 +1579,7 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
 github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s=
 github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
 github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
diff --git a/vendor/github.com/containerd/stargz-snapshotter/estargz/build.go b/vendor/github.com/containerd/stargz-snapshotter/estargz/build.go
index 708b2668990..0da3efe4c21 100644
--- a/vendor/github.com/containerd/stargz-snapshotter/estargz/build.go
+++ b/vendor/github.com/containerd/stargz-snapshotter/estargz/build.go
@@ -26,9 +26,10 @@ import (
 	"archive/tar"
 	"bytes"
 	"compress/gzip"
+	"context"
+	"errors"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"os"
 	"path"
 	"runtime"
@@ -38,7 +39,6 @@ import (
 	"github.com/containerd/stargz-snapshotter/estargz/errorutil"
 	"github.com/klauspost/compress/zstd"
 	digest "github.com/opencontainers/go-digest"
-	"github.com/pkg/errors"
 	"golang.org/x/sync/errgroup"
 )
 
@@ -48,6 +48,7 @@ type options struct {
 	prioritizedFiles       []string
 	missedPrioritizedFiles *[]string
 	compression            Compression
+	ctx                    context.Context
 }
 
 type Option func(o *options) error
@@ -104,6 +105,14 @@ func WithCompression(compression Compression) Option {
 	}
 }
 
+// WithContext specifies a context that can be used for clean cancellation.
+func WithContext(ctx context.Context) Option {
+	return func(o *options) error {
+		o.ctx = ctx
+		return nil
+	}
+}
+
 // Blob is an eStargz blob.
 type Blob struct {
 	io.ReadCloser
@@ -139,12 +148,29 @@ func Build(tarBlob *io.SectionReader, opt ...Option) (_ *Blob, rErr error) {
 		opts.compression = newGzipCompressionWithLevel(opts.compressionLevel)
 	}
 	layerFiles := newTempFiles()
+	ctx := opts.ctx
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	done := make(chan struct{})
+	defer close(done)
+	go func() {
+		select {
+		case <-done:
+			// nop
+		case <-ctx.Done():
+			layerFiles.CleanupAll()
+		}
+	}()
 	defer func() {
 		if rErr != nil {
 			if err := layerFiles.CleanupAll(); err != nil {
-				rErr = errors.Wrapf(rErr, "failed to cleanup tmp files: %v", err)
+				rErr = fmt.Errorf("failed to cleanup tmp files: %v: %w", err, rErr)
 			}
 		}
+		if cErr := ctx.Err(); cErr != nil {
+			rErr = fmt.Errorf("error from context %q: %w", cErr, rErr)
+		}
 	}()
 	tarBlob, err := decompressBlob(tarBlob, layerFiles)
 	if err != nil {
@@ -307,7 +333,7 @@ func sortEntries(in io.ReaderAt, prioritized []string, missedPrioritized *[]stri
 	// Import tar file.
 	intar, err := importTar(in)
 	if err != nil {
-		return nil, errors.Wrap(err, "failed to sort")
+		return nil, fmt.Errorf("failed to sort: %w", err)
 	}
 
 	// Sort the tar file respecting to the prioritized files list.
@@ -318,7 +344,7 @@ func sortEntries(in io.ReaderAt, prioritized []string, missedPrioritized *[]stri
 				*missedPrioritized = append(*missedPrioritized, l)
 				continue // allow not found
 			}
-			return nil, errors.Wrap(err, "failed to sort tar entries")
+			return nil, fmt.Errorf("failed to sort tar entries: %w", err)
 		}
 	}
 	if len(prioritized) == 0 {
@@ -371,7 +397,7 @@ func importTar(in io.ReaderAt) (*tarFile, error) {
 	tf := &tarFile{}
 	pw, err := newCountReader(in)
 	if err != nil {
-		return nil, errors.Wrap(err, "failed to make position watcher")
+		return nil, fmt.Errorf("failed to make position watcher: %w", err)
 	}
 	tr := tar.NewReader(pw)
 
@@ -383,7 +409,7 @@ func importTar(in io.ReaderAt) (*tarFile, error) {
 			if err == io.EOF {
 				break
 			} else {
-				return nil, errors.Wrap(err, "failed to parse tar file")
+				return nil, fmt.Errorf("failed to parse tar file, %w", err)
 			}
 		}
 		switch cleanEntryName(h.Name) {
@@ -420,7 +446,7 @@ func moveRec(name string, in *tarFile, out *tarFile) error {
 	_, okIn := in.get(name)
 	_, okOut := out.get(name)
 	if !okIn && !okOut {
-		return errors.Wrapf(errNotFound, "file: %q", name)
+		return fmt.Errorf("file: %q: %w", name, errNotFound)
 	}
 
 	parent, _ := path.Split(strings.TrimSuffix(name, "/"))
@@ -506,12 +532,13 @@ func newTempFiles() *tempFiles {
 }
 
 type tempFiles struct {
-	files   []*os.File
-	filesMu sync.Mutex
+	files       []*os.File
+	filesMu     sync.Mutex
+	cleanupOnce sync.Once
 }
 
 func (tf *tempFiles) TempFile(dir, pattern string) (*os.File, error) {
-	f, err := ioutil.TempFile(dir, pattern)
+	f, err := os.CreateTemp(dir, pattern)
 	if err != nil {
 		return nil, err
 	}
@@ -521,7 +548,14 @@ func (tf *tempFiles) TempFile(dir, pattern string) (*os.File, error) {
 	return f, nil
 }
 
-func (tf *tempFiles) CleanupAll() error {
+func (tf *tempFiles) CleanupAll() (err error) {
+	tf.cleanupOnce.Do(func() {
+		err = tf.cleanupAll()
+	})
+	return
+}
+
+func (tf *tempFiles) cleanupAll() error {
 	tf.filesMu.Lock()
 	defer tf.filesMu.Unlock()
 	var allErr []error
diff --git a/vendor/github.com/containerd/stargz-snapshotter/estargz/estargz.go b/vendor/github.com/containerd/stargz-snapshotter/estargz/estargz.go
index e56319545e6..921e59ec6ef 100644
--- a/vendor/github.com/containerd/stargz-snapshotter/estargz/estargz.go
+++ b/vendor/github.com/containerd/stargz-snapshotter/estargz/estargz.go
@@ -27,10 +27,10 @@ import (
 	"bytes"
 	"compress/gzip"
 	"crypto/sha256"
+	"errors"
 	"fmt"
 	"hash"
 	"io"
-	"io/ioutil"
 	"os"
 	"path"
 	"sort"
@@ -40,7 +40,6 @@ import (
 
 	"github.com/containerd/stargz-snapshotter/estargz/errorutil"
 	digest "github.com/opencontainers/go-digest"
-	"github.com/pkg/errors"
 	"github.com/vbatts/tar-split/archive/tar"
 )
 
@@ -107,7 +106,7 @@ type Telemetry struct {
 }
 
 // Open opens a stargz file for reading.
-// The behaviour is configurable using options.
+// The behavior is configurable using options.
 //
 // Note that each entry name is normalized as the path that is relative to root.
 func Open(sr *io.SectionReader, opt ...OpenOption) (*Reader, error) {
@@ -385,8 +384,7 @@ func (r *Reader) Verifiers() (TOCEntryVerifier, error) {
 			if e.Digest != "" {
 				d, err := digest.Parse(e.Digest)
 				if err != nil {
-					return nil, errors.Wrapf(err,
-						"failed to parse regular file digest %q", e.Digest)
+					return nil, fmt.Errorf("failed to parse regular file digest %q: %w", e.Digest, err)
 				}
 				regDigestMap[e.Offset] = d
 			} else {
@@ -401,8 +399,7 @@ func (r *Reader) Verifiers() (TOCEntryVerifier, error) {
 		if e.ChunkDigest != "" {
 			d, err := digest.Parse(e.ChunkDigest)
 			if err != nil {
-				return nil, errors.Wrapf(err,
-					"failed to parse chunk digest %q", e.ChunkDigest)
+				return nil, fmt.Errorf("failed to parse chunk digest %q: %w", e.ChunkDigest, err)
 			}
 			chunkDigestMap[e.Offset] = d
 		} else {
@@ -581,7 +578,7 @@ func (fr *fileReader) ReadAt(p []byte, off int64) (n int, err error) {
 		return 0, fmt.Errorf("fileReader.ReadAt.decompressor.Reader: %v", err)
 	}
 	defer dr.Close()
-	if n, err := io.CopyN(ioutil.Discard, dr, off); n != off || err != nil {
+	if n, err := io.CopyN(io.Discard, dr, off); n != off || err != nil {
 		return 0, fmt.Errorf("discard of %d bytes = %v, %v", off, n, err)
 	}
 	return io.ReadFull(dr, p)
@@ -647,7 +644,7 @@ func Unpack(sr *io.SectionReader, c Decompressor) (io.ReadCloser, error) {
 	}
 	blobPayloadSize, _, _, err := c.ParseFooter(footer)
 	if err != nil {
-		return nil, errors.Wrapf(err, "failed to parse footer")
+		return nil, fmt.Errorf("failed to parse footer: %w", err)
 	}
 	return c.Reader(io.LimitReader(sr, blobPayloadSize))
 }
@@ -935,7 +932,7 @@ func (w *Writer) appendTar(r io.Reader, lossless bool) error {
 			}
 		}
 	}
-	remainDest := ioutil.Discard
+	remainDest := io.Discard
 	if lossless {
 		remainDest = dst // Preserve the remaining bytes in lossless mode
 	}
diff --git a/vendor/github.com/containerd/stargz-snapshotter/estargz/gzip.go b/vendor/github.com/containerd/stargz-snapshotter/estargz/gzip.go
index 7330849cb89..591d7a62e11 100644
--- a/vendor/github.com/containerd/stargz-snapshotter/estargz/gzip.go
+++ b/vendor/github.com/containerd/stargz-snapshotter/estargz/gzip.go
@@ -34,7 +34,6 @@ import (
 	"strconv"
 
 	digest "github.com/opencontainers/go-digest"
-	"github.com/pkg/errors"
 )
 
 type gzipCompression struct {
@@ -150,7 +149,7 @@ func (gz *GzipDecompressor) ParseFooter(p []byte) (blobPayloadSize, tocOffset, t
 	}
 	tocOffset, err = strconv.ParseInt(string(subfield[:16]), 16, 64)
 	if err != nil {
-		return 0, 0, 0, errors.Wrapf(err, "legacy: failed to parse toc offset")
+		return 0, 0, 0, fmt.Errorf("legacy: failed to parse toc offset: %w", err)
 	}
 	return tocOffset, tocOffset, 0, nil
 }
@@ -179,7 +178,7 @@ func (gz *LegacyGzipDecompressor) ParseFooter(p []byte) (blobPayloadSize, tocOff
 	}
 	zr, err := gzip.NewReader(bytes.NewReader(p))
 	if err != nil {
-		return 0, 0, 0, errors.Wrapf(err, "legacy: failed to get footer gzip reader")
+		return 0, 0, 0, fmt.Errorf("legacy: failed to get footer gzip reader: %w", err)
 	}
 	defer zr.Close()
 	extra := zr.Header.Extra
@@ -191,7 +190,7 @@ func (gz *LegacyGzipDecompressor) ParseFooter(p []byte) (blobPayloadSize, tocOff
 	}
 	tocOffset, err = strconv.ParseInt(string(extra[:16]), 16, 64)
 	if err != nil {
-		return 0, 0, 0, errors.Wrapf(err, "legacy: failed to parse toc offset")
+		return 0, 0, 0, fmt.Errorf("legacy: failed to parse toc offset: %w", err)
 	}
 	return tocOffset, tocOffset, 0, nil
 }
diff --git a/vendor/github.com/containerd/stargz-snapshotter/estargz/testutil.go b/vendor/github.com/containerd/stargz-snapshotter/estargz/testutil.go
index 9224e456dde..8f27dfb3ea2 100644
--- a/vendor/github.com/containerd/stargz-snapshotter/estargz/testutil.go
+++ b/vendor/github.com/containerd/stargz-snapshotter/estargz/testutil.go
@@ -28,9 +28,9 @@ import (
 	"compress/gzip"
 	"crypto/sha256"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"os"
 	"reflect"
 	"sort"
@@ -41,7 +41,6 @@ import (
 	"github.com/containerd/stargz-snapshotter/estargz/errorutil"
 	"github.com/klauspost/compress/zstd"
 	digest "github.com/opencontainers/go-digest"
-	"github.com/pkg/errors"
 )
 
 // TestingController is Compression with some helper methods necessary for testing.
@@ -287,11 +286,11 @@ func isSameTarGz(t *testing.T, controller TestingController, a, b []byte) bool {
 			return false
 
 		}
-		aFile, err := ioutil.ReadAll(aTar)
+		aFile, err := io.ReadAll(aTar)
 		if err != nil {
 			t.Fatal("failed to read tar payload of A")
 		}
-		bFile, err := ioutil.ReadAll(bTar)
+		bFile, err := io.ReadAll(bTar)
 		if err != nil {
 			t.Fatal("failed to read tar payload of B")
 		}
@@ -1062,18 +1061,18 @@ func parseStargz(sgz *io.SectionReader, controller TestingController) (decodedJT
 	fSize := controller.FooterSize()
 	footer := make([]byte, fSize)
 	if _, err := sgz.ReadAt(footer, sgz.Size()-fSize); err != nil {
-		return nil, 0, errors.Wrap(err, "error reading footer")
+		return nil, 0, fmt.Errorf("error reading footer: %w", err)
 	}
 	_, tocOffset, _, err := controller.ParseFooter(footer[positive(int64(len(footer))-fSize):])
 	if err != nil {
-		return nil, 0, errors.Wrapf(err, "failed to parse footer")
+		return nil, 0, fmt.Errorf("failed to parse footer: %w", err)
 	}
 
 	// Decode the TOC JSON
 	tocReader := io.NewSectionReader(sgz, tocOffset, sgz.Size()-tocOffset-fSize)
 	decodedJTOC, _, err = controller.ParseTOC(tocReader)
 	if err != nil {
-		return nil, 0, errors.Wrap(err, "failed to parse TOC")
+		return nil, 0, fmt.Errorf("failed to parse TOC: %w", err)
 	}
 	return decodedJTOC, tocOffset, nil
 }
diff --git a/vendor/github.com/containers/storage/.cirrus.yml b/vendor/github.com/containers/storage/.cirrus.yml
index d080d790c1a..fd3d310548d 100644
--- a/vendor/github.com/containers/storage/.cirrus.yml
+++ b/vendor/github.com/containers/storage/.cirrus.yml
@@ -17,17 +17,17 @@ env:
     ####
     #### Cache-image names to test with (double-quotes around names are critical)
     ###
-    FEDORA_NAME: "fedora-34"
-    PRIOR_FEDORA_NAME: "fedora-33"
+    FEDORA_NAME: "fedora-35"
+    PRIOR_FEDORA_NAME: "fedora-34"
     UBUNTU_NAME: "ubuntu-2104"
 
     # GCE project where images live
     IMAGE_PROJECT: "libpod-218412"
     # VM Image built in containers/automation_images
-    _BUILT_IMAGE_SUFFIX: "c6431352024203264"
-    FEDORA_CACHE_IMAGE_NAME: "fedora-${_BUILT_IMAGE_SUFFIX}"
-    PRIOR_FEDORA_CACHE_IMAGE_NAME: "prior-fedora-${_BUILT_IMAGE_SUFFIX}"
-    UBUNTU_CACHE_IMAGE_NAME: "ubuntu-${_BUILT_IMAGE_SUFFIX}"
+    IMAGE_SUFFIX: "c4512539143831552"
+    FEDORA_CACHE_IMAGE_NAME: "fedora-${IMAGE_SUFFIX}"
+    PRIOR_FEDORA_CACHE_IMAGE_NAME: "prior-fedora-${IMAGE_SUFFIX}"
+    UBUNTU_CACHE_IMAGE_NAME: "ubuntu-${IMAGE_SUFFIX}"
 
     ####
     #### Command variables to help avoid duplication
@@ -117,7 +117,7 @@ lint_task:
     env:
         CIRRUS_WORKING_DIR: "/go/src/github.com/containers/storage"
     container:
-        image: golang:1.15
+        image: golang:1.16
     modules_cache:
         fingerprint_script: cat go.sum
         folder: $GOPATH/pkg/mod
@@ -132,7 +132,7 @@ lint_task:
 meta_task:
 
     container:
-        image: "quay.io/libpod/imgts:${_BUILT_IMAGE_SUFFIX}"
+        image: "quay.io/libpod/imgts:${IMAGE_SUFFIX}"
         cpu: 1
         memory: 1
 
@@ -154,7 +154,7 @@ meta_task:
 
 vendor_task:
     container:
-        image: golang:1.15
+        image: golang:1.16
     modules_cache:
         fingerprint_script: cat go.sum
         folder: $GOPATH/pkg/mod
@@ -172,6 +172,6 @@ success_task:
         - meta
         - vendor
     container:
-        image: golang:1.15
+        image: golang:1.16
     clone_script: 'mkdir -p "$CIRRUS_WORKING_DIR"'  # Source code not needed
     script: /bin/true
diff --git a/vendor/github.com/containers/storage/Makefile b/vendor/github.com/containers/storage/Makefile
index dbc1f7c9987..244576d546a 100644
--- a/vendor/github.com/containers/storage/Makefile
+++ b/vendor/github.com/containers/storage/Makefile
@@ -51,13 +51,16 @@ sources := $(wildcard *.go cmd/containers-storage/*.go drivers/*.go drivers/*/*.
 containers-storage: $(sources) ## build using gc on the host
 	$(GO) build $(MOD_VENDOR) -compiler gc $(BUILDFLAGS) ./cmd/containers-storage
 
+codespell:
+	codespell -S Makefile,build,buildah,buildah.spec,imgtype,copy,AUTHORS,bin,vendor,.git,go.sum,CHANGELOG.md,changelog.txt,seccomp.json,.cirrus.yml,"*.xz,*.gz,*.tar,*.tgz,*ico,*.png,*.1,*.5,*.orig,*.rej" -L flate,uint,iff,od,ERRO -w
+
 binary local-binary: containers-storage
 
 local-gccgo: ## build using gccgo on the host
 	GCCGO=$(PWD)/hack/gccgo-wrapper.sh $(GO) build $(MOD_VENDOR) -compiler gccgo $(BUILDFLAGS) -o containers-storage.gccgo ./cmd/containers-storage
 
-local-cross: ## cross build the binaries for arm, darwin, and\nfreebsd
-	@for target in linux/amd64 linux/386 linux/arm linux/arm64 linux/ppc64 linux/ppc64le darwin/amd64 windows/amd64 ; do \
+local-cross: ## cross build the binaries for arm, darwin, and freebsd
+	@for target in linux/amd64 linux/386 linux/arm linux/arm64 linux/ppc64 linux/ppc64le darwin/amd64 windows/amd64 freebsd/amd64 freebsd/arm64 ; do \
 		os=`echo $${target} | cut -f1 -d/` ; \
 		arch=`echo $${target} | cut -f2 -d/` ; \
 		suffix=$${os}.$${arch} ; \
@@ -66,44 +69,44 @@ local-cross: ## cross build the binaries for arm, darwin, and\nfreebsd
 	done
 
 cross: ## cross build the binaries for arm, darwin, and\nfreebsd using VMs
-	$(RUNINVM) make local-$@
+	$(RUNINVM) $(MAKE) local-$@
 
 docs: install.tools ## build the docs on the host
 	$(MAKE) -C docs docs
 
 gccgo: ## build using gccgo using VMs
-	$(RUNINVM) make local-$@
+	$(RUNINVM) $(MAKE) local-$@
 
 test: local-binary ## build the binaries and run the tests using VMs
-	$(RUNINVM) make local-binary local-cross local-test-unit local-test-integration
+	$(RUNINVM) $(MAKE) local-binary local-cross local-test-unit local-test-integration
 
 local-test-unit: local-binary ## run the unit tests on the host (requires\nsuperuser privileges)
 	@$(GO) test $(MOD_VENDOR) $(BUILDFLAGS) $(TESTFLAGS) $(shell $(GO) list ./... | grep -v ^$(PACKAGE)/vendor)
 
 test-unit: local-binary ## run the unit tests using VMs
-	$(RUNINVM) make local-$@
+	$(RUNINVM) $(MAKE) local-$@
 
 local-test-integration: local-binary ## run the integration tests on the host (requires\nsuperuser privileges)
 	@cd tests; ./test_runner.bash
 
 test-integration: local-binary ## run the integration tests using VMs
-	$(RUNINVM) make local-$@
+	$(RUNINVM) $(MAKE) local-$@
 
 local-validate: ## validate DCO and gofmt on the host
 	@./hack/git-validation.sh
 	@./hack/gofmt.sh
 
 validate: ## validate DCO, gofmt, ./pkg/ isolation, golint,\ngo vet and vendor using VMs
-	$(RUNINVM) make local-$@
+	$(RUNINVM) $(MAKE) local-$@
 
 install.tools:
-	make -C tests/tools
+	$(MAKE) -C tests/tools
 
 $(FFJSON):
-	make -C tests/tools
+	$(MAKE) -C tests/tools
 
 install.docs: docs
-	make -C docs install
+	$(MAKE) -C docs install
 
 install: install.docs
 
diff --git a/vendor/github.com/containers/storage/VERSION b/vendor/github.com/containers/storage/VERSION
index bf50e910e62..7d47e599800 100644
--- a/vendor/github.com/containers/storage/VERSION
+++ b/vendor/github.com/containers/storage/VERSION
@@ -1 +1 @@
-1.37.0
+1.41.0
diff --git a/vendor/github.com/containers/storage/Vagrantfile b/vendor/github.com/containers/storage/Vagrantfile
deleted file mode 100644
index c82c1f81bd3..00000000000
--- a/vendor/github.com/containers/storage/Vagrantfile
+++ /dev/null
@@ -1,25 +0,0 @@
-# -*- mode: ruby -*-
-# vi: set ft=ruby :
-#
-#  The fedora/28-cloud-base and debian/jessie64 boxes are also available for
-#  the "virtualbox" provider.  Set the VAGRANT_PROVIDER environment variable to
-#  "virtualbox" to use them instead.
-#
-Vagrant.configure("2") do |config|
-  config.vm.define "fedora" do |c|
-    c.vm.box = "fedora/28-cloud-base"
-    c.vm.synced_folder ".", "/vagrant", type: "rsync",
-      rsync__exclude: "bundles", rsync__args: ["-vadz", "--delete"]
-    c.vm.provision "shell", inline: <<-SHELL
-      sudo /vagrant/vagrant/provision.sh
-    SHELL
-  end
-  config.vm.define "debian" do |c|
-    c.vm.box = "debian/jessie64"
-    c.vm.synced_folder ".", "/vagrant", type: "rsync",
-      rsync__exclude: "bundles", rsync__args: ["-vadz", "--delete"]
-    c.vm.provision "shell", inline: <<-SHELL
-      sudo /vagrant/vagrant/provision.sh
-    SHELL
-  end
-end
diff --git a/vendor/github.com/containers/storage/containers.go b/vendor/github.com/containers/storage/containers.go
index b4f773f2b73..a8b20f03a01 100644
--- a/vendor/github.com/containers/storage/containers.go
+++ b/vendor/github.com/containers/storage/containers.go
@@ -84,8 +84,17 @@ type ContainerStore interface {
 
 	// SetNames updates the list of names associated with the container
 	// with the specified ID.
+	// Deprecated: Prone to race conditions; use `AddNames` and `RemoveNames` instead.
 	SetNames(id string, names []string) error
 
+	// AddNames adds the supplied values to the list of names associated with the container with
+	// the specified id.
+	AddNames(id string, names []string) error
+
+	// RemoveNames removes the supplied values from the list of names associated with the container with
+	// the specified id.
+	RemoveNames(id string, names []string) error
+
 	// Get retrieves information about a container given an ID or name.
 	Get(id string) (*Container, error)
 
@@ -324,6 +333,12 @@ func (r *containerStore) Create(id string, names []string, image, layer, metadat
 				fmt.Sprintf("the container name \"%s\" is already in use by \"%s\". You have to remove that container to be able to reuse that name.", name, r.byname[name].ID))
 		}
 	}
+	if err := hasOverlappingRanges(options.UIDMap); err != nil {
+		return nil, err
+	}
+	if err := hasOverlappingRanges(options.GIDMap); err != nil {
+		return nil, err
+	}
 	if err == nil {
 		container = &Container{
 			ID:             id,
@@ -371,22 +386,40 @@ func (r *containerStore) removeName(container *Container, name string) {
 	container.Names = stringSliceWithoutValue(container.Names, name)
 }
 
+// Deprecated: Prone to race conditions; use `AddNames` and `RemoveNames` instead.
 func (r *containerStore) SetNames(id string, names []string) error {
-	names = dedupeNames(names)
-	if container, ok := r.lookup(id); ok {
-		for _, name := range container.Names {
-			delete(r.byname, name)
-		}
-		for _, name := range names {
-			if otherContainer, ok := r.byname[name]; ok {
-				r.removeName(otherContainer, name)
-			}
-			r.byname[name] = container
+	return r.updateNames(id, names, setNames)
+}
+
+func (r *containerStore) AddNames(id string, names []string) error {
+	return r.updateNames(id, names, addNames)
+}
+
+func (r *containerStore) RemoveNames(id string, names []string) error {
+	return r.updateNames(id, names, removeNames)
+}
+
+func (r *containerStore) updateNames(id string, names []string, op updateNameOperation) error {
+	container, ok := r.lookup(id)
+	if !ok {
+		return ErrContainerUnknown
+	}
+	oldNames := container.Names
+	names, err := applyNameOperation(oldNames, names, op)
+	if err != nil {
+		return err
+	}
+	for _, name := range oldNames {
+		delete(r.byname, name)
+	}
+	for _, name := range names {
+		if otherContainer, ok := r.byname[name]; ok {
+			r.removeName(otherContainer, name)
 		}
-		container.Names = names
-		return r.Save()
+		r.byname[name] = container
 	}
-	return ErrContainerUnknown
+	container.Names = names
+	return r.Save()
 }
 
 func (r *containerStore) Delete(id string) error {
diff --git a/vendor/github.com/containers/storage/drivers/aufs/aufs.go b/vendor/github.com/containers/storage/drivers/aufs/aufs.go
index a566fbffa0f..e66613c098a 100644
--- a/vendor/github.com/containers/storage/drivers/aufs/aufs.go
+++ b/vendor/github.com/containers/storage/drivers/aufs/aufs.go
@@ -1,3 +1,4 @@
+//go:build linux
 // +build linux
 
 /*
@@ -26,6 +27,7 @@ import (
 	"bufio"
 	"fmt"
 	"io"
+	"io/fs"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -649,11 +651,11 @@ func (a *Driver) mounted(mountpoint string) (bool, error) {
 // Cleanup aufs and unmount all mountpoints
 func (a *Driver) Cleanup() error {
 	var dirs []string
-	if err := filepath.Walk(a.mntPath(), func(path string, info os.FileInfo, err error) error {
+	if err := filepath.WalkDir(a.mntPath(), func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			return err
 		}
-		if !info.IsDir() {
+		if !d.IsDir() {
 			return nil
 		}
 		dirs = append(dirs, path)
diff --git a/vendor/github.com/containers/storage/drivers/btrfs/btrfs.go b/vendor/github.com/containers/storage/drivers/btrfs/btrfs.go
index 3903b1dddd9..339aa0d3809 100644
--- a/vendor/github.com/containers/storage/drivers/btrfs/btrfs.go
+++ b/vendor/github.com/containers/storage/drivers/btrfs/btrfs.go
@@ -1,3 +1,4 @@
+//go:build linux && cgo
 // +build linux,cgo
 
 package btrfs
@@ -16,6 +17,7 @@ import "C"
 
 import (
 	"fmt"
+	"io/fs"
 	"io/ioutil"
 	"math"
 	"os"
@@ -256,7 +258,7 @@ func subvolDelete(dirpath, name string, quotaEnabled bool) error {
 	var args C.struct_btrfs_ioctl_vol_args
 
 	// walk the btrfs subvolumes
-	walkSubvolumes := func(p string, f os.FileInfo, err error) error {
+	walkSubvolumes := func(p string, d fs.DirEntry, err error) error {
 		if err != nil {
 			if os.IsNotExist(err) && p != fullPath {
 				// missing most likely because the path was a subvolume that got removed in the previous iteration
@@ -267,20 +269,20 @@ func subvolDelete(dirpath, name string, quotaEnabled bool) error {
 		}
 		// we want to check children only so skip itself
 		// it will be removed after the filepath walk anyways
-		if f.IsDir() && p != fullPath {
+		if d.IsDir() && p != fullPath {
 			sv, err := isSubvolume(p)
 			if err != nil {
 				return fmt.Errorf("Failed to test if %s is a btrfs subvolume: %v", p, err)
 			}
 			if sv {
-				if err := subvolDelete(path.Dir(p), f.Name(), quotaEnabled); err != nil {
+				if err := subvolDelete(path.Dir(p), d.Name(), quotaEnabled); err != nil {
 					return fmt.Errorf("Failed to destroy btrfs child subvolume (%s) of parent (%s): %v", p, dirpath, err)
 				}
 			}
 		}
 		return nil
 	}
-	if err := filepath.Walk(path.Join(dirpath, name), walkSubvolumes); err != nil {
+	if err := filepath.WalkDir(path.Join(dirpath, name), walkSubvolumes); err != nil {
 		return fmt.Errorf("Recursively walking subvolumes for %s failed: %v", dirpath, err)
 	}
 
diff --git a/vendor/github.com/containers/storage/drivers/chown.go b/vendor/github.com/containers/storage/drivers/chown.go
index 63bfd2d136f..2db6764c91b 100644
--- a/vendor/github.com/containers/storage/drivers/chown.go
+++ b/vendor/github.com/containers/storage/drivers/chown.go
@@ -50,11 +50,14 @@ func chownByMapsMain() {
 	if len(toHost.UIDs()) == 0 && len(toHost.GIDs()) == 0 {
 		toHost = nil
 	}
+
+	chowner := newLChowner()
+
 	chown := func(path string, info os.FileInfo, _ error) error {
 		if path == "." {
 			return nil
 		}
-		return platformLChown(path, info, toHost, toContainer)
+		return chowner.LChown(path, info, toHost, toContainer)
 	}
 	if err := pwalk.Walk(".", chown); err != nil {
 		fmt.Fprintf(os.Stderr, "error during chown: %v", err)
diff --git a/vendor/github.com/containers/storage/drivers/chown_unix.go b/vendor/github.com/containers/storage/drivers/chown_unix.go
index 0387adfc123..c598b936d64 100644
--- a/vendor/github.com/containers/storage/drivers/chown_unix.go
+++ b/vendor/github.com/containers/storage/drivers/chown_unix.go
@@ -1,3 +1,4 @@
+//go:build !windows
 // +build !windows
 
 package graphdriver
@@ -6,17 +7,50 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"sync"
 	"syscall"
 
 	"github.com/containers/storage/pkg/idtools"
 	"github.com/containers/storage/pkg/system"
 )
 
-func platformLChown(path string, info os.FileInfo, toHost, toContainer *idtools.IDMappings) error {
+type inode struct {
+	Dev uint64
+	Ino uint64
+}
+
+type platformChowner struct {
+	mutex  sync.Mutex
+	inodes map[inode]bool
+}
+
+func newLChowner() *platformChowner {
+	return &platformChowner{
+		inodes: make(map[inode]bool),
+	}
+}
+
+func (c *platformChowner) LChown(path string, info os.FileInfo, toHost, toContainer *idtools.IDMappings) error {
 	st, ok := info.Sys().(*syscall.Stat_t)
 	if !ok {
 		return nil
 	}
+
+	i := inode{
+		Dev: uint64(st.Dev),
+		Ino: uint64(st.Ino),
+	}
+	c.mutex.Lock()
+	_, found := c.inodes[i]
+	if !found {
+		c.inodes[i] = true
+	}
+	c.mutex.Unlock()
+
+	if found {
+		return nil
+	}
+
 	// Map an on-disk UID/GID pair from host to container
 	// using the first map, then back to the host using the
 	// second map.  Skip that first step if they're 0, to
@@ -42,7 +76,7 @@ func platformLChown(path string, info os.FileInfo, toHost, toContainer *idtools.
 			UID: uid,
 			GID: gid,
 		}
-		mappedPair, err := toHost.ToHost(pair)
+		mappedPair, err := toHost.ToHostOverflow(pair)
 		if err != nil {
 			return fmt.Errorf("error mapping container ID pair %#v for %q to host: %v", pair, path, err)
 		}
@@ -50,7 +84,7 @@ func platformLChown(path string, info os.FileInfo, toHost, toContainer *idtools.
 	}
 	if uid != int(st.Uid) || gid != int(st.Gid) {
 		cap, err := system.Lgetxattr(path, "security.capability")
-		if err != nil && !errors.Is(err, system.EOPNOTSUPP) && err != system.ErrNotSupportedPlatform {
+		if err != nil && !errors.Is(err, system.EOPNOTSUPP) && !errors.Is(err, system.EOVERFLOW) && err != system.ErrNotSupportedPlatform {
 			return fmt.Errorf("%s: %v", os.Args[0], err)
 		}
 
diff --git a/vendor/github.com/containers/storage/drivers/chown_windows.go b/vendor/github.com/containers/storage/drivers/chown_windows.go
index 31bd5bb52dd..1845a4e086c 100644
--- a/vendor/github.com/containers/storage/drivers/chown_windows.go
+++ b/vendor/github.com/containers/storage/drivers/chown_windows.go
@@ -1,3 +1,4 @@
+//go:build windows
 // +build windows
 
 package graphdriver
@@ -9,6 +10,13 @@ import (
 	"github.com/containers/storage/pkg/idtools"
 )
 
-func platformLChown(path string, info os.FileInfo, toHost, toContainer *idtools.IDMappings) error {
+type platformChowner struct {
+}
+
+func newLChowner() *platformChowner {
+	return &platformChowner{}
+}
+
+func (c *platformChowner) LChown(path string, info os.FileInfo, toHost, toContainer *idtools.IDMappings) error {
 	return &os.PathError{"lchown", path, syscall.EWINDOWS}
 }
diff --git a/vendor/github.com/containers/storage/drivers/devmapper/deviceset.go b/vendor/github.com/containers/storage/drivers/devmapper/deviceset.go
index c5168bfdd25..e604b7e3186 100644
--- a/vendor/github.com/containers/storage/drivers/devmapper/deviceset.go
+++ b/vendor/github.com/containers/storage/drivers/devmapper/deviceset.go
@@ -1,3 +1,4 @@
+//go:build linux && cgo
 // +build linux,cgo
 
 package devmapper
@@ -6,6 +7,7 @@ import (
 	"bufio"
 	"fmt"
 	"io"
+	"io/fs"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -419,40 +421,35 @@ func (devices *DeviceSet) constructDeviceIDMap() {
 	}
 }
 
-func (devices *DeviceSet) deviceFileWalkFunction(path string, finfo os.FileInfo) error {
+func (devices *DeviceSet) deviceFileWalkFunction(path string, name string) error {
 
 	// Skip some of the meta files which are not device files.
-	if strings.HasSuffix(finfo.Name(), ".migrated") {
+	if strings.HasSuffix(name, ".migrated") {
 		logrus.Debugf("devmapper: Skipping file %s", path)
 		return nil
 	}
 
-	if strings.HasPrefix(finfo.Name(), ".") {
+	if strings.HasPrefix(name, ".") {
 		logrus.Debugf("devmapper: Skipping file %s", path)
 		return nil
 	}
 
-	if finfo.Name() == deviceSetMetaFile {
+	if name == deviceSetMetaFile {
 		logrus.Debugf("devmapper: Skipping file %s", path)
 		return nil
 	}
 
-	if finfo.Name() == transactionMetaFile {
+	if name == transactionMetaFile {
 		logrus.Debugf("devmapper: Skipping file %s", path)
 		return nil
 	}
 
 	logrus.Debugf("devmapper: Loading data for file %s", path)
 
-	hash := finfo.Name()
-	if hash == base {
-		hash = ""
-	}
-
 	// Include deleted devices also as cleanup delete device logic
 	// will go through it and see if there are any deleted devices.
-	if _, err := devices.lookupDevice(hash); err != nil {
-		return fmt.Errorf("devmapper: Error looking up device %s:%v", hash, err)
+	if _, err := devices.lookupDevice(name); err != nil {
+		return fmt.Errorf("devmapper: Error looking up device %s:%v", name, err)
 	}
 
 	return nil
@@ -462,21 +459,21 @@ func (devices *DeviceSet) loadDeviceFilesOnStart() error {
 	logrus.Debug("devmapper: loadDeviceFilesOnStart()")
 	defer logrus.Debug("devmapper: loadDeviceFilesOnStart() END")
 
-	var scan = func(path string, info os.FileInfo, err error) error {
+	var scan = func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			logrus.Debugf("devmapper: Can't walk the file %s", path)
 			return nil
 		}
 
 		// Skip any directories
-		if info.IsDir() {
+		if d.IsDir() {
 			return nil
 		}
 
-		return devices.deviceFileWalkFunction(path, info)
+		return devices.deviceFileWalkFunction(path, d.Name())
 	}
 
-	return filepath.Walk(devices.metadataDir(), scan)
+	return filepath.WalkDir(devices.metadataDir(), scan)
 }
 
 // Should be called with devices.Lock() held.
diff --git a/vendor/github.com/containers/storage/drivers/driver_freebsd.go b/vendor/github.com/containers/storage/drivers/driver_freebsd.go
index e1320ee07f6..143cccf92ec 100644
--- a/vendor/github.com/containers/storage/drivers/driver_freebsd.go
+++ b/vendor/github.com/containers/storage/drivers/driver_freebsd.go
@@ -2,15 +2,43 @@ package graphdriver
 
 import (
 	"golang.org/x/sys/unix"
+
+	"github.com/containers/storage/pkg/mount"
+)
+
+const (
+	// FsMagicZfs filesystem id for Zfs
+	FsMagicZfs = FsMagic(0x2fc12fc1)
 )
 
 var (
 	// Slice of drivers that should be used in an order
 	priority = []string{
 		"zfs",
+		"vfs",
+	}
+
+	// FsNames maps filesystem id to name of the filesystem.
+	FsNames = map[FsMagic]string{
+		FsMagicZfs: "zfs",
 	}
 )
 
+// NewDefaultChecker returns a Checker that reports whether the specified
+// path is mounted.
+// On FreeBSD it delegates to mount.Mounted and ignores any error it returns.
+func NewDefaultChecker() Checker {
+	return &defaultChecker{}
+}
+
+type defaultChecker struct {
+}
+
+func (c *defaultChecker) IsMounted(path string) bool {
+	m, _ := mount.Mounted(path)
+	return m
+}
+
 // Mounted checks if the given path is mounted as the fs type
 func Mounted(fsType FsMagic, mountPath string) (bool, error) {
 	var buf unix.Statfs_t
diff --git a/vendor/github.com/containers/storage/drivers/fsdiff.go b/vendor/github.com/containers/storage/drivers/fsdiff.go
index a534630df08..b7e681ace45 100644
--- a/vendor/github.com/containers/storage/drivers/fsdiff.go
+++ b/vendor/github.com/containers/storage/drivers/fsdiff.go
@@ -138,6 +138,7 @@ func (gdw *NaiveDiffDriver) Changes(id string, idMappings *idtools.IDMappings, p
 	if parent != "" {
 		options := MountOpts{
 			MountLabel: mountLabel,
+			Options:    []string{"ro"},
 		}
 		parentFs, err = driver.Get(parent, options)
 		if err != nil {
diff --git a/vendor/github.com/containers/storage/drivers/overlay/check.go b/vendor/github.com/containers/storage/drivers/overlay/check.go
index 44b3515a854..48fb7a550fa 100644
--- a/vendor/github.com/containers/storage/drivers/overlay/check.go
+++ b/vendor/github.com/containers/storage/drivers/overlay/check.go
@@ -1,3 +1,4 @@
+//go:build linux
 // +build linux
 
 package overlay
@@ -11,6 +12,7 @@ import (
 	"syscall"
 
 	"github.com/containers/storage/pkg/archive"
+	"github.com/containers/storage/pkg/idtools"
 	"github.com/containers/storage/pkg/ioutils"
 	"github.com/containers/storage/pkg/mount"
 	"github.com/containers/storage/pkg/system"
@@ -218,3 +220,55 @@ func doesVolatile(d string) (bool, error) {
 	}()
 	return true, nil
 }
+
+// supportsIdmappedLowerLayers checks if the kernel supports mounting overlay on top of
+// an idmapped lower layer.
+func supportsIdmappedLowerLayers(home string) (bool, error) {
+	layerDir, err := ioutil.TempDir(home, "compat")
+	if err != nil {
+		return false, err
+	}
+	defer func() {
+		_ = os.RemoveAll(layerDir)
+	}()
+
+	mergedDir := filepath.Join(layerDir, "merged")
+	lowerDir := filepath.Join(layerDir, "lower")
+	lowerMappedDir := filepath.Join(layerDir, "lower-mapped")
+	upperDir := filepath.Join(layerDir, "upper")
+	workDir := filepath.Join(layerDir, "work")
+
+	_ = idtools.MkdirAs(mergedDir, 0700, 0, 0)
+	_ = idtools.MkdirAs(lowerDir, 0700, 0, 0)
+	_ = idtools.MkdirAs(lowerMappedDir, 0700, 0, 0)
+	_ = idtools.MkdirAs(upperDir, 0700, 0, 0)
+	_ = idtools.MkdirAs(workDir, 0700, 0, 0)
+
+	idmap := []idtools.IDMap{
+		{
+			ContainerID: 0,
+			HostID:      0,
+			Size:        1,
+		},
+	}
+	pid, cleanupFunc, err := createUsernsProcess(idmap, idmap)
+	if err != nil {
+		return false, err
+	}
+	defer cleanupFunc()
+
+	if err := createIDMappedMount(lowerDir, lowerMappedDir, int(pid)); err != nil {
+		return false, errors.Wrapf(err, "create mapped mount")
+	}
+	defer unix.Unmount(lowerMappedDir, unix.MNT_DETACH)
+
+	opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerMappedDir, upperDir, workDir)
+	flags := uintptr(0)
+	if err := unix.Mount("overlay", mergedDir, "overlay", flags, opts); err != nil {
+		return false, err
+	}
+	defer func() {
+		_ = unix.Unmount(mergedDir, unix.MNT_DETACH)
+	}()
+	return true, nil
+}
diff --git a/vendor/github.com/containers/storage/drivers/overlay/check_115.go b/vendor/github.com/containers/storage/drivers/overlay/check_115.go
deleted file mode 100644
index 9ad1b863d8d..00000000000
--- a/vendor/github.com/containers/storage/drivers/overlay/check_115.go
+++ /dev/null
@@ -1,42 +0,0 @@
-// +build !go1.16
-
-package overlay
-
-import (
-	"os"
-	"path/filepath"
-	"strings"
-
-	"github.com/containers/storage/pkg/archive"
-	"github.com/containers/storage/pkg/system"
-)
-
-func scanForMountProgramIndicators(home string) (detected bool, err error) {
-	err = filepath.Walk(home, func(path string, info os.FileInfo, err error) error {
-		if detected {
-			return filepath.SkipDir
-		}
-		if err != nil {
-			return err
-		}
-		basename := filepath.Base(path)
-		if strings.HasPrefix(basename, archive.WhiteoutPrefix) {
-			detected = true
-			return filepath.SkipDir
-		}
-		if info.IsDir() {
-			xattrs, err := system.Llistxattr(path)
-			if err != nil {
-				return err
-			}
-			for _, xattr := range xattrs {
-				if strings.HasPrefix(xattr, "user.fuseoverlayfs.") || strings.HasPrefix(xattr, "user.containers.") {
-					detected = true
-					return filepath.SkipDir
-				}
-			}
-		}
-		return nil
-	})
-	return detected, err
-}
diff --git a/vendor/github.com/containers/storage/drivers/overlay/idmapped_utils.go b/vendor/github.com/containers/storage/drivers/overlay/idmapped_utils.go
new file mode 100644
index 00000000000..2af33a6fce0
--- /dev/null
+++ b/vendor/github.com/containers/storage/drivers/overlay/idmapped_utils.go
@@ -0,0 +1,160 @@
+//go:build linux
+// +build linux
+
+package overlay
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"syscall"
+	"unsafe"
+
+	"github.com/containers/storage/pkg/idtools"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+type attr struct {
+	attrSet     uint64
+	attrClr     uint64
+	propagation uint64
+	userNs      uint64
+}
+
+const (
+	// _MOUNT_ATTR_IDMAP - Idmap mount to @userns_fd in struct mount_attr
+	_MOUNT_ATTR_IDMAP = 0x00100000 //nolint:golint
+
+	// _OPEN_TREE_CLONE - Clone the source path mount
+	_OPEN_TREE_CLONE = 0x00000001 //nolint:golint
+
+	// _MOVE_MOUNT_F_EMPTY_PATH - Move the path referenced by the fd
+	_MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 //nolint:golint
+)
+
+// openTree wraps the open_tree(2) syscall for path with flags; it returns the new fd, or the errno as err on failure.
+func openTree(path string, flags int) (fd int, err error) {
+	var _p0 *byte
+
+	if _p0, err = syscall.BytePtrFromString(path); err != nil {
+		return 0, err
+	}
+
+	r, _, e1 := syscall.Syscall6(uintptr(unix.SYS_OPEN_TREE), uintptr(0), uintptr(unsafe.Pointer(_p0)),
+		uintptr(flags), 0, 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return int(r), err // propagate the errno; returning nil here hid syscall failures from callers
+}
+
+// moveMount is a wrapper for the move_mount syscall.
+func moveMount(fdTree int, target string) (err error) {
+	var _p0, _p1 *byte
+
+	empty := ""
+
+	if _p0, err = syscall.BytePtrFromString(target); err != nil {
+		return err
+	}
+	if _p1, err = syscall.BytePtrFromString(empty); err != nil {
+		return err
+	}
+
+	flags := _MOVE_MOUNT_F_EMPTY_PATH
+
+	_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOVE_MOUNT),
+		uintptr(fdTree), uintptr(unsafe.Pointer(_p1)),
+		0, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
+
+// mountSetAttr is a wrapper for the mount_setattr syscall
+func mountSetAttr(dfd int, path string, flags uint, attr *attr, size uint) (err error) {
+	var _p0 *byte
+
+	if _p0, err = syscall.BytePtrFromString(path); err != nil {
+		return err
+	}
+
+	_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOUNT_SETATTR), uintptr(dfd), uintptr(unsafe.Pointer(_p0)),
+		uintptr(flags), uintptr(unsafe.Pointer(attr)), uintptr(size), 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
+
+// createIDMappedMount creates an ID-mapped bind mount from SOURCE to TARGET using the user namespace
+// of the process PID.
+func createIDMappedMount(source, target string, pid int) error {
+	path := fmt.Sprintf("/proc/%d/ns/user", pid)
+	userNsFile, err := os.Open(path)
+	if err != nil {
+		return errors.Wrapf(err, "unable to get user ns file descriptor for %q", path)
+	}
+
+	var attr attr
+	attr.attrSet = _MOUNT_ATTR_IDMAP
+	attr.attrClr = 0
+	attr.propagation = 0
+	attr.userNs = uint64(userNsFile.Fd())
+
+	defer userNsFile.Close()
+
+	targetDirFd, err := openTree(source, _OPEN_TREE_CLONE|unix.AT_RECURSIVE)
+	if err != nil {
+		return err
+	}
+	defer unix.Close(targetDirFd)
+
+	if err := mountSetAttr(targetDirFd, "", unix.AT_EMPTY_PATH|unix.AT_RECURSIVE,
+		&attr, uint(unsafe.Sizeof(attr))); err != nil {
+		return err
+	}
+	if err := os.Mkdir(target, 0700); err != nil && !os.IsExist(err) {
+		return err
+	}
+	return moveMount(targetDirFd, target)
+}
+
+// createUsernsProcess forks the current process into a new user namespace and writes the specified
+// mappings.  It returns the pid of the new process and a cleanup function that kills and reaps it.
+func createUsernsProcess(uidMaps []idtools.IDMap, gidMaps []idtools.IDMap) (int, func(), error) {
+	pid, _, err := syscall.Syscall6(uintptr(unix.SYS_CLONE), unix.CLONE_NEWUSER|uintptr(unix.SIGCHLD), 0, 0, 0, 0, 0)
+	if err != 0 {
+		return -1, nil, err
+	}
+	if pid == 0 {
+		_ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
+		// just wait for the SIGKILL
+		for {
+			syscall.Pause()
+		}
+	}
+	cleanupFunc := func() {
+		unix.Kill(int(pid), unix.SIGKILL)
+		_, _ = unix.Wait4(int(pid), nil, 0, nil)
+	}
+	writeMappings := func(fname string, idmap []idtools.IDMap) error {
+		mappings := ""
+		for _, m := range idmap {
+			mappings = mappings + fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size)
+		}
+		return ioutil.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600)
+	}
+	if err := writeMappings("uid_map", uidMaps); err != nil {
+		cleanupFunc()
+		return -1, nil, err
+	}
+	if err := writeMappings("gid_map", gidMaps); err != nil {
+		cleanupFunc()
+		return -1, nil, err
+	}
+
+	return int(pid), cleanupFunc, nil
+}
diff --git a/vendor/github.com/containers/storage/drivers/overlay/overlay.go b/vendor/github.com/containers/storage/drivers/overlay/overlay.go
index 1efe7316d3e..8600ee68552 100644
--- a/vendor/github.com/containers/storage/drivers/overlay/overlay.go
+++ b/vendor/github.com/containers/storage/drivers/overlay/overlay.go
@@ -1,3 +1,4 @@
+//go:build linux
 // +build linux
 
 package overlay
@@ -25,7 +26,6 @@ import (
 	"github.com/containers/storage/pkg/directory"
 	"github.com/containers/storage/pkg/fsutils"
 	"github.com/containers/storage/pkg/idtools"
-	"github.com/containers/storage/pkg/locker"
 	"github.com/containers/storage/pkg/mount"
 	"github.com/containers/storage/pkg/parsers"
 	"github.com/containers/storage/pkg/system"
@@ -38,7 +38,6 @@ import (
 	"github.com/opencontainers/selinux/go-selinux/label"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
-	"github.com/vbatts/tar-split/tar/storage"
 	"golang.org/x/sys/unix"
 )
 
@@ -119,7 +118,8 @@ type Driver struct {
 	supportsDType    bool
 	supportsVolatile *bool
 	usingMetacopy    bool
-	locker           *locker.Locker
+
+	supportsIDMappedMounts *bool
 }
 
 type additionalLayerStore struct {
@@ -155,6 +155,15 @@ func hasMetacopyOption(opts []string) bool {
 	return false
 }
 
+func stripOption(opts []string, option string) []string { // returns opts without any element equal to option
+	for i, s := range opts {
+		if s == option {
+			return stripOption(append(append([]string{}, opts[:i]...), opts[i+1:]...), option) // copy first: never shift elements in the caller's backing array
+		}
+	}
+	return opts
+}
+
 func hasVolatileOption(opts []string) bool {
 	for _, s := range opts {
 		if s == "volatile" {
@@ -195,6 +204,30 @@ func checkSupportVolatile(home, runhome string) (bool, error) {
 	return usingVolatile, nil
 }
 
+// checkAndRecordIDMappedSupport checks and stores if the kernel supports mounting overlay on top of an
+// idmapped lower layer.
+func checkAndRecordIDMappedSupport(home, runhome string) (bool, error) {
+	if os.Geteuid() != 0 {
+		return false, nil
+	}
+
+	feature := "idmapped-lower-dir"
+	overlayCacheResult, overlayCacheText, err := cachedFeatureCheck(runhome, feature)
+	if err == nil {
+		if overlayCacheResult {
+			logrus.Debugf("Cached value indicated that idmapped mounts for overlay are supported")
+			return true, nil
+		}
+		logrus.Debugf("Cached value indicated that idmapped mounts for overlay are not supported")
+		return false, errors.New(overlayCacheText)
+	}
+	supportsIDMappedMounts, err := supportsIdmappedLowerLayers(home)
+	if err2 := cachedFeatureRecord(runhome, feature, supportsIDMappedMounts, ""); err2 != nil {
+		return false, errors.Wrap(err2, "recording overlay idmapped mounts support status")
+	}
+	return supportsIDMappedMounts, err
+}
+
 func checkAndRecordOverlaySupport(fsMagic graphdriver.FsMagic, home, runhome string) (bool, error) {
 	var supportsDType bool
 
@@ -282,6 +315,31 @@ func Init(home string, options graphdriver.Options) (graphdriver.Driver, error)
 		backingFs = fsName
 	}
 
+	runhome := filepath.Join(options.RunRoot, filepath.Base(home))
+	rootUID, rootGID, err := idtools.GetRootUIDGID(options.UIDMaps, options.GIDMaps)
+	if err != nil {
+		return nil, err
+	}
+
+	// Create the driver home dir
+	if err := idtools.MkdirAllAs(path.Join(home, linkDir), 0700, rootUID, rootGID); err != nil {
+		return nil, err
+	}
+
+	if err := idtools.MkdirAllAs(runhome, 0700, rootUID, rootGID); err != nil {
+		return nil, err
+	}
+
+	if opts.mountProgram == "" {
+		if supported, err := SupportsNativeOverlay(home, runhome); err != nil {
+			return nil, err
+		} else if !supported {
+			if path, err := exec.LookPath("fuse-overlayfs"); err == nil {
+				opts.mountProgram = path
+			}
+		}
+	}
+
 	if opts.mountProgram != "" {
 		if unshare.IsRootless() && isNetworkFileSystem(fsMagic) && opts.forceMask == nil {
 			m := os.FileMode(0700)
@@ -306,20 +364,6 @@ func Init(home string, options graphdriver.Options) (graphdriver.Driver, error)
 		}
 	}
 
-	rootUID, rootGID, err := idtools.GetRootUIDGID(options.UIDMaps, options.GIDMaps)
-	if err != nil {
-		return nil, err
-	}
-
-	// Create the driver home dir
-	if err := idtools.MkdirAllAs(path.Join(home, linkDir), 0700, rootUID, rootGID); err != nil {
-		return nil, err
-	}
-	runhome := filepath.Join(options.RunRoot, filepath.Base(home))
-	if err := idtools.MkdirAllAs(runhome, 0700, rootUID, rootGID); err != nil {
-		return nil, err
-	}
-
 	var usingMetacopy bool
 	var supportsDType bool
 	var supportsVolatile *bool
@@ -380,7 +424,6 @@ func Init(home string, options graphdriver.Options) (graphdriver.Driver, error)
 		supportsDType:    supportsDType,
 		usingMetacopy:    usingMetacopy,
 		supportsVolatile: supportsVolatile,
-		locker:           locker.New(),
 		options:          *opts,
 	}
 
@@ -559,14 +602,11 @@ func cachedFeatureRecord(runhome, feature string, supported bool, text string) (
 	return err
 }
 
-func SupportsNativeOverlay(graphroot, rundir string) (bool, error) {
-	if os.Geteuid() != 0 || graphroot == "" || rundir == "" {
+func SupportsNativeOverlay(home, runhome string) (bool, error) {
+	if os.Geteuid() != 0 || home == "" || runhome == "" {
 		return false, nil
 	}
 
-	home := filepath.Join(graphroot, "overlay")
-	runhome := filepath.Join(rundir, "overlay")
-
 	var contents string
 	flagContent, err := ioutil.ReadFile(getMountProgramFlagFile(home))
 	if err == nil {
@@ -881,11 +921,18 @@ func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts, disable
 	if err != nil {
 		return err
 	}
+
+	idPair := idtools.IDPair{
+		UID: rootUID,
+		GID: rootGID,
+	}
+
 	// Make the link directory if it does not exist
-	if err := idtools.MkdirAllAs(path.Join(d.home, linkDir), 0700, rootUID, rootGID); err != nil {
+	if err := idtools.MkdirAllAndChownNew(path.Join(d.home, linkDir), 0700, idPair); err != nil {
 		return err
 	}
-	if err := idtools.MkdirAllAs(path.Dir(dir), 0700, rootUID, rootGID); err != nil {
+
+	if err := idtools.MkdirAllAndChownNew(path.Dir(dir), 0700, idPair); err != nil {
 		return err
 	}
 	if parent != "" {
@@ -896,14 +943,26 @@ func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts, disable
 		rootUID = int(st.UID())
 		rootGID = int(st.GID())
 	}
-	if err := idtools.MkdirAs(dir, 0700, rootUID, rootGID); err != nil {
+
+	if _, err := system.Lstat(dir); err == nil {
+		logrus.Warnf("Trying to create a layer %#v while directory %q already exists; removing it first", id, dir)
+		// Don’t just os.RemoveAll(dir) here; d.Remove also removes the link in linkDir,
+		// so that we can’t end up with two symlinks in linkDir pointing to the same layer.
+		if err := d.Remove(id); err != nil {
+			return errors.Wrapf(err, "removing a pre-existing layer directory %q", dir)
+		}
+	}
+
+	if err := idtools.MkdirAllAndChownNew(dir, 0700, idPair); err != nil {
 		return err
 	}
 
 	defer func() {
 		// Clean up on failure
 		if retErr != nil {
-			os.RemoveAll(dir)
+			if err2 := os.RemoveAll(dir); err2 != nil {
+				logrus.Errorf("While recovering from a failure creating a layer, error deleting %#v: %v", dir, err2)
+			}
 		}
 	}()
 
@@ -1039,17 +1098,22 @@ func (d *Driver) getLower(parent string) (string, error) {
 }
 
 func (d *Driver) dir(id string) string {
+	p, _ := d.dir2(id)
+	return p
+}
+
+func (d *Driver) dir2(id string) (string, bool) {
 	newpath := path.Join(d.home, id)
 	if _, err := os.Stat(newpath); err != nil {
 		for _, p := range d.AdditionalImageStores() {
 			l := path.Join(p, d.name, id)
 			_, err = os.Stat(l)
 			if err == nil {
-				return l
+				return l, true
 			}
 		}
 	}
-	return newpath
+	return newpath, false
 }
 
 func (d *Driver) getLowerDirs(id string) ([]string, error) {
@@ -1122,9 +1186,6 @@ func (d *Driver) optsAppendMappings(opts string, uidMaps, gidMaps []idtools.IDMa
 
 // Remove cleans the directories that are created for this id.
 func (d *Driver) Remove(id string) error {
-	d.locker.Lock(id)
-	defer d.locker.Unlock(id)
-
 	dir := d.dir(id)
 	lid, err := ioutil.ReadFile(path.Join(dir, "link"))
 	if err == nil {
@@ -1145,6 +1206,9 @@ func (d *Driver) Remove(id string) error {
 // under each layer has a symlink created for it under the linkDir. If the symlink does not
 // exist, it creates them
 func (d *Driver) recreateSymlinks() error {
+	// We have at most 3 corrective actions per layer, so 10 iterations is plenty.
+	const maxIterations = 10
+
 	// List all the directories under the home directory
 	dirs, err := ioutil.ReadDir(d.home)
 	if err != nil {
@@ -1162,6 +1226,7 @@ func (d *Driver) recreateSymlinks() error {
 	// Keep looping as long as we take some corrective action in each iteration
 	var errs *multierror.Error
 	madeProgress := true
+	iterations := 0
 	for madeProgress {
 		errs = nil
 		madeProgress = false
@@ -1175,7 +1240,7 @@ func (d *Driver) recreateSymlinks() error {
 			// Read the "link" file under each layer to get the name of the symlink
 			data, err := ioutil.ReadFile(path.Join(d.dir(dir.Name()), "link"))
 			if err != nil {
-				errs = multierror.Append(errs, errors.Wrapf(err, "reading name of symlink for %q", dir))
+				errs = multierror.Append(errs, errors.Wrapf(err, "reading name of symlink for %q", dir.Name()))
 				continue
 			}
 			linkPath := path.Join(d.home, linkDir, strings.Trim(string(data), "\n"))
@@ -1212,7 +1277,12 @@ func (d *Driver) recreateSymlinks() error {
 			if len(targetComponents) != 3 || targetComponents[0] != ".." || targetComponents[2] != "diff" {
 				errs = multierror.Append(errs, errors.Errorf("link target of %q looks weird: %q", link, target))
 				// force the link to be recreated on the next pass
-				os.Remove(filepath.Join(linksDir, link.Name()))
+				if err := os.Remove(filepath.Join(linksDir, link.Name())); err != nil {
+					if !os.IsNotExist(err) {
+						errs = multierror.Append(errs, errors.Wrapf(err, "removing link %q", link))
+					} // else don’t report any error, but also don’t set madeProgress.
+					continue
+				}
 				madeProgress = true
 				continue
 			}
@@ -1222,6 +1292,8 @@ func (d *Driver) recreateSymlinks() error {
 			linkFile := filepath.Join(d.dir(targetID), "link")
 			data, err := ioutil.ReadFile(linkFile)
 			if err != nil || string(data) != link.Name() {
+				// NOTE: If two or more links point to the same target, we will update linkFile
+				// with every value of link.Name(), and set madeProgress = true every time.
 				if err := ioutil.WriteFile(linkFile, []byte(link.Name()), 0644); err != nil {
 					errs = multierror.Append(errs, errors.Wrapf(err, "correcting link for layer %s", targetID))
 					continue
@@ -1229,6 +1301,11 @@ func (d *Driver) recreateSymlinks() error {
 				madeProgress = true
 			}
 		}
+		iterations++
+		if iterations >= maxIterations {
+			errs = multierror.Append(errs, fmt.Errorf("Reached %d iterations in overlay graph driver’s recreateSymlink, giving up", iterations))
+			break
+		}
 	}
 	if errs != nil {
 		return errs.ErrorOrNil()
@@ -1242,18 +1319,20 @@ func (d *Driver) Get(id string, options graphdriver.MountOpts) (_ string, retErr
 }
 
 func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountOpts) (_ string, retErr error) {
-	d.locker.Lock(id)
-	defer d.locker.Unlock(id)
-	dir := d.dir(id)
+	dir, inAdditionalStore := d.dir2(id)
 	if _, err := os.Stat(dir); err != nil {
 		return "", err
 	}
-	readWrite := true
+	readWrite := !inAdditionalStore
 
 	if !d.SupportsShifting() || options.DisableShifting {
 		disableShifting = true
 	}
 
+	logLevel := logrus.WarnLevel
+	if unshare.IsRootless() {
+		logLevel = logrus.DebugLevel
+	}
 	optsList := options.Options
 	if len(optsList) == 0 {
 		optsList = strings.Split(d.options.mountOptions, ",")
@@ -1262,16 +1341,18 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO
 		// options otherwise the kernel refuses to follow the metacopy xattr.
 		if hasMetacopyOption(strings.Split(d.options.mountOptions, ",")) && !hasMetacopyOption(options.Options) {
 			if d.usingMetacopy {
+				logrus.StandardLogger().Logf(logrus.DebugLevel, "Adding metacopy option, configured globally")
 				optsList = append(optsList, "metacopy=on")
-			} else {
-				logLevel := logrus.WarnLevel
-				if unshare.IsRootless() {
-					logLevel = logrus.DebugLevel
-				}
-				logrus.StandardLogger().Logf(logLevel, "Ignoring metacopy option from storage.conf, not supported with booted kernel")
 			}
 		}
 	}
+	if !d.usingMetacopy {
+		if hasMetacopyOption(optsList) {
+			logrus.StandardLogger().Logf(logLevel, "Ignoring global metacopy option, not supported with booted kernel")
+		}
+		optsList = stripOption(optsList, "metacopy=on")
+	}
+
 	for _, o := range optsList {
 		if o == "ro" {
 			readWrite = false
@@ -1322,7 +1403,7 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO
 	}
 	for err == nil {
 		absLowers = append(absLowers, filepath.Join(dir, nameWithSuffix("diff", diffN)))
-		relLowers = append(relLowers, dumbJoin(string(link), "..", nameWithSuffix("diff", diffN)))
+		relLowers = append(relLowers, dumbJoin(linkDir, string(link), "..", nameWithSuffix("diff", diffN)))
 		diffN++
 		st, err = os.Stat(filepath.Join(dir, nameWithSuffix("diff", diffN)))
 		if err == nil && !permsKnown {
@@ -1416,41 +1497,82 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO
 
 	workdir := path.Join(dir, "work")
 
-	var opts string
-	if readWrite {
-		opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), diffDir, workdir)
-	} else {
-		opts = fmt.Sprintf("lowerdir=%s:%s", diffDir, strings.Join(absLowers, ":"))
-	}
-	if len(optsList) > 0 {
-		opts = fmt.Sprintf("%s,%s", strings.Join(optsList, ","), opts)
-	}
-
 	if d.options.mountProgram == "" && unshare.IsRootless() {
-		opts = fmt.Sprintf("%s,userxattr", opts)
+		optsList = append(optsList, "userxattr")
 	}
 
-	// If "volatile" is not supported by the file system, just ignore the request
-	if options.Volatile && !hasVolatileOption(strings.Split(opts, ",")) {
+	if options.Volatile && !hasVolatileOption(optsList) {
 		supported, err := d.getSupportsVolatile()
 		if err != nil {
 			return "", err
 		}
+		// If "volatile" is not supported by the file system, just ignore the request
 		if supported {
-			opts = fmt.Sprintf("%s,volatile", opts)
+			optsList = append(optsList, "volatile")
 		}
 	}
 
+	if d.supportsIDmappedMounts() && len(options.UidMaps) > 0 && len(options.GidMaps) > 0 {
+		var newAbsDir []string
+		mappedRoot := filepath.Join(d.home, id, "mapped")
+		if err := os.MkdirAll(mappedRoot, 0700); err != nil {
+			return "", err
+		}
+
+		pid, cleanupFunc, err := createUsernsProcess(options.UidMaps, options.GidMaps)
+		if err != nil {
+			return "", err
+		}
+		defer cleanupFunc()
+
+		idMappedMounts := make(map[string]string)
+
+		// rewrite the lower dirs to their idmapped mount.
+		c := 0
+		for _, absLower := range absLowers {
+			mappedMountSrc := getMappedMountRoot(absLower)
+
+			root, found := idMappedMounts[mappedMountSrc]
+			if !found {
+				root = filepath.Join(mappedRoot, fmt.Sprintf("%d", c))
+				c++
+				if err := createIDMappedMount(mappedMountSrc, root, int(pid)); err != nil {
+					return "", errors.Wrapf(err, "create mapped mount for %q on %q", mappedMountSrc, root)
+				}
+				idMappedMounts[mappedMountSrc] = root
+
+				// overlay takes a reference on the mount, so it is safe to unmount
+				// the mapped idmounts as soon as the final overlay file system is mounted.
+				defer unix.Unmount(root, unix.MNT_DETACH)
+			}
+
+			// relative path to the layer through the id mapped mount
+			rel, err := filepath.Rel(mappedMountSrc, absLower)
+			if err != nil {
+				return "", err
+			}
+
+			newAbsDir = append(newAbsDir, filepath.Join(root, rel))
+		}
+		absLowers = newAbsDir
+	}
+
+	var opts string
+	if readWrite {
+		opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), diffDir, workdir)
+	} else {
+		opts = fmt.Sprintf("lowerdir=%s:%s", diffDir, strings.Join(absLowers, ":"))
+	}
+	if len(optsList) > 0 {
+		opts = fmt.Sprintf("%s,%s", opts, strings.Join(optsList, ","))
+	}
+
 	mountData := label.FormatMountLabel(opts, options.MountLabel)
 	mountFunc := unix.Mount
 	mountTarget := mergedDir
 
 	pageSize := unix.Getpagesize()
 
-	// Use relative paths and mountFrom when the mount data has exceeded
-	// the page size. The mount syscall fails if the mount data cannot
-	// fit within a page and relative links make the mount data much
-	// smaller at the expense of requiring a fork exec to chroot.
 	if d.options.mountProgram != "" {
 		mountFunc = func(source string, target string, mType string, flags uintptr, label string) error {
 			if !disableShifting {
@@ -1476,18 +1598,26 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO
 			}
 			return nil
 		}
-	} else if len(mountData) > pageSize {
+	} else if len(mountData) >= pageSize {
+		// Use relative paths and mountFrom when the mount data has exceeded
+		// the page size. The mount syscall fails if the mount data cannot
+		// fit within a page and relative links make the mount data much
+		// smaller at the expense of requiring a fork exec to chroot.
+
 		workdir = path.Join(id, "work")
 		//FIXME: We need to figure out to get this to work with additional stores
 		if readWrite {
 			diffDir := path.Join(id, "diff")
 			opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(relLowers, ":"), diffDir, workdir)
 		} else {
-			opts = fmt.Sprintf("lowerdir=%s", strings.Join(absLowers, ":"))
+			opts = fmt.Sprintf("lowerdir=%s", strings.Join(relLowers, ":"))
+		}
+		if len(optsList) > 0 {
+			opts = fmt.Sprintf("%s,%s", opts, strings.Join(optsList, ","))
 		}
 		mountData = label.FormatMountLabel(opts, options.MountLabel)
-		if len(mountData) > pageSize {
-			return "", fmt.Errorf("cannot mount layer, mount label %q too large %d > page size %d", options.MountLabel, len(mountData), pageSize)
+		if len(mountData) >= pageSize {
+			return "", fmt.Errorf("cannot mount layer, mount label %q too large %d >= page size %d", options.MountLabel, len(mountData), pageSize)
 		}
 		mountFunc = func(source string, target string, mType string, flags uintptr, label string) error {
 			return mountFrom(d.home, source, target, mType, flags, label)
@@ -1513,8 +1643,6 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO
 
 // Put unmounts the mount path created for the give id.
 func (d *Driver) Put(id string) error {
-	d.locker.Lock(id)
-	defer d.locker.Unlock(id)
 	dir := d.dir(id)
 	if _, err := os.Stat(dir); err != nil {
 		return err
@@ -1529,6 +1657,18 @@ func (d *Driver) Put(id string) error {
 
 	unmounted := false
 
+	mappedRoot := filepath.Join(d.home, id, "mapped")
+	// It should not happen, but cleanup any mapped mount if it was leaked.
+	if _, err := os.Stat(mappedRoot); err == nil {
+		mounts, err := ioutil.ReadDir(mappedRoot)
+		if err == nil {
+			// Go through all of the mapped mounts.
+			for _, m := range mounts {
+				_ = unix.Unmount(filepath.Join(mappedRoot, m.Name()), unix.MNT_DETACH)
+			}
+		}
+	}
+
 	if d.options.mountProgram != "" {
 		// Attempt to unmount the FUSE mount using either fusermount or fusermount3.
 		// If they fail, fallback to unix.Unmount
@@ -1606,11 +1746,24 @@ func (d *Driver) getWhiteoutFormat() archive.WhiteoutFormat {
 	return whiteoutFormat
 }
 
-type fileGetNilCloser struct {
-	storage.FileGetter
+type overlayFileGetter struct {
+	diffDirs []string
+}
+
+func (g *overlayFileGetter) Get(path string) (io.ReadCloser, error) {
+	for _, d := range g.diffDirs {
+		f, err := os.Open(filepath.Join(d, path))
+		if err == nil {
+			return f, nil
+		}
+	}
+	if len(g.diffDirs) > 0 {
+		return os.Open(filepath.Join(g.diffDirs[0], path))
+	}
+	return nil, fmt.Errorf("%s: %w", path, os.ErrNotExist)
 }
 
-func (f fileGetNilCloser) Close() error {
+func (g *overlayFileGetter) Close() error {
 	return nil
 }
 
@@ -1619,13 +1772,18 @@ func (d *Driver) getStagingDir() string {
 }
 
 // DiffGetter returns a FileGetCloser that can read files from the directory that
-// contains files for the layer differences. Used for direct access for tar-split.
+// contains files for the layer differences, either for this layer, or one of our
+// lowers if we're just a template directory. Used for direct access for tar-split.
 func (d *Driver) DiffGetter(id string) (graphdriver.FileGetCloser, error) {
 	p, err := d.getDiffPath(id)
 	if err != nil {
 		return nil, err
 	}
-	return fileGetNilCloser{storage.NewPathFileGetter(p)}, nil
+	paths, err := d.getLowerDiffPaths(id)
+	if err != nil {
+		return nil, err
+	}
+	return &overlayFileGetter{diffDirs: append([]string{p}, paths...)}, nil
 }
 
 // CleanupStagingDirectory cleanups the staging directory.
@@ -1900,12 +2058,31 @@ func (d *Driver) UpdateLayerIDMap(id string, toContainer, toHost *idtools.IDMapp
 	return nil
 }
 
+// supportsIDmappedMounts returns whether the kernel supports using idmapped mounts with
+// overlay lower layers.
+func (d *Driver) supportsIDmappedMounts() bool {
+	if d.supportsIDMappedMounts != nil {
+		return *d.supportsIDMappedMounts
+	}
+
+	supportsIDMappedMounts, err := checkAndRecordIDMappedSupport(d.home, d.runhome)
+	d.supportsIDMappedMounts = &supportsIDMappedMounts
+	if err == nil {
+		return supportsIDMappedMounts
+	}
+	logrus.Debugf("Check for idmapped mounts support %v", err)
+	return false
+}
+
 // SupportsShifting tells whether the driver support shifting of the UIDs/GIDs in an userNS
 func (d *Driver) SupportsShifting() bool {
 	if os.Getenv("_TEST_FORCE_SUPPORT_SHIFTING") == "yes-please" {
 		return true
 	}
-	return d.options.mountProgram != ""
+	if d.options.mountProgram != "" {
+		return true
+	}
+	return d.supportsIDmappedMounts()
 }
 
 // dumbJoin is more or less a dumber version of filepath.Join, but one which
@@ -2074,3 +2251,15 @@ func redirectDiffIfAdditionalLayer(diffPath string) (string, error) {
 	}
 	return diffPath, nil
 }
+
+// getMappedMountRoot is a heuristic that calculates the parent directory where
+// the idmapped mount should be applied.
+// It is useful to minimize the number of idmapped mounts and at the same time use
+// a common path as long as possible to reduce the length of the mount data argument.
+func getMappedMountRoot(path string) string {
+	dirName := filepath.Dir(path)
+	if filepath.Base(dirName) == linkDir {
+		return filepath.Dir(dirName)
+	}
+	return dirName
+}
diff --git a/vendor/github.com/containers/storage/drivers/register/register_zfs.go b/vendor/github.com/containers/storage/drivers/register/register_zfs.go
index c748468e5cb..4623e7f4648 100644
--- a/vendor/github.com/containers/storage/drivers/register/register_zfs.go
+++ b/vendor/github.com/containers/storage/drivers/register/register_zfs.go
@@ -1,4 +1,4 @@
-// +build !exclude_graphdriver_zfs,linux !exclude_graphdriver_zfs,freebsd, solaris
+// +build !exclude_graphdriver_zfs,linux !exclude_graphdriver_zfs,freebsd solaris
 
 package register
 
diff --git a/vendor/github.com/containers/storage/drivers/zfs/zfs.go b/vendor/github.com/containers/storage/drivers/zfs/zfs.go
index e034bf152c9..f29dc8f8556 100644
--- a/vendor/github.com/containers/storage/drivers/zfs/zfs.go
+++ b/vendor/github.com/containers/storage/drivers/zfs/zfs.go
@@ -344,7 +344,7 @@ func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts) error {
 				return errors.Wrap(err, "error creating zfs mount")
 			}
 			defer func() {
-				if err := unix.Unmount(mountpoint, unix.MNT_DETACH); err != nil {
+				if err := detachUnmount(mountpoint); err != nil {
 					logrus.Warnf("Failed to unmount %s mount %s: %v", id, mountpoint, err)
 				}
 			}()
@@ -483,7 +483,7 @@ func (d *Driver) Put(id string) error {
 
 	logger.Debugf(`unmount("%s")`, mountpoint)
 
-	if err := unix.Unmount(mountpoint, unix.MNT_DETACH); err != nil {
+	if err := detachUnmount(mountpoint); err != nil {
 		logger.Warnf("Failed to unmount %s mount %s: %v", id, mountpoint, err)
 	}
 	if err := unix.Rmdir(mountpoint); err != nil && !os.IsNotExist(err) {
diff --git a/vendor/github.com/containers/storage/drivers/zfs/zfs_freebsd.go b/vendor/github.com/containers/storage/drivers/zfs/zfs_freebsd.go
index bf690515984..61a2ed871a4 100644
--- a/vendor/github.com/containers/storage/drivers/zfs/zfs_freebsd.go
+++ b/vendor/github.com/containers/storage/drivers/zfs/zfs_freebsd.go
@@ -2,7 +2,6 @@ package zfs
 
 import (
 	"fmt"
-	"strings"
 
 	"github.com/containers/storage/drivers"
 	"github.com/pkg/errors"
@@ -26,14 +25,10 @@ func checkRootdirFs(rootdir string) error {
 }
 
 func getMountpoint(id string) string {
-	maxlen := 12
-
-	// we need to preserve filesystem suffix
-	suffix := strings.SplitN(id, "-", 2)
-
-	if len(suffix) > 1 {
-		return id[:maxlen] + "-" + suffix[1]
-	}
+	return id
+}
 
-	return id[:maxlen]
+func detachUnmount(mountpoint string) error {
+	// FreeBSD's MNT_FORCE is roughly equivalent to MNT_DETACH
+	return unix.Unmount(mountpoint, unix.MNT_FORCE)
 }
diff --git a/vendor/github.com/containers/storage/drivers/zfs/zfs_linux.go b/vendor/github.com/containers/storage/drivers/zfs/zfs_linux.go
index edcb1da36b7..44c68f394ec 100644
--- a/vendor/github.com/containers/storage/drivers/zfs/zfs_linux.go
+++ b/vendor/github.com/containers/storage/drivers/zfs/zfs_linux.go
@@ -4,6 +4,7 @@ import (
 	graphdriver "github.com/containers/storage/drivers"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
 )
 
 func checkRootdirFs(rootDir string) error {
@@ -27,3 +28,7 @@ func checkRootdirFs(rootDir string) error {
 func getMountpoint(id string) string {
 	return id
 }
+
+func detachUnmount(mountpoint string) error {
+	return unix.Unmount(mountpoint, unix.MNT_DETACH)
+}
diff --git a/vendor/github.com/containers/storage/errors.go b/vendor/github.com/containers/storage/errors.go
index 5fc810b89d4..de6e377541c 100644
--- a/vendor/github.com/containers/storage/errors.go
+++ b/vendor/github.com/containers/storage/errors.go
@@ -1,6 +1,8 @@
 package storage
 
 import (
+	"errors"
+
 	"github.com/containers/storage/types"
 )
 
@@ -55,4 +57,9 @@ var (
 	ErrStoreIsReadOnly = types.ErrStoreIsReadOnly
 	// ErrNotSupported is returned when the requested functionality is not supported.
 	ErrNotSupported = types.ErrNotSupported
+	// ErrInvalidMappings is returned when the specified mappings are invalid.
+	ErrInvalidMappings = types.ErrInvalidMappings
+	// errInvalidUpdateNameOperation is returned when updateNames is called with an invalid operation.
+	// Internal error
+	errInvalidUpdateNameOperation = errors.New("invalid update name operation")
 )
diff --git a/vendor/github.com/containers/storage/idset.go b/vendor/github.com/containers/storage/idset.go
index f870b9ceed3..0a06a43235f 100644
--- a/vendor/github.com/containers/storage/idset.go
+++ b/vendor/github.com/containers/storage/idset.go
@@ -1,6 +1,9 @@
 package storage
 
 import (
+	"fmt"
+	"strings"
+
 	"github.com/containers/storage/pkg/idtools"
 	"github.com/google/go-intervals/intervalset"
 	"github.com/pkg/errors"
@@ -218,3 +221,45 @@ func maxInt(a, b int) int {
 	}
 	return a
 }
+
+func hasOverlappingRanges(mappings []idtools.IDMap) error {
+	hostIntervals := intervalset.Empty()
+	containerIntervals := intervalset.Empty()
+
+	var conflicts []string
+
+	for _, m := range mappings {
+		c := interval{start: m.ContainerID, end: m.ContainerID + m.Size}
+		h := interval{start: m.HostID, end: m.HostID + m.Size}
+
+		added := false
+		overlaps := false
+
+		containerIntervals.IntervalsBetween(c, func(x intervalset.Interval) bool {
+			overlaps = true
+			return false
+		})
+		if overlaps {
+			conflicts = append(conflicts, fmt.Sprintf("%v:%v:%v", m.ContainerID, m.HostID, m.Size))
+			added = true
+		}
+		containerIntervals.Add(intervalset.NewSet([]intervalset.Interval{c}))
+
+		hostIntervals.IntervalsBetween(h, func(x intervalset.Interval) bool {
+			overlaps = true
+			return false
+		})
+		if overlaps && !added {
+			conflicts = append(conflicts, fmt.Sprintf("%v:%v:%v", m.ContainerID, m.HostID, m.Size))
+		}
+		hostIntervals.Add(intervalset.NewSet([]intervalset.Interval{h}))
+	}
+
+	if conflicts != nil {
+		if len(conflicts) == 1 {
+			return errors.Wrapf(ErrInvalidMappings, "the specified UID and/or GID mapping %s conflicts with other mappings", conflicts[0])
+		}
+		return errors.Wrapf(ErrInvalidMappings, "the specified UID and/or GID mappings %s conflict with other mappings", strings.Join(conflicts, ", "))
+	}
+	return nil
+}
diff --git a/vendor/github.com/containers/storage/images.go b/vendor/github.com/containers/storage/images.go
index bca25a65b8c..a4c3ed22c75 100644
--- a/vendor/github.com/containers/storage/images.go
+++ b/vendor/github.com/containers/storage/images.go
@@ -136,8 +136,19 @@ type ImageStore interface {
 	// SetNames replaces the list of names associated with an image with the
 	// supplied values.  The values are expected to be valid normalized
 	// named image references.
+	// Deprecated: Prone to race conditions, suggested alternatives are `AddNames` and `RemoveNames`.
 	SetNames(id string, names []string) error
 
+	// AddNames adds the supplied values to the list of names associated with the image with
+	// the specified id. The values are expected to be valid normalized
+	// named image references.
+	AddNames(id string, names []string) error
+
+	// RemoveNames removes the supplied values from the list of names associated with the image with
+	// the specified id.  The values are expected to be valid normalized
+	// named image references.
+	RemoveNames(id string, names []string) error
+
 	// Delete removes the record of the image.
 	Delete(id string) error
 
@@ -425,37 +436,36 @@ func (r *imageStore) Create(id string, names []string, layer, metadata string, c
 	if created.IsZero() {
 		created = time.Now().UTC()
 	}
-	if err == nil {
-		image = &Image{
-			ID:             id,
-			Digest:         searchableDigest,
-			Digests:        nil,
-			Names:          names,
-			TopLayer:       layer,
-			Metadata:       metadata,
-			BigDataNames:   []string{},
-			BigDataSizes:   make(map[string]int64),
-			BigDataDigests: make(map[string]digest.Digest),
-			Created:        created,
-			Flags:          make(map[string]interface{}),
-		}
-		err := image.recomputeDigests()
-		if err != nil {
-			return nil, errors.Wrapf(err, "error validating digests for new image")
-		}
-		r.images = append(r.images, image)
-		r.idindex.Add(id)
-		r.byid[id] = image
-		for _, name := range names {
-			r.byname[name] = image
-		}
-		for _, digest := range image.Digests {
-			list := r.bydigest[digest]
-			r.bydigest[digest] = append(list, image)
-		}
-		err = r.Save()
-		image = copyImage(image)
+
+	image = &Image{
+		ID:             id,
+		Digest:         searchableDigest,
+		Digests:        nil,
+		Names:          names,
+		TopLayer:       layer,
+		Metadata:       metadata,
+		BigDataNames:   []string{},
+		BigDataSizes:   make(map[string]int64),
+		BigDataDigests: make(map[string]digest.Digest),
+		Created:        created,
+		Flags:          make(map[string]interface{}),
+	}
+	err = image.recomputeDigests()
+	if err != nil {
+		return nil, errors.Wrapf(err, "error validating digests for new image")
+	}
+	r.images = append(r.images, image)
+	r.idindex.Add(id)
+	r.byid[id] = image
+	for _, name := range names {
+		r.byname[name] = image
+	}
+	for _, digest := range image.Digests {
+		list := r.bydigest[digest]
+		r.bydigest[digest] = append(list, image)
 	}
+	err = r.Save()
+	image = copyImage(image)
 	return image, err
 }
 
@@ -506,26 +516,44 @@ func (i *Image) addNameToHistory(name string) {
 	i.NamesHistory = dedupeNames(append([]string{name}, i.NamesHistory...))
 }
 
+// Deprecated: Prone to race conditions, suggested alternatives are `AddNames` and `RemoveNames`.
 func (r *imageStore) SetNames(id string, names []string) error {
+	return r.updateNames(id, names, setNames)
+}
+
+func (r *imageStore) AddNames(id string, names []string) error {
+	return r.updateNames(id, names, addNames)
+}
+
+func (r *imageStore) RemoveNames(id string, names []string) error {
+	return r.updateNames(id, names, removeNames)
+}
+
+func (r *imageStore) updateNames(id string, names []string, op updateNameOperation) error {
 	if !r.IsReadWrite() {
 		return errors.Wrapf(ErrStoreIsReadOnly, "not allowed to change image name assignments at %q", r.imagespath())
 	}
-	names = dedupeNames(names)
-	if image, ok := r.lookup(id); ok {
-		for _, name := range image.Names {
-			delete(r.byname, name)
-		}
-		for _, name := range names {
-			if otherImage, ok := r.byname[name]; ok {
-				r.removeName(otherImage, name)
-			}
-			r.byname[name] = image
-			image.addNameToHistory(name)
+	image, ok := r.lookup(id)
+	if !ok {
+		return errors.Wrapf(ErrImageUnknown, "error locating image with ID %q", id)
+	}
+	oldNames := image.Names
+	names, err := applyNameOperation(oldNames, names, op)
+	if err != nil {
+		return err
+	}
+	for _, name := range oldNames {
+		delete(r.byname, name)
+	}
+	for _, name := range names {
+		if otherImage, ok := r.byname[name]; ok {
+			r.removeName(otherImage, name)
 		}
-		image.Names = names
-		return r.Save()
+		r.byname[name] = image
+		image.addNameToHistory(name)
 	}
-	return errors.Wrapf(ErrImageUnknown, "error locating image with ID %q", id)
+	image.Names = names
+	return r.Save()
 }
 
 func (r *imageStore) Delete(id string) error {
diff --git a/vendor/github.com/containers/storage/layers.go b/vendor/github.com/containers/storage/layers.go
index fbf6ad3621f..bba8d7588f7 100644
--- a/vendor/github.com/containers/storage/layers.go
+++ b/vendor/github.com/containers/storage/layers.go
@@ -23,6 +23,7 @@ import (
 	"github.com/containers/storage/pkg/system"
 	"github.com/containers/storage/pkg/tarlog"
 	"github.com/containers/storage/pkg/truncindex"
+	multierror "github.com/hashicorp/go-multierror"
 	"github.com/klauspost/pgzip"
 	digest "github.com/opencontainers/go-digest"
 	"github.com/opencontainers/selinux/go-selinux/label"
@@ -220,8 +221,17 @@ type LayerStore interface {
 
 	// SetNames replaces the list of names associated with a layer with the
 	// supplied values.
+	// Deprecated: Prone to race conditions, suggested alternatives are `AddNames` and `RemoveNames`.
 	SetNames(id string, names []string) error
 
+	// AddNames adds the supplied values to the list of names associated with the layer with the
+	// specified id.
+	AddNames(id string, names []string) error
+
+	// RemoveNames removes the supplied values from the list of names associated with the layer with the
+	// specified id.
+	RemoveNames(id string, names []string) error
+
 	// Delete deletes a layer with the specified name or ID.
 	Delete(id string) error
 
@@ -398,14 +408,13 @@ func (r *layerStore) Load() error {
 				if layer.Flags == nil {
 					layer.Flags = make(map[string]interface{})
 				}
-				if cleanup, ok := layer.Flags[incompleteFlag]; ok {
-					if b, ok := cleanup.(bool); ok && b {
-						err = r.deleteInternal(layer.ID)
-						if err != nil {
-							break
-						}
-						shouldSave = true
+				if layerHasIncompleteFlag(layer) {
+					logrus.Warnf("Found incomplete layer %#v, deleting it", layer.ID)
+					err = r.deleteInternal(layer.ID)
+					if err != nil {
+						break
 					}
+					shouldSave = true
 				}
 			}
 		}
@@ -674,7 +683,7 @@ func (r *layerStore) PutAdditionalLayer(id string, parentLayer *Layer, names []s
 		r.bycompressedsum[layer.CompressedDigest] = append(r.bycompressedsum[layer.CompressedDigest], layer.ID)
 	}
 	if layer.UncompressedDigest != "" {
-		r.byuncompressedsum[layer.CompressedDigest] = append(r.byuncompressedsum[layer.CompressedDigest], layer.ID)
+		r.byuncompressedsum[layer.UncompressedDigest] = append(r.byuncompressedsum[layer.UncompressedDigest], layer.ID)
 	}
 	if err := r.Save(); err != nil {
 		r.driver.Remove(id)
@@ -683,11 +692,10 @@ func (r *layerStore) PutAdditionalLayer(id string, parentLayer *Layer, names []s
 	return copyLayer(layer), nil
 }
 
-func (r *layerStore) Put(id string, parentLayer *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, flags map[string]interface{}, diff io.Reader) (layer *Layer, size int64, err error) {
+func (r *layerStore) Put(id string, parentLayer *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, flags map[string]interface{}, diff io.Reader) (*Layer, int64, error) {
 	if !r.IsReadWrite() {
 		return nil, -1, errors.Wrapf(ErrStoreIsReadOnly, "not allowed to create new layers at %q", r.layerspath())
 	}
-	size = -1
 	if err := os.MkdirAll(r.rundir, 0700); err != nil {
 		return nil, -1, err
 	}
@@ -716,12 +724,32 @@ func (r *layerStore) Put(id string, parentLayer *Layer, names []string, mountLab
 		parent = parentLayer.ID
 	}
 	var parentMappings, templateIDMappings, oldMappings *idtools.IDMappings
+	var (
+		templateMetadata           string
+		templateCompressedDigest   digest.Digest
+		templateCompressedSize     int64
+		templateUncompressedDigest digest.Digest
+		templateUncompressedSize   int64
+		templateCompressionType    archive.Compression
+		templateUIDs, templateGIDs []uint32
+		templateTSdata             []byte
+	)
 	if moreOptions.TemplateLayer != "" {
+		var tserr error
 		templateLayer, ok := r.lookup(moreOptions.TemplateLayer)
 		if !ok {
 			return nil, -1, ErrLayerUnknown
 		}
+		templateMetadata = templateLayer.Metadata
 		templateIDMappings = idtools.NewIDMappingsFromMaps(templateLayer.UIDMap, templateLayer.GIDMap)
+		templateCompressedDigest, templateCompressedSize = templateLayer.CompressedDigest, templateLayer.CompressedSize
+		templateUncompressedDigest, templateUncompressedSize = templateLayer.UncompressedDigest, templateLayer.UncompressedSize
+		templateCompressionType = templateLayer.CompressionType
+		templateUIDs, templateGIDs = append([]uint32{}, templateLayer.UIDs...), append([]uint32{}, templateLayer.GIDs...)
+		templateTSdata, tserr = ioutil.ReadFile(r.tspath(templateLayer.ID))
+		if tserr != nil && !os.IsNotExist(tserr) {
+			return nil, -1, tserr
+		}
 	} else {
 		templateIDMappings = &idtools.IDMappings{}
 	}
@@ -733,6 +761,60 @@ func (r *layerStore) Put(id string, parentLayer *Layer, names []string, mountLab
 	if mountLabel != "" {
 		label.ReserveLabel(mountLabel)
 	}
+
+	// Before actually creating the layer, make a persistent record of it with incompleteFlag,
+	// so that future processes have a chance to delete it.
+	layer := &Layer{
+		ID:                 id,
+		Parent:             parent,
+		Names:              names,
+		MountLabel:         mountLabel,
+		Metadata:           templateMetadata,
+		Created:            time.Now().UTC(),
+		CompressedDigest:   templateCompressedDigest,
+		CompressedSize:     templateCompressedSize,
+		UncompressedDigest: templateUncompressedDigest,
+		UncompressedSize:   templateUncompressedSize,
+		CompressionType:    templateCompressionType,
+		UIDs:               templateUIDs,
+		GIDs:               templateGIDs,
+		Flags:              make(map[string]interface{}),
+		UIDMap:             copyIDMap(moreOptions.UIDMap),
+		GIDMap:             copyIDMap(moreOptions.GIDMap),
+		BigDataNames:       []string{},
+	}
+	r.layers = append(r.layers, layer)
+	r.idindex.Add(id)
+	r.byid[id] = layer
+	for _, name := range names {
+		r.byname[name] = layer
+	}
+	for flag, value := range flags {
+		layer.Flags[flag] = value
+	}
+	layer.Flags[incompleteFlag] = true
+
+	succeeded := false
+	cleanupFailureContext := ""
+	defer func() {
+		if !succeeded {
+			// On any error, try both removing the driver's data as well
+			// as the in-memory layer record.
+			if err2 := r.Delete(layer.ID); err2 != nil {
+				if cleanupFailureContext == "" {
+					cleanupFailureContext = "unknown: cleanupFailureContext not set at the failure site"
+				}
+				logrus.Errorf("While recovering from a failure (%s), error deleting layer %#v: %v", cleanupFailureContext, layer.ID, err2)
+			}
+		}
+	}()
+
+	err := r.Save()
+	if err != nil {
+		cleanupFailureContext = "saving incomplete layer metadata"
+		return nil, -1, err
+	}
+
 	idMappings := idtools.NewIDMappingsFromMaps(moreOptions.UIDMap, moreOptions.GIDMap)
 	opts := drivers.CreateOpts{
 		MountLabel: mountLabel,
@@ -740,89 +822,67 @@ func (r *layerStore) Put(id string, parentLayer *Layer, names []string, mountLab
 		IDMappings: idMappings,
 	}
 	if moreOptions.TemplateLayer != "" {
-		if err = r.driver.CreateFromTemplate(id, moreOptions.TemplateLayer, templateIDMappings, parent, parentMappings, &opts, writeable); err != nil {
-			if id != "" {
-				return nil, -1, errors.Wrapf(err, "error creating copy of template layer %q with ID %q", moreOptions.TemplateLayer, id)
-			}
-			return nil, -1, errors.Wrapf(err, "error creating copy of template layer %q", moreOptions.TemplateLayer)
+		if err := r.driver.CreateFromTemplate(id, moreOptions.TemplateLayer, templateIDMappings, parent, parentMappings, &opts, writeable); err != nil {
+			cleanupFailureContext = "creating a layer from template"
+			return nil, -1, errors.Wrapf(err, "error creating copy of template layer %q with ID %q", moreOptions.TemplateLayer, id)
 		}
 		oldMappings = templateIDMappings
 	} else {
 		if writeable {
-			if err = r.driver.CreateReadWrite(id, parent, &opts); err != nil {
-				if id != "" {
-					return nil, -1, errors.Wrapf(err, "error creating read-write layer with ID %q", id)
-				}
-				return nil, -1, errors.Wrapf(err, "error creating read-write layer")
+			if err := r.driver.CreateReadWrite(id, parent, &opts); err != nil {
+				cleanupFailureContext = "creating a read-write layer"
+				return nil, -1, errors.Wrapf(err, "error creating read-write layer with ID %q", id)
 			}
 		} else {
-			if err = r.driver.Create(id, parent, &opts); err != nil {
-				if id != "" {
-					return nil, -1, errors.Wrapf(err, "error creating layer with ID %q", id)
-				}
-				return nil, -1, errors.Wrapf(err, "error creating layer")
+			if err := r.driver.Create(id, parent, &opts); err != nil {
+				cleanupFailureContext = "creating a read-only layer"
+				return nil, -1, errors.Wrapf(err, "error creating layer with ID %q", id)
 			}
 		}
 		oldMappings = parentMappings
 	}
 	if !reflect.DeepEqual(oldMappings.UIDs(), idMappings.UIDs()) || !reflect.DeepEqual(oldMappings.GIDs(), idMappings.GIDs()) {
-		if err = r.driver.UpdateLayerIDMap(id, oldMappings, idMappings, mountLabel); err != nil {
-			// We don't have a record of this layer, but at least
-			// try to clean it up underneath us.
-			r.driver.Remove(id)
+		if err := r.driver.UpdateLayerIDMap(id, oldMappings, idMappings, mountLabel); err != nil {
+			cleanupFailureContext = "in UpdateLayerIDMap"
 			return nil, -1, err
 		}
 	}
-	if err == nil {
-		layer = &Layer{
-			ID:           id,
-			Parent:       parent,
-			Names:        names,
-			MountLabel:   mountLabel,
-			Created:      time.Now().UTC(),
-			Flags:        make(map[string]interface{}),
-			UIDMap:       copyIDMap(moreOptions.UIDMap),
-			GIDMap:       copyIDMap(moreOptions.GIDMap),
-			BigDataNames: []string{},
-		}
-		r.layers = append(r.layers, layer)
-		r.idindex.Add(id)
-		r.byid[id] = layer
-		for _, name := range names {
-			r.byname[name] = layer
-		}
-		for flag, value := range flags {
-			layer.Flags[flag] = value
-		}
-		if diff != nil {
-			layer.Flags[incompleteFlag] = true
-			err = r.Save()
-			if err != nil {
-				// We don't have a record of this layer, but at least
-				// try to clean it up underneath us.
-				r.driver.Remove(id)
-				return nil, -1, err
-			}
-			size, err = r.applyDiffWithOptions(layer.ID, moreOptions, diff)
-			if err != nil {
-				if r.Delete(layer.ID) != nil {
-					// Either a driver error or an error saving.
-					// We now have a layer that's been marked for
-					// deletion but which we failed to remove.
-				}
-				return nil, -1, err
-			}
-			delete(layer.Flags, incompleteFlag)
+	if len(templateTSdata) > 0 {
+		if err := os.MkdirAll(filepath.Dir(r.tspath(id)), 0o700); err != nil {
+			cleanupFailureContext = "creating tar-split parent directory for a copy from template"
+			return nil, -1, err
+		}
+		if err := ioutils.AtomicWriteFile(r.tspath(id), templateTSdata, 0o600); err != nil {
+			cleanupFailureContext = "creating a tar-split copy from template"
+			return nil, -1, err
 		}
-		err = r.Save()
+	}
+
+	var size int64 = -1
+	if diff != nil {
+		size, err = r.applyDiffWithOptions(layer.ID, moreOptions, diff)
 		if err != nil {
-			// We don't have a record of this layer, but at least
-			// try to clean it up underneath us.
-			r.driver.Remove(id)
+			cleanupFailureContext = "applying layer diff"
 			return nil, -1, err
 		}
-		layer = copyLayer(layer)
+	} else {
+		// applyDiffWithOptions in the `diff != nil` case handles this bit for us
+		if layer.CompressedDigest != "" {
+			r.bycompressedsum[layer.CompressedDigest] = append(r.bycompressedsum[layer.CompressedDigest], layer.ID)
+		}
+		if layer.UncompressedDigest != "" {
+			r.byuncompressedsum[layer.UncompressedDigest] = append(r.byuncompressedsum[layer.UncompressedDigest], layer.ID)
+		}
 	}
+	delete(layer.Flags, incompleteFlag)
+	err = r.Save()
+	if err != nil {
+		cleanupFailureContext = "saving finished layer metadata"
+		return nil, -1, err
+	}
+
+	layer = copyLayer(layer)
+	succeeded = true
 	return layer, size, err
 }
 
@@ -854,7 +914,6 @@ func (r *layerStore) Mounted(id string) (int, error) {
 }
 
 func (r *layerStore) Mount(id string, options drivers.MountOpts) (string, error) {
-
 	// check whether options include ro option
 	hasReadOnlyOpt := func(opts []string) bool {
 		for _, item := range opts {
@@ -1031,25 +1090,43 @@ func (r *layerStore) removeName(layer *Layer, name string) {
 	layer.Names = stringSliceWithoutValue(layer.Names, name)
 }
 
+// Deprecated: Prone to race conditions, suggested alternatives are `AddNames` and `RemoveNames`.
 func (r *layerStore) SetNames(id string, names []string) error {
+	return r.updateNames(id, names, setNames)
+}
+
+func (r *layerStore) AddNames(id string, names []string) error {
+	return r.updateNames(id, names, addNames)
+}
+
+func (r *layerStore) RemoveNames(id string, names []string) error {
+	return r.updateNames(id, names, removeNames)
+}
+
+func (r *layerStore) updateNames(id string, names []string, op updateNameOperation) error {
 	if !r.IsReadWrite() {
 		return errors.Wrapf(ErrStoreIsReadOnly, "not allowed to change layer name assignments at %q", r.layerspath())
 	}
-	names = dedupeNames(names)
-	if layer, ok := r.lookup(id); ok {
-		for _, name := range layer.Names {
-			delete(r.byname, name)
-		}
-		for _, name := range names {
-			if otherLayer, ok := r.byname[name]; ok {
-				r.removeName(otherLayer, name)
-			}
-			r.byname[name] = layer
+	layer, ok := r.lookup(id)
+	if !ok {
+		return ErrLayerUnknown
+	}
+	oldNames := layer.Names
+	names, err := applyNameOperation(oldNames, names, op)
+	if err != nil {
+		return err
+	}
+	for _, name := range oldNames {
+		delete(r.byname, name)
+	}
+	for _, name := range names {
+		if otherLayer, ok := r.byname[name]; ok {
+			r.removeName(otherLayer, name)
 		}
-		layer.Names = names
-		return r.Save()
+		r.byname[name] = layer
 	}
-	return ErrLayerUnknown
+	layer.Names = names
+	return r.Save()
 }
 
 func (r *layerStore) datadir(id string) string {
@@ -1148,6 +1225,17 @@ func (r *layerStore) tspath(id string) string {
 	return filepath.Join(r.layerdir, id+tarSplitSuffix)
 }
 
+// layerHasIncompleteFlag returns true if layer.Flags contains an incompleteFlag set to true
+func layerHasIncompleteFlag(layer *Layer) bool {
+	// layer.Flags[…] is defined to succeed and return ok == false if Flags == nil
+	if flagValue, ok := layer.Flags[incompleteFlag]; ok {
+		if b, ok := flagValue.(bool); ok && b {
+			return true
+		}
+	}
+	return false
+}
+
 func (r *layerStore) deleteInternal(id string) error {
 	if !r.IsReadWrite() {
 		return errors.Wrapf(ErrStoreIsReadOnly, "not allowed to delete layers at %q", r.layerspath())
@@ -1156,6 +1244,18 @@ func (r *layerStore) deleteInternal(id string) error {
 	if !ok {
 		return ErrLayerUnknown
 	}
+	// Ensure that if we are interrupted, the layer will be cleaned up.
+	if !layerHasIncompleteFlag(layer) {
+		if layer.Flags == nil {
+			layer.Flags = make(map[string]interface{})
+		}
+		layer.Flags[incompleteFlag] = true
+		if err := r.Save(); err != nil {
+			return err
+		}
+	}
+	// We never unset incompleteFlag; below, we remove the entire object from r.layers.
+
 	id = layer.ID
 	err := r.driver.Remove(id)
 	if err != nil {
@@ -1463,34 +1563,48 @@ func (r *layerStore) Diff(from, to string, options *DiffOptions) (io.ReadCloser,
 		}
 		return maybeCompressReadCloser(diff)
 	}
-	defer tsfile.Close()
 
 	decompressor, err := pgzip.NewReader(tsfile)
 	if err != nil {
-		return nil, err
-	}
-	defer decompressor.Close()
-
-	tsbytes, err := ioutil.ReadAll(decompressor)
-	if err != nil {
+		if e := tsfile.Close(); e != nil {
+			logrus.Debug(e)
+		}
 		return nil, err
 	}
 
-	metadata = storage.NewJSONUnpacker(bytes.NewBuffer(tsbytes))
+	metadata = storage.NewJSONUnpacker(decompressor)
 
 	fgetter, err := r.newFileGetter(to)
 	if err != nil {
-		return nil, err
+		errs := multierror.Append(nil, errors.Wrapf(err, "creating file-getter"))
+		if err := decompressor.Close(); err != nil {
+			errs = multierror.Append(errs, errors.Wrapf(err, "closing decompressor"))
+		}
+		if err := tsfile.Close(); err != nil {
+			errs = multierror.Append(errs, errors.Wrapf(err, "closing tarstream headers"))
+		}
+		return nil, errs.ErrorOrNil()
 	}
 
 	tarstream := asm.NewOutputTarStream(fgetter, metadata)
 	rc := ioutils.NewReadCloserWrapper(tarstream, func() error {
-		err1 := tarstream.Close()
-		err2 := fgetter.Close()
-		if err2 == nil {
-			return err1
+		var errs *multierror.Error
+		if err := decompressor.Close(); err != nil {
+			errs = multierror.Append(errs, errors.Wrapf(err, "closing decompressor"))
+		}
+		if err := tsfile.Close(); err != nil {
+			errs = multierror.Append(errs, errors.Wrapf(err, "closing tarstream headers"))
+		}
+		if err := tarstream.Close(); err != nil {
+			errs = multierror.Append(errs, errors.Wrapf(err, "closing reconstructed tarstream"))
+		}
+		if err := fgetter.Close(); err != nil {
+			errs = multierror.Append(errs, errors.Wrapf(err, "closing file-getter"))
+		}
+		if errs != nil {
+			return errs.ErrorOrNil()
 		}
-		return err2
+		return nil
 	})
 	return maybeCompressReadCloser(rc)
 }
diff --git a/vendor/github.com/containers/storage/pkg/archive/archive.go b/vendor/github.com/containers/storage/pkg/archive/archive.go
index 76544ff289b..d4f129ee634 100644
--- a/vendor/github.com/containers/storage/pkg/archive/archive.go
+++ b/vendor/github.com/containers/storage/pkg/archive/archive.go
@@ -7,6 +7,7 @@ import (
 	"compress/bzip2"
 	"fmt"
 	"io"
+	"io/fs"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -77,6 +78,10 @@ const (
 	containersOverrideXattr = "user.containers.override_stat"
 )
 
+var xattrsToIgnore = map[string]interface{}{
+	"security.selinux": true,
+}
+
 // Archiver allows the reuse of most utility functions of this package with a
 // pluggable Untar function.  To facilitate the passing of specific id mappings
 // for untar, an archiver can be created with maps which will then be passed to
@@ -507,6 +512,10 @@ func (ta *tarAppender) addTarFile(path, name string) error {
 			return err
 		}
 	}
+	if fi.Mode()&os.ModeSocket != 0 {
+		logrus.Warnf("archive: skipping %q since it is a socket", path)
+		return nil
+	}
 
 	hdr, err := FileInfoHeader(name, fi, link)
 	if err != nil {
@@ -743,6 +752,9 @@ func createTarFile(path, extractDir string, hdr *tar.Header, reader io.Reader, L
 
 	var errs []string
 	for key, value := range hdr.Xattrs {
+		if _, found := xattrsToIgnore[key]; found {
+			continue
+		}
 		if err := system.Lsetxattr(path, key, []byte(value), 0); err != nil {
 			if errors.Is(err, syscall.ENOTSUP) || (inUserns && errors.Is(err, syscall.EPERM)) {
 				// We ignore errors here because not all graphdrivers support
@@ -852,14 +864,14 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
 			rebaseName := options.RebaseNames[include]
 
 			walkRoot := getWalkRoot(srcPath, include)
-			filepath.Walk(walkRoot, func(filePath string, f os.FileInfo, err error) error {
+			filepath.WalkDir(walkRoot, func(filePath string, d fs.DirEntry, err error) error {
 				if err != nil {
 					logrus.Errorf("Tar: Can't stat file %s to tar: %s", srcPath, err)
 					return nil
 				}
 
 				relFilePath, err := filepath.Rel(srcPath, filePath)
-				if err != nil || (!options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
+				if err != nil || (!options.IncludeSourceDir && relFilePath == "." && d.IsDir()) {
 					// Error getting relative path OR we are looking
 					// at the source directory path. Skip in both situations.
 					return nil
@@ -892,7 +904,7 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
 					// dir. If so then we can't skip this dir.
 
 					// Its not a dir then so we can just return/skip.
-					if !f.IsDir() {
+					if !d.IsDir() {
 						return nil
 					}
 
@@ -962,7 +974,10 @@ func Unpack(decompressedArchive io.Reader, dest string, options *TarOptions) err
 	whiteoutConverter := GetWhiteoutConverter(options.WhiteoutFormat, options.WhiteoutData)
 	buffer := make([]byte, 1<<20)
 
+	doChown := !options.NoLchown
 	if options.ForceMask != nil {
+		// if ForceMask is in place, make sure lchown is disabled.
+		doChown = false
 		uid, gid, mode, err := GetFileOwner(dest)
 		if err == nil {
 			value := fmt.Sprintf("%d:%d:0%o", uid, gid, mode)
@@ -1067,7 +1082,7 @@ loop:
 			chownOpts = &idtools.IDPair{UID: hdr.Uid, GID: hdr.Gid}
 		}
 
-		if err := createTarFile(path, dest, hdr, trBuf, !options.NoLchown, chownOpts, options.InUserNS, options.IgnoreChownErrors, options.ForceMask, buffer); err != nil {
+		if err = createTarFile(path, dest, hdr, trBuf, doChown, chownOpts, options.InUserNS, options.IgnoreChownErrors, options.ForceMask, buffer); err != nil {
 			return err
 		}
 
diff --git a/vendor/github.com/containers/storage/pkg/archive/archive_linux.go b/vendor/github.com/containers/storage/pkg/archive/archive_linux.go
index 2f548b661ce..51fbd9a2197 100644
--- a/vendor/github.com/containers/storage/pkg/archive/archive_linux.go
+++ b/vendor/github.com/containers/storage/pkg/archive/archive_linux.go
@@ -36,7 +36,7 @@ func (o overlayWhiteoutConverter) ConvertWrite(hdr *tar.Header, path string, fi
 		// we just rename the file and make it normal
 		dir, filename := filepath.Split(hdr.Name)
 		hdr.Name = filepath.Join(dir, WhiteoutPrefix+filename)
-		hdr.Mode = 0600
+		hdr.Mode = 0
 		hdr.Typeflag = tar.TypeReg
 		hdr.Size = 0
 	}
diff --git a/vendor/github.com/containers/storage/pkg/archive/changes_other.go b/vendor/github.com/containers/storage/pkg/archive/changes_other.go
index bbbd8c9de87..8769f2291b6 100644
--- a/vendor/github.com/containers/storage/pkg/archive/changes_other.go
+++ b/vendor/github.com/containers/storage/pkg/archive/changes_other.go
@@ -1,9 +1,11 @@
+//go:build !linux
 // +build !linux
 
 package archive
 
 import (
 	"fmt"
+	"io/fs"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -41,7 +43,7 @@ func collectFileInfoForChanges(oldDir, newDir string, oldIDMap, newIDMap *idtool
 func collectFileInfo(sourceDir string, idMappings *idtools.IDMappings) (*FileInfo, error) {
 	root := newRootFileInfo(idMappings)
 
-	err := filepath.Walk(sourceDir, func(path string, f os.FileInfo, err error) error {
+	err := filepath.WalkDir(sourceDir, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			return err
 		}
diff --git a/vendor/github.com/containers/storage/pkg/archive/diff.go b/vendor/github.com/containers/storage/pkg/archive/diff.go
index 14ffad5c0d4..ca8832fe421 100644
--- a/vendor/github.com/containers/storage/pkg/archive/diff.go
+++ b/vendor/github.com/containers/storage/pkg/archive/diff.go
@@ -4,6 +4,7 @@ import (
 	"archive/tar"
 	"fmt"
 	"io"
+	"io/fs"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -134,7 +135,7 @@ func UnpackLayer(dest string, layer io.Reader, options *TarOptions) (size int64,
 				if err != nil {
 					return 0, err
 				}
-				err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
+				err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
 					if err != nil {
 						if os.IsNotExist(err) {
 							err = nil // parent was deleted
diff --git a/vendor/github.com/containers/storage/pkg/chrootarchive/archive.go b/vendor/github.com/containers/storage/pkg/chrootarchive/archive.go
index e874eb74e05..482e036630f 100644
--- a/vendor/github.com/containers/storage/pkg/chrootarchive/archive.go
+++ b/vendor/github.com/containers/storage/pkg/chrootarchive/archive.go
@@ -5,9 +5,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"net"
 	"os"
-	"os/user"
 	"path/filepath"
 	"sync"
 
@@ -17,13 +15,6 @@ import (
 	"github.com/pkg/errors"
 )
 
-func init() {
-	// initialize nss libraries in Glibc so that the dynamic libraries are loaded in the host
-	// environment not in the chroot from untrusted files.
-	_, _ = user.Lookup("storage")
-	_, _ = net.LookupHost("localhost")
-}
-
 // NewArchiver returns a new Archiver which uses chrootarchive.Untar
 func NewArchiver(idMappings *idtools.IDMappings) *archive.Archiver {
 	archiver := archive.NewArchiver(idMappings)
diff --git a/vendor/github.com/containers/storage/pkg/chrootarchive/chroot_linux.go b/vendor/github.com/containers/storage/pkg/chrootarchive/chroot_linux.go
index 76c94c6c1e9..58729ec8cc7 100644
--- a/vendor/github.com/containers/storage/pkg/chrootarchive/chroot_linux.go
+++ b/vendor/github.com/containers/storage/pkg/chrootarchive/chroot_linux.go
@@ -3,7 +3,9 @@ package chrootarchive
 import (
 	"fmt"
 	"io/ioutil"
+	"net"
 	"os"
+	"os/user"
 	"path/filepath"
 
 	"github.com/containers/storage/pkg/mount"
@@ -23,6 +25,11 @@ func chroot(path string) (err error) {
 		return err
 	}
 
+	// initialize nss libraries in Glibc so that the dynamic libraries are loaded in the host
+	// environment not in the chroot from untrusted files.
+	_, _ = user.Lookup("storage")
+	_, _ = net.LookupHost("localhost")
+
 	// if the process doesn't have CAP_SYS_ADMIN, but does have CAP_SYS_CHROOT, we need to use the actual chroot
 	if !caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) && caps.Get(capability.EFFECTIVE, capability.CAP_SYS_CHROOT) {
 		return realChroot(path)
diff --git a/vendor/github.com/containers/storage/pkg/chunked/cache_linux.go b/vendor/github.com/containers/storage/pkg/chunked/cache_linux.go
new file mode 100644
index 00000000000..b8b278a1329
--- /dev/null
+++ b/vendor/github.com/containers/storage/pkg/chunked/cache_linux.go
@@ -0,0 +1,627 @@
+package chunked
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+
+	storage "github.com/containers/storage"
+	"github.com/containers/storage/pkg/chunked/internal"
+	"github.com/containers/storage/pkg/ioutils"
+	jsoniter "github.com/json-iterator/go"
+	digest "github.com/opencontainers/go-digest"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+)
+
+const (
+	cacheKey     = "chunked-manifest-cache"
+	cacheVersion = 1
+)
+
+type metadata struct {
+	tagLen    int    // length of each tag; all the tags have the same length
+	digestLen int    // length of the digest at the beginning of each tag
+	tags      []byte // the sorted tags, concatenated
+	vdata     []byte // variable length data holding the file locations the tags point to
+}
+
+type layer struct {
+	id       string    // ID of the layer in the store
+	metadata *metadata // lookaside cache contents for the layer
+	target   string    // value returned by store.DifferTarget for the layer
+}
+
+type layersCache struct {
+	layers  []layer       // layers known to the cache; load and findDigestInternal access it under mutex
+	refs    int           // reference count, modified with cacheMutex held
+	store   storage.Store // store the layers are read from
+	mutex   sync.RWMutex  // guards layers
+	created time.Time     // creation time; getLayersCacheRef compares its age against a 10 minutes limit
+}
+
+var cacheMutex sync.Mutex
+var cache *layersCache
+
+// release drops one reference to c and clears the package-level cache once no reference is left.
+func (c *layersCache) release() {
+	cacheMutex.Lock()
+	defer cacheMutex.Unlock()
+	if c.refs--; c.refs != 0 {
+		return
+	}
+	cache = nil
+}
+
+// getLayersCacheRef returns a new reference to the package-level cache when it belongs to
+// store and is less than 10 minutes old; otherwise it installs and returns a fresh cache.
+func getLayersCacheRef(store storage.Store) *layersCache {
+	cacheMutex.Lock()
+	defer cacheMutex.Unlock()
+	if cache != nil && cache.store == store && time.Since(cache.created).Minutes() < 10 {
+		cache.refs++
+		return cache
+	}
+	// Use "=" and not ":=": a shadowing local would leave the package-level variable nil
+	// forever, so the reuse branch above could never be taken.
+	cache = &layersCache{store: store, refs: 1, created: time.Now()}
+	return cache
+}
+
+// getLayersCache returns a loaded layers cache for store; the reference is released if loading fails.
+func getLayersCache(store storage.Store) (*layersCache, error) {
+	c := getLayersCacheRef(store)
+	if err := c.load(); err != nil {
+		c.release()
+		return nil, err
+	}
+	return c, nil
+}
+
+// load syncs c.layers with the store: layers missing from the cache are added, reading their
+// lookaside cache or creating it from the layer TOC, and layers gone from the store are dropped.
+func (c *layersCache) load() error {
+	c.mutex.Lock()
+	defer c.mutex.Unlock()
+	allLayers, err := c.store.Layers()
+	if err != nil {
+		return err
+	}
+	existingLayers := make(map[string]string)
+	for _, r := range c.layers {
+		existingLayers[r.id] = r.target
+	}
+
+	currentLayers := make(map[string]string)
+	for _, r := range allLayers {
+		currentLayers[r.ID] = r.ID
+		if _, found := existingLayers[r.ID]; found {
+			continue
+		}
+
+		bigData, err := c.store.LayerBigData(r.ID, cacheKey)
+		// if the cache already exists, read and use it
+		if err == nil {
+			metadata, err := readMetadataFromCache(bigData)
+			// Close right away: a defer inside this loop keeps every layer's reader open until load returns.
+			bigData.Close()
+			// Never store a nil metadata, lookups would dereference it; recreate the cache instead.
+			if err == nil && metadata != nil {
+				c.addLayer(r.ID, metadata)
+				continue
+			}
+			logrus.Warningf("Error reading cache file for layer %q: %v", r.ID, err)
+		} else if errors.Cause(err) != os.ErrNotExist {
+			return err
+		}
+
+		// otherwise create it from the layer TOC.
+		manifestReader, err := c.store.LayerBigData(r.ID, bigDataKey)
+		if err != nil {
+			continue
+		}
+		manifest, err := ioutil.ReadAll(manifestReader)
+		manifestReader.Close()
+		if err != nil {
+			return fmt.Errorf("open manifest file for layer %q: %w", r.ID, err)
+		}
+		metadata, err := writeCache(manifest, r.ID, c.store)
+		if err == nil {
+			c.addLayer(r.ID, metadata)
+		}
+	}
+
+	var newLayers []layer
+	for _, l := range c.layers {
+		if _, found := currentLayers[l.id]; found {
+			newLayers = append(newLayers, l)
+		}
+	}
+	c.layers = newLayers
+	return nil
+}
+
+// calculateHardLinkFingerprint calculates a hash that can be used to verify if a file
+// is usable for deduplication with hardlinks.
+// To calculate the digest, it uses the file payload digest, UID, GID, mode and xattrs.
+func calculateHardLinkFingerprint(f *internal.FileMetadata) (string, error) {
+	digester := digest.Canonical.Digester()
+
+	modeString := fmt.Sprintf("%d:%d:%o", f.UID, f.GID, f.Mode) // UID:GID:mode, with the mode in octal
+	hash := digester.Hash()
+
+	if _, err := hash.Write([]byte(f.Digest)); err != nil {
+		return "", err
+	}
+
+	if _, err := hash.Write([]byte(modeString)); err != nil {
+		return "", err
+	}
+
+	if len(f.Xattrs) > 0 {
+		keys := make([]string, 0, len(f.Xattrs))
+		for k := range f.Xattrs {
+			keys = append(keys, k)
+		}
+		sort.Strings(keys) // map iteration order is not deterministic; sort so that the fingerprint is stable
+
+		for _, k := range keys {
+			if _, err := hash.Write([]byte(k)); err != nil {
+				return "", err
+			}
+			if _, err := hash.Write([]byte(f.Xattrs[k])); err != nil {
+				return "", err
+			}
+		}
+	}
+	return string(digester.Digest()), nil
+}
+
+// generateFileLocation encodes the position offset inside the file path as $OFFSET@$PATH
+func generateFileLocation(path string, offset uint64) []byte {
+	return []byte(strconv.FormatUint(offset, 10) + "@" + path)
+}
+
+// generateTag generates a tag in the form $DIGEST$OFFSET@$LEN, where $OFFSET and $LEN are
+// zero-padded to 20 digits.  [OFFSET; LEN] points to the variable length data where the file
+// locations are stored.  $DIGEST has length digestLen stored in the metadata file header.
+func generateTag(digest string, offset, len uint64) string {
+	return fmt.Sprintf("%s%.20d@%.20d", digest, offset, len)
+}
+
+type setBigData interface {
+	// SetLayerBigData stores a (possibly large) chunk of named data
+	SetLayerBigData(id, key string, data io.Reader) error
+}
+
+// writeCache writes a cache for the layer ID.
+// It generates a sorted list of digests with their offset to the path location and offset.
+// The same cache is used to look up files, chunks and candidates for deduplication with hard links.
+// There are 3 kinds of digests stored:
+// - digest(file.payload)
+// - digest(digest(file.payload) + file.UID + file.GID + file.mode + file.xattrs)
+// - digest(i) for each i in chunks(file.payload)
+func writeCache(manifest []byte, id string, dest setBigData) (*metadata, error) {
+	var vdata bytes.Buffer
+	tagLen := 0
+	digestLen := 0
+	var tagsBuffer bytes.Buffer
+
+	toc, err := prepareMetadata(manifest)
+	if err != nil {
+		return nil, err
+	}
+
+	var tags []string
+	for _, k := range toc {
+		if k.Digest != "" {
+			location := generateFileLocation(k.Name, 0)
+
+			off := uint64(vdata.Len())
+			l := uint64(len(location))
+
+			d := generateTag(k.Digest, off, l)
+			if tagLen == 0 {
+				tagLen = len(d)
+			}
+			if tagLen != len(d) {
+				return nil, errors.New("digest with different length found")
+			}
+			tags = append(tags, d)
+
+			fp, err := calculateHardLinkFingerprint(k)
+			if err != nil {
+				return nil, err
+			}
+			d = generateTag(fp, off, l)
+			if tagLen != len(d) {
+				return nil, errors.New("digest with different length found")
+			}
+			tags = append(tags, d)
+
+			if _, err := vdata.Write(location); err != nil {
+				return nil, err
+			}
+
+			digestLen = len(k.Digest)
+		}
+		if k.ChunkDigest != "" {
+			location := generateFileLocation(k.Name, uint64(k.ChunkOffset))
+			off := uint64(vdata.Len())
+			l := uint64(len(location))
+			d := generateTag(k.ChunkDigest, off, l)
+			if tagLen == 0 {
+				tagLen = len(d)
+			}
+			if tagLen != len(d) {
+				return nil, errors.New("digest with different length found")
+			}
+			tags = append(tags, d)
+
+			if _, err := vdata.Write(location); err != nil {
+				return nil, err
+			}
+			digestLen = len(k.ChunkDigest)
+		}
+	}
+
+	sort.Strings(tags)
+
+	for _, t := range tags {
+		if _, err := tagsBuffer.Write([]byte(t)); err != nil {
+			return nil, err
+		}
+	}
+
+	pipeReader, pipeWriter := io.Pipe()
+	errChan := make(chan error, 1)
+	go func() {
+		defer pipeWriter.Close()
+		defer close(errChan)
+
+		// version
+		if err := binary.Write(pipeWriter, binary.LittleEndian, uint64(cacheVersion)); err != nil {
+			errChan <- err
+			return
+		}
+
+		// len of a tag
+		if err := binary.Write(pipeWriter, binary.LittleEndian, uint64(tagLen)); err != nil {
+			errChan <- err
+			return
+		}
+
+		// len of a digest
+		if err := binary.Write(pipeWriter, binary.LittleEndian, uint64(digestLen)); err != nil {
+			errChan <- err
+			return
+		}
+
+		// tags length
+		if err := binary.Write(pipeWriter, binary.LittleEndian, uint64(tagsBuffer.Len())); err != nil {
+			errChan <- err
+			return
+		}
+
+		// vdata length
+		if err := binary.Write(pipeWriter, binary.LittleEndian, uint64(vdata.Len())); err != nil {
+			errChan <- err
+			return
+		}
+
+		// tags
+		if _, err := pipeWriter.Write(tagsBuffer.Bytes()); err != nil {
+			errChan <- err
+			return
+		}
+
+		// variable length data
+		if _, err := pipeWriter.Write(vdata.Bytes()); err != nil {
+			errChan <- err
+			return
+		}
+
+		errChan <- nil
+	}()
+	defer pipeReader.Close()
+
+	counter := ioutils.NewWriteCounter(ioutil.Discard)
+
+	r := io.TeeReader(pipeReader, counter)
+
+	if err := dest.SetLayerBigData(id, cacheKey, r); err != nil {
+		return nil, err
+	}
+
+	if err := <-errChan; err != nil {
+		return nil, err
+	}
+
+	logrus.Debugf("Written lookaside cache for layer %q with length %v", id, counter.Count)
+
+	return &metadata{
+		digestLen: digestLen,
+		tagLen:    tagLen,
+		tags:      tagsBuffer.Bytes(),
+		vdata:     vdata.Bytes(),
+	}, nil
+}
+
+// readMetadataFromCache parses a lookaside cache in the format written by writeCache: five
+// little-endian uint64 values (version, tag length, digest length, tags size, vdata size)
+// followed by the tags and by the variable length data.
+// It returns an error if the version is not cacheVersion or if the data is truncated.
+func readMetadataFromCache(bigData io.Reader) (*metadata, error) {
+	var version, tagLen, digestLen, tagsLen, vdataLen uint64
+	if err := binary.Read(bigData, binary.LittleEndian, &version); err != nil {
+		return nil, err
+	}
+	if version != cacheVersion {
+		// Do not return (nil, nil): a caller that only checks the error would use a nil metadata.
+		return nil, fmt.Errorf("unsupported cache version %d", version)
+	}
+	for _, field := range []*uint64{&tagLen, &digestLen, &tagsLen, &vdataLen} {
+		if err := binary.Read(bigData, binary.LittleEndian, field); err != nil {
+			return nil, err
+		}
+	}
+
+	// A single Read call may return fewer bytes than requested without an error;
+	// io.ReadFull keeps reading and fails if the data is truncated.
+	tags := make([]byte, tagsLen)
+	if _, err := io.ReadFull(bigData, tags); err != nil {
+		return nil, err
+	}
+
+	vdata := make([]byte, vdataLen)
+	if _, err := io.ReadFull(bigData, vdata); err != nil {
+		return nil, err
+	}
+
+	return &metadata{
+		tagLen:    int(tagLen),
+		digestLen: int(digestLen),
+		tags:      tags,
+		vdata:     vdata,
+	}, nil
+}
+
+// prepareMetadata returns the TOC entries with a file digest plus, among the others, one entry per chunk digest.
+func prepareMetadata(manifest []byte) ([]*internal.FileMetadata, error) {
+	toc, err := unmarshalToc(manifest)
+	if err != nil {
+		// ignore errors here.  They might be caused by a different manifest format.
+		return nil, nil
+	}
+	var r []*internal.FileMetadata
+	chunkSeen := make(map[string]bool)
+	for i := range toc.Entries {
+		d := toc.Entries[i].Digest
+		if d != "" {
+			r = append(r, &toc.Entries[i])
+			continue
+		}
+
+		// chunks do not use hard link dedup so keeping just one candidate is enough
+		cd := toc.Entries[i].ChunkDigest
+		if cd != "" && !chunkSeen[cd] {
+			r = append(r, &toc.Entries[i])
+			chunkSeen[cd] = true
+		}
+	}
+	return r, nil
+}
+
+// addLayer appends the layer id, with its metadata and differ target, to c.layers; it does not lock c.mutex.
+func (c *layersCache) addLayer(id string, metadata *metadata) error {
+	target, err := c.store.DifferTarget(id)
+	if err != nil {
+		return fmt.Errorf("get checkout directory layer %q: %w", id, err)
+	}
+	l := layer{
+		id:       id,
+		metadata: metadata,
+		target:   target,
+	}
+	c.layers = append(c.layers, l)
+	return nil
+}
+
+func byteSliceAsString(b []byte) string {
+	return *(*string)(unsafe.Pointer(&b)) // reinterprets the slice header as a string: no copy, the result aliases b
+}
+
+// findTag binary-searches the sorted tags; it returns digest with the offset and length of its vdata location, or "", 0, 0 if digest is missing or its tag is malformed.
+func findTag(digest string, metadata *metadata) (string, uint64, uint64) {
+	if metadata == nil || metadata.tagLen <= 0 || len(digest) != metadata.digestLen {
+		return "", 0, 0 // nothing to search; this also avoids dividing by a zero tagLen
+	}
+	tagLen, tags := metadata.tagLen, metadata.tags
+	nElements := len(tags) / tagLen
+	i := sort.Search(nElements, func(i int) bool {
+		return byteSliceAsString(tags[i*tagLen:i*tagLen+len(digest)]) >= digest
+	})
+	if i >= nElements || byteSliceAsString(tags[i*tagLen:i*tagLen+len(digest)]) != digest {
+		return "", 0, 0
+	}
+	parts := strings.SplitN(string(tags[i*tagLen+len(digest):(i+1)*tagLen]), "@", 2)
+	if len(parts) == 2 {
+		off, errOff := strconv.ParseUint(parts[0], 10, 64)
+		length, errLen := strconv.ParseUint(parts[1], 10, 64)
+		if errOff == nil && errLen == nil {
+			return digest, off, length
+		}
+	}
+	return "", 0, 0
+}
+
+// findDigestInternal searches every cached layer for digest and returns the layer target, the file
+// path and the offset in that file (locations are $OFFSET@$PATH); "", "", -1 means not found.
+func (c *layersCache) findDigestInternal(digest string) (string, string, int64, error) {
+	if digest == "" {
+		return "", "", -1, nil
+	}
+	c.mutex.RLock()
+	defer c.mutex.RUnlock()
+	for _, layer := range c.layers {
+		found, off, length := findTag(digest, layer.metadata)
+		if found == "" || off+length < off || off+length > uint64(len(layer.metadata.vdata)) {
+			continue // not in this layer, or the tag points outside vdata (corrupted cache)
+		}
+		parts := strings.SplitN(string(layer.metadata.vdata[off:off+length]), "@", 2)
+		if offFile, err := strconv.ParseInt(parts[0], 10, 64); err == nil && len(parts) == 2 {
+			return layer.target, parts[1], offFile, nil
+		}
+	}
+	return "", "", -1, nil
+}
+
+// findFileInOtherLayers looks for file in the cached layers, by its payload digest or, if
+// useHardLinks is set, by its hard link fingerprint.  It returns the layer target and the path.
+func (c *layersCache) findFileInOtherLayers(file *internal.FileMetadata, useHardLinks bool) (string, string, error) {
+	digest := file.Digest
+	if useHardLinks {
+		var err error
+		digest, err = calculateHardLinkFingerprint(file)
+		if err != nil {
+			return "", "", err
+		}
+	}
+	target, name, off, err := c.findDigestInternal(digest)
+	if off == 0 { // only a location at offset 0 is accepted; -1 means that nothing was found
+		return target, name, err
+	}
+	return "", "", nil
+}
+
+func (c *layersCache) findChunkInOtherLayers(chunk *internal.FileMetadata) (string, string, int64, error) {
+	return c.findDigestInternal(chunk.ChunkDigest)
+}
+
+func unmarshalToc(manifest []byte) (*internal.TOC, error) {
+	var buf bytes.Buffer
+	count := 0
+	var toc internal.TOC
+
+	iter := jsoniter.ParseBytes(jsoniter.ConfigFastest, manifest)
+	for field := iter.ReadObject(); field != ""; field = iter.ReadObject() {
+		if field != "entries" {
+			iter.Skip()
+			continue
+		}
+		for iter.ReadArray() {
+			for field := iter.ReadObject(); field != ""; field = iter.ReadObject() {
+				switch field {
+				case "type", "name", "linkName", "digest", "chunkDigest", "chunkType":
+					count += len(iter.ReadStringAsSlice())
+				case "xattrs":
+					for key := iter.ReadObject(); key != ""; key = iter.ReadObject() {
+						count += len(iter.ReadStringAsSlice())
+					}
+				default:
+					iter.Skip()
+				}
+			}
+		}
+		break
+	}
+
+	buf.Grow(count)
+
+	getString := func(b []byte) string {
+		from := buf.Len()
+		buf.Write(b)
+		to := buf.Len()
+		return byteSliceAsString(buf.Bytes()[from:to])
+	}
+
+	iter = jsoniter.ParseBytes(jsoniter.ConfigFastest, manifest)
+	for field := iter.ReadObject(); field != ""; field = iter.ReadObject() {
+		if field == "version" {
+			toc.Version = iter.ReadInt()
+			continue
+		}
+		if field != "entries" {
+			iter.Skip()
+			continue
+		}
+		for iter.ReadArray() {
+			var m internal.FileMetadata
+			for field := iter.ReadObject(); field != ""; field = iter.ReadObject() {
+				switch field {
+				case "type":
+					m.Type = getString(iter.ReadStringAsSlice())
+				case "name":
+					m.Name = getString(iter.ReadStringAsSlice())
+				case "linkName":
+					m.Linkname = getString(iter.ReadStringAsSlice())
+				case "mode":
+					m.Mode = iter.ReadInt64()
+				case "size":
+					m.Size = iter.ReadInt64()
+				case "UID":
+					m.UID = iter.ReadInt()
+				case "GID":
+					m.GID = iter.ReadInt()
+				case "ModTime":
+					time, err := time.Parse(time.RFC3339, byteSliceAsString(iter.ReadStringAsSlice()))
+					if err != nil {
+						return nil, err
+					}
+					m.ModTime = &time
+				case "accesstime":
+					time, err := time.Parse(time.RFC3339, byteSliceAsString(iter.ReadStringAsSlice()))
+					if err != nil {
+						return nil, err
+					}
+					m.AccessTime = &time
+				case "changetime":
+					time, err := time.Parse(time.RFC3339, byteSliceAsString(iter.ReadStringAsSlice()))
+					if err != nil {
+						return nil, err
+					}
+					m.ChangeTime = &time
+				case "devMajor":
+					m.Devmajor = iter.ReadInt64()
+				case "devMinor":
+					m.Devminor = iter.ReadInt64()
+				case "digest":
+					m.Digest = getString(iter.ReadStringAsSlice())
+				case "offset":
+					m.Offset = iter.ReadInt64()
+				case "endOffset":
+					m.EndOffset = iter.ReadInt64()
+				case "chunkSize":
+					m.ChunkSize = iter.ReadInt64()
+				case "chunkOffset":
+					m.ChunkOffset = iter.ReadInt64()
+				case "chunkDigest":
+					m.ChunkDigest = getString(iter.ReadStringAsSlice())
+				case "chunkType":
+					m.ChunkType = getString(iter.ReadStringAsSlice())
+				case "xattrs":
+					m.Xattrs = make(map[string]string)
+					for key := iter.ReadObject(); key != ""; key = iter.ReadObject() {
+						value := iter.ReadStringAsSlice()
+						m.Xattrs[key] = getString(value)
+					}
+				default:
+					iter.Skip()
+				}
+			}
+			toc.Entries = append(toc.Entries, m)
+		}
+		break
+	}
+	toc.StringsBuf = buf
+	return &toc, nil
+}
diff --git a/vendor/github.com/containers/storage/pkg/chunked/compressor/compressor.go b/vendor/github.com/containers/storage/pkg/chunked/compressor/compressor.go
index 092cf584af3..aeb7cfd4f01 100644
--- a/vendor/github.com/containers/storage/pkg/chunked/compressor/compressor.go
+++ b/vendor/github.com/containers/storage/pkg/chunked/compressor/compressor.go
@@ -5,6 +5,7 @@ package compressor
 // larger software like the graph drivers.
 
 import (
+	"bufio"
 	"encoding/base64"
 	"io"
 	"io/ioutil"
@@ -15,6 +16,189 @@ import (
 	"github.com/vbatts/tar-split/archive/tar"
 )
 
+const RollsumBits = 16
+const holesThreshold = int64(1 << 10)
+
+type holesFinder struct {
+	reader    *bufio.Reader // source of the bytes
+	fileOff   int64         // NOTE(review): not used by ReadByte; confirm whether it is still needed
+	zeros     int64         // consecutive zeros read from reader and not yet returned to the caller
+	from      int64         // NOTE(review): not used by ReadByte; confirm whether it is still needed
+	threshold int64         // minimum number of consecutive zeros that is reported as a hole
+
+	state int // one of the holesFinderState* constants
+}
+
+const (
+	holesFinderStateRead = iota
+	holesFinderStateAccumulate
+	holesFinderStateFound
+	holesFinderStateEOF
+)
+
+// ReadByte reads a single byte from the underlying reader.
+// If a single byte is read, the return value is (0, RAW-BYTE-VALUE, nil).
+// If there are at least f.threshold consecutive zeros, then the
+// return value is (N_CONSECUTIVE_ZEROS, '\x00', nil).
+func (f *holesFinder) ReadByte() (int64, byte, error) {
+	for {
+		switch f.state {
+		// reading the file stream
+		case holesFinderStateRead:
+			if f.zeros > 0 {
+				f.zeros--
+				return 0, 0, nil
+			}
+			b, err := f.reader.ReadByte()
+			if err != nil {
+				return 0, b, err
+			}
+
+			if b != 0 {
+				return 0, b, err
+			}
+
+			f.zeros = 1
+			if f.zeros == f.threshold {
+				f.state = holesFinderStateFound
+			} else {
+				f.state = holesFinderStateAccumulate
+			}
+		// accumulating zeros, but still didn't reach the threshold
+		case holesFinderStateAccumulate:
+			b, err := f.reader.ReadByte()
+			if err != nil {
+				if err == io.EOF {
+					f.state = holesFinderStateEOF
+					continue
+				}
+				return 0, b, err
+			}
+
+			if b == 0 {
+				f.zeros++
+				if f.zeros == f.threshold {
+					f.state = holesFinderStateFound
+				}
+			} else {
+				if err := f.reader.UnreadByte(); err != nil { // check the UnreadByte result, not the stale ReadByte error
+					return 0, 0, err
+				}
+				f.state = holesFinderStateRead
+			}
+		// found a hole.  Number of zeros >= threshold
+		case holesFinderStateFound:
+			b, err := f.reader.ReadByte()
+			if err != nil {
+				if err == io.EOF {
+					f.state = holesFinderStateEOF
+				}
+				holeLen := f.zeros
+				f.zeros = 0
+				return holeLen, 0, nil
+			}
+			if b != 0 {
+				if err := f.reader.UnreadByte(); err != nil { // check the UnreadByte result, not the stale ReadByte error
+					return 0, 0, err
+				}
+				f.state = holesFinderStateRead
+
+				holeLen := f.zeros
+				f.zeros = 0
+				return holeLen, 0, nil
+			}
+			f.zeros++
+		// reached EOF.  Flush pending zeros if any.
+		case holesFinderStateEOF:
+			if f.zeros > 0 {
+				f.zeros--
+				return 0, 0, nil
+			}
+			return 0, 0, io.EOF
+		}
+	}
+}
+
+type rollingChecksumReader struct {
+	reader      *holesFinder // source of the bytes and of the detected holes
+	closed      bool         // set once reader has reported io.EOF
+	rollsum     *RollSum     // rolling checksum that decides where a data chunk is split
+	pendingHole int64        // zeros of a detected hole that Read has not returned yet
+
+	// WrittenOut is the total number of bytes read from
+	// the stream.
+	WrittenOut int64
+
+	// IsLastChunkZeros tells whether the last generated
+	// chunk is a hole (made of consecutive zeros).  If it
+	// is false, then the last chunk is a data chunk
+	// generated by the rolling checksum.
+	IsLastChunkZeros bool
+}
+
+// Read fills b and returns whether the current chunk must be terminated, the number of bytes
+// stored in b and an error.  A hole reported by the holesFinder is returned as zeros by the next calls.
+func (rc *rollingChecksumReader) Read(b []byte) (bool, int, error) {
+	rc.IsLastChunkZeros = false
+
+	if rc.pendingHole > 0 {
+		toCopy := int64(len(b))
+		if rc.pendingHole < toCopy {
+			toCopy = rc.pendingHole
+		}
+		rc.pendingHole -= toCopy
+		for i := int64(0); i < toCopy; i++ {
+			b[i] = 0
+		}
+		rc.WrittenOut += toCopy
+		rc.IsLastChunkZeros = true
+
+		// if there are no other zeros left, terminate the chunk
+		return rc.pendingHole == 0, int(toCopy), nil
+	}
+
+	if rc.closed {
+		return false, 0, io.EOF
+	}
+
+	for i := 0; i < len(b); i++ {
+		holeLen, n, err := rc.reader.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				rc.closed = true
+				if i == 0 {
+					return false, 0, err
+				}
+				return false, i, nil
+			}
+			// Report any other error type
+			return false, -1, err
+		}
+		if holeLen > 0 {
+			for j := int64(0); j < holeLen; j++ {
+				rc.rollsum.Roll(0)
+			}
+			rc.pendingHole = holeLen
+			return true, i, nil
+		}
+		b[i] = n
+		rc.WrittenOut++
+		rc.rollsum.Roll(n)
+		if rc.rollsum.OnSplitWithBits(RollsumBits) {
+			return true, i + 1, nil
+		}
+	}
+	return false, len(b), nil
+}
+
+type chunk struct {
+	ChunkOffset int64  // offset of the chunk in the file payload
+	Offset      int64  // offset returned by restartCompression for the chunk; presumably its position in the output, TODO confirm
+	Checksum    string // digest of the chunk payload
+	ChunkSize   int64  // number of payload bytes in the chunk
+	ChunkType   string // internal.ChunkTypeData or internal.ChunkTypeZeros
+}
+
 func writeZstdChunkedStream(destFile io.Writer, outMetadata map[string]string, reader io.Reader, level int) error {
 	// total written so far.  Used to retrieve partial offsets in the file
 	dest := ioutils.NewWriteCounter(destFile)
@@ -64,40 +248,78 @@ func writeZstdChunkedStream(destFile io.Writer, outMetadata map[string]string, r
 		if _, err := zstdWriter.Write(rawBytes); err != nil {
 			return err
 		}
-		payloadDigester := digest.Canonical.Digester()
-		payloadChecksum := payloadDigester.Hash()
 
-		payloadDest := io.MultiWriter(payloadChecksum, zstdWriter)
+		payloadDigester := digest.Canonical.Digester()
+		chunkDigester := digest.Canonical.Digester()
 
 		// Now handle the payload, if any
-		var startOffset, endOffset int64
+		startOffset := int64(0)
+		lastOffset := int64(0)
+		lastChunkOffset := int64(0)
+
 		checksum := ""
+
+		chunks := []chunk{}
+
+		hf := &holesFinder{
+			threshold: holesThreshold,
+			reader:    bufio.NewReader(tr),
+		}
+
+		rcReader := &rollingChecksumReader{
+			reader:  hf,
+			rollsum: NewRollSum(),
+		}
+
+		payloadDest := io.MultiWriter(payloadDigester.Hash(), chunkDigester.Hash(), zstdWriter)
 		for {
-			read, errRead := tr.Read(buf)
+			mustSplit, read, errRead := rcReader.Read(buf)
 			if errRead != nil && errRead != io.EOF {
 				return err
 			}
-
-			// restart the compression only if there is
-			// a payload.
+			// restart the compression only if there is a payload.
 			if read > 0 {
 				if startOffset == 0 {
 					startOffset, err = restartCompression()
 					if err != nil {
 						return err
 					}
+					lastOffset = startOffset
+				}
+
+				if _, err := payloadDest.Write(buf[:read]); err != nil {
+					return err
 				}
-				_, err := payloadDest.Write(buf[:read])
+			}
+			if (mustSplit || errRead == io.EOF) && startOffset > 0 {
+				off, err := restartCompression()
 				if err != nil {
 					return err
 				}
+
+				chunkSize := rcReader.WrittenOut - lastChunkOffset
+				if chunkSize > 0 {
+					chunkType := internal.ChunkTypeData
+					if rcReader.IsLastChunkZeros {
+						chunkType = internal.ChunkTypeZeros
+					}
+
+					chunks = append(chunks, chunk{
+						ChunkOffset: lastChunkOffset,
+						Offset:      lastOffset,
+						Checksum:    chunkDigester.Digest().String(),
+						ChunkSize:   chunkSize,
+						ChunkType:   chunkType,
+					})
+				}
+
+				lastOffset = off
+				lastChunkOffset = rcReader.WrittenOut
+				chunkDigester = digest.Canonical.Digester()
+				payloadDest = io.MultiWriter(payloadDigester.Hash(), chunkDigester.Hash(), zstdWriter)
 			}
 			if errRead == io.EOF {
 				if startOffset > 0 {
-					endOffset, err = restartCompression()
-					if err != nil {
-						return err
-					}
 					checksum = payloadDigester.Digest().String()
 				}
 				break
@@ -112,30 +334,42 @@ func writeZstdChunkedStream(destFile io.Writer, outMetadata map[string]string, r
 		for k, v := range hdr.Xattrs {
 			xattrs[k] = base64.StdEncoding.EncodeToString([]byte(v))
 		}
-		m := internal.FileMetadata{
-			Type:       typ,
-			Name:       hdr.Name,
-			Linkname:   hdr.Linkname,
-			Mode:       hdr.Mode,
-			Size:       hdr.Size,
-			UID:        hdr.Uid,
-			GID:        hdr.Gid,
-			ModTime:    hdr.ModTime,
-			AccessTime: hdr.AccessTime,
-			ChangeTime: hdr.ChangeTime,
-			Devmajor:   hdr.Devmajor,
-			Devminor:   hdr.Devminor,
-			Xattrs:     xattrs,
-			Digest:     checksum,
-			Offset:     startOffset,
-			EndOffset:  endOffset,
-
-			// ChunkSize is 0 for the last chunk
-			ChunkSize:   0,
-			ChunkOffset: 0,
-			ChunkDigest: checksum,
-		}
-		metadata = append(metadata, m)
+		entries := []internal.FileMetadata{
+			{
+				Type:       typ,
+				Name:       hdr.Name,
+				Linkname:   hdr.Linkname,
+				Mode:       hdr.Mode,
+				Size:       hdr.Size,
+				UID:        hdr.Uid,
+				GID:        hdr.Gid,
+				ModTime:    &hdr.ModTime,
+				AccessTime: &hdr.AccessTime,
+				ChangeTime: &hdr.ChangeTime,
+				Devmajor:   hdr.Devmajor,
+				Devminor:   hdr.Devminor,
+				Xattrs:     xattrs,
+				Digest:     checksum,
+				Offset:     startOffset,
+				EndOffset:  lastOffset,
+			},
+		}
+		for i := 1; i < len(chunks); i++ {
+			entries = append(entries, internal.FileMetadata{
+				Type:        internal.TypeChunk,
+				Name:        hdr.Name,
+				ChunkOffset: chunks[i].ChunkOffset,
+			})
+		}
+		if len(chunks) > 1 {
+			for i := range chunks {
+				entries[i].ChunkSize = chunks[i].ChunkSize
+				entries[i].Offset = chunks[i].Offset
+				entries[i].ChunkDigest = chunks[i].Checksum
+				entries[i].ChunkType = chunks[i].ChunkType
+			}
+		}
+		metadata = append(metadata, entries...)
 	}
 
 	rawBytes := tr.RawBytes()
@@ -212,7 +446,7 @@ func zstdChunkedWriterWithLevel(out io.Writer, metadata map[string]string, level
 // ZstdCompressor is a CompressorFunc for the zstd compression algorithm.
 func ZstdCompressor(r io.Writer, metadata map[string]string, level *int) (io.WriteCloser, error) {
 	if level == nil {
-		l := 3
+		l := 10
 		level = &l
 	}
 
diff --git a/vendor/github.com/containers/storage/pkg/chunked/compressor/rollsum.go b/vendor/github.com/containers/storage/pkg/chunked/compressor/rollsum.go
new file mode 100644
index 00000000000..f4dfad822e9
--- /dev/null
+++ b/vendor/github.com/containers/storage/pkg/chunked/compressor/rollsum.go
@@ -0,0 +1,81 @@
+/*
+Copyright 2011 The Perkeep Authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package rollsum implements rolling checksums similar to apenwarr's bup, which
+// is similar to librsync.
+//
+// The bup project is at https://github.com/apenwarr/bup and its splitting in
+// particular is at https://github.com/apenwarr/bup/blob/master/lib/bup/bupsplit.c
+package compressor
+
+import (
+	"math/bits"
+)
+
+const windowSize = 64 // Roll assumes windowSize is a power of 2
+const charOffset = 31
+
+const blobBits = 13
+const blobSize = 1 << blobBits // 8k
+
+type RollSum struct {
+	s1, s2 uint32
+	window [windowSize]uint8
+	wofs   int
+}
+
+func NewRollSum() *RollSum {
+	return &RollSum{
+		s1: windowSize * charOffset,
+		s2: windowSize * (windowSize - 1) * charOffset,
+	}
+}
+
+func (rs *RollSum) add(drop, add uint32) {
+	s1 := rs.s1 + add - drop
+	rs.s1 = s1
+	rs.s2 += s1 - uint32(windowSize)*(drop+charOffset)
+}
+
+// Roll adds ch to the rolling sum.
+func (rs *RollSum) Roll(ch byte) {
+	wp := &rs.window[rs.wofs]
+	rs.add(uint32(*wp), uint32(ch))
+	*wp = ch
+	rs.wofs = (rs.wofs + 1) & (windowSize - 1)
+}
+
+// OnSplit reports whether the blobBits (13) lowest bits of the s2
+// component of the current checksum are all set to one.
+func (rs *RollSum) OnSplit() bool {
+	return (rs.s2 & (blobSize - 1)) == ((^0) & (blobSize - 1))
+}
+
+// OnSplitWithBits reports whether the n lowest bits of the s2
+// component of the current checksum are all set to one.
+func (rs *RollSum) OnSplitWithBits(n uint32) bool {
+	mask := (uint32(1) << n) - 1
+	return rs.s2&mask == (^uint32(0))&mask
+}
+
+func (rs *RollSum) Bits() int {
+	rsum := rs.Digest() >> (blobBits + 1)
+	return blobBits + bits.TrailingZeros32(^rsum)
+}
+
+func (rs *RollSum) Digest() uint32 {
+	return (rs.s1 << 16) | (rs.s2 & 0xffff)
+}
diff --git a/vendor/github.com/containers/storage/pkg/chunked/internal/compression.go b/vendor/github.com/containers/storage/pkg/chunked/internal/compression.go
index c91c43d85cd..3bb5286d92d 100644
--- a/vendor/github.com/containers/storage/pkg/chunked/internal/compression.go
+++ b/vendor/github.com/containers/storage/pkg/chunked/internal/compression.go
@@ -8,11 +8,11 @@ import (
 	"archive/tar"
 	"bytes"
 	"encoding/binary"
-	"encoding/json"
 	"fmt"
 	"io"
 	"time"
 
+	jsoniter "github.com/json-iterator/go"
 	"github.com/klauspost/compress/zstd"
 	"github.com/opencontainers/go-digest"
 )
@@ -20,6 +20,9 @@ import (
 type TOC struct {
 	Version int            `json:"version"`
 	Entries []FileMetadata `json:"entries"`
+
+	// internal: used by unmarshalToc
+	StringsBuf bytes.Buffer `json:"-"`
 }
 
 type FileMetadata struct {
@@ -27,25 +30,33 @@ type FileMetadata struct {
 	Name       string            `json:"name"`
 	Linkname   string            `json:"linkName,omitempty"`
 	Mode       int64             `json:"mode,omitempty"`
-	Size       int64             `json:"size"`
-	UID        int               `json:"uid"`
-	GID        int               `json:"gid"`
-	ModTime    time.Time         `json:"modtime"`
-	AccessTime time.Time         `json:"accesstime"`
-	ChangeTime time.Time         `json:"changetime"`
-	Devmajor   int64             `json:"devMajor"`
-	Devminor   int64             `json:"devMinor"`
+	Size       int64             `json:"size,omitempty"`
+	UID        int               `json:"uid,omitempty"`
+	GID        int               `json:"gid,omitempty"`
+	ModTime    *time.Time        `json:"modtime,omitempty"`
+	AccessTime *time.Time        `json:"accesstime,omitempty"`
+	ChangeTime *time.Time        `json:"changetime,omitempty"`
+	Devmajor   int64             `json:"devMajor,omitempty"`
+	Devminor   int64             `json:"devMinor,omitempty"`
 	Xattrs     map[string]string `json:"xattrs,omitempty"`
 	Digest     string            `json:"digest,omitempty"`
 	Offset     int64             `json:"offset,omitempty"`
 	EndOffset  int64             `json:"endOffset,omitempty"`
 
-	// Currently chunking is not supported.
 	ChunkSize   int64  `json:"chunkSize,omitempty"`
 	ChunkOffset int64  `json:"chunkOffset,omitempty"`
 	ChunkDigest string `json:"chunkDigest,omitempty"`
+	ChunkType   string `json:"chunkType,omitempty"`
+
+	// internal: computed by mergeTOCEntries.
+	Chunks []*FileMetadata `json:"-"`
 }
 
+const (
+	ChunkTypeData  = ""
+	ChunkTypeZeros = "zeros"
+)
+
 const (
 	TypeReg     = "reg"
 	TypeChunk   = "chunk"
@@ -123,6 +134,7 @@ func WriteZstdChunkedManifest(dest io.Writer, outMetadata map[string]string, off
 		Entries: metadata,
 	}
 
+	var json = jsoniter.ConfigCompatibleWithStandardLibrary
 	// Generate the manifest
 	manifest, err := json.Marshal(toc)
 	if err != nil {
diff --git a/vendor/github.com/containers/storage/pkg/chunked/storage_linux.go b/vendor/github.com/containers/storage/pkg/chunked/storage_linux.go
index 6efc6a4c845..9434499d2d9 100644
--- a/vendor/github.com/containers/storage/pkg/chunked/storage_linux.go
+++ b/vendor/github.com/containers/storage/pkg/chunked/storage_linux.go
@@ -4,8 +4,8 @@ import (
 	archivetar "archive/tar"
 	"context"
 	"encoding/base64"
-	"encoding/json"
 	"fmt"
+	"hash"
 	"io"
 	"io/ioutil"
 	"os"
@@ -13,6 +13,8 @@ import (
 	"reflect"
 	"sort"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -25,6 +27,7 @@ import (
 	"github.com/containers/storage/pkg/idtools"
 	"github.com/containers/storage/pkg/system"
 	"github.com/containers/storage/types"
+	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/klauspost/compress/zstd"
 	"github.com/klauspost/pgzip"
 	digest "github.com/opencontainers/go-digest"
@@ -41,24 +44,35 @@ const (
 	bigDataKey              = "zstd-chunked-manifest"
 
 	fileTypeZstdChunked = iota
-	fileTypeEstargz     = iota
+	fileTypeEstargz
+	fileTypeNoCompression
+	fileTypeHole
+
+	copyGoRoutines = 32
 )
 
 type compressedFileType int
 
 type chunkedDiffer struct {
-	stream         ImageSourceSeekable
-	manifest       []byte
-	layersMetadata map[string][]internal.FileMetadata
-	layersTarget   map[string]string
-	tocOffset      int64
-	fileType       compressedFileType
+	stream      ImageSourceSeekable
+	manifest    []byte
+	layersCache *layersCache
+	tocOffset   int64
+	fileType    compressedFileType
+
+	copyBuffer []byte
 
 	gzipReader *pgzip.Reader
+	zstdReader *zstd.Decoder
+	rawReader  io.Reader
+}
+
+var xattrsToIgnore = map[string]interface{}{
+	"security.selinux": true,
 }
 
-func timeToTimespec(time time.Time) (ts unix.Timespec) {
-	if time.IsZero() {
+func timeToTimespec(time *time.Time) (ts unix.Timespec) {
+	if time == nil || time.IsZero() {
 		// Return UTIME_OMIT special value
 		ts.Sec = 0
 		ts.Nsec = ((1 << 30) - 2)
@@ -67,11 +81,29 @@ func timeToTimespec(time time.Time) (ts unix.Timespec) {
 	return unix.NsecToTimespec(time.UnixNano())
 }
 
+func doHardLink(srcFd int, destDirFd int, destBase string) error {
+	doLink := func() error {
+		// Using unix.AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH while this variant that uses
+		// /proc/self/fd doesn't and can be used with rootless.
+		srcPath := fmt.Sprintf("/proc/self/fd/%d", srcFd)
+		return unix.Linkat(unix.AT_FDCWD, srcPath, destDirFd, destBase, unix.AT_SYMLINK_FOLLOW)
+	}
+
+	err := doLink()
+
+	// if the destination exists, unlink it first and try again
+	if err != nil && os.IsExist(err) {
+		unix.Unlinkat(destDirFd, destBase, 0)
+		return doLink()
+	}
+	return err
+}
+
 func copyFileContent(srcFd int, destFile string, dirfd int, mode os.FileMode, useHardLinks bool) (*os.File, int64, error) {
 	src := fmt.Sprintf("/proc/self/fd/%d", srcFd)
 	st, err := os.Stat(src)
 	if err != nil {
-		return nil, -1, err
+		return nil, -1, fmt.Errorf("copy file content for %q: %w", destFile, err)
 	}
 
 	copyWithFileRange, copyWithFileClone := true, true
@@ -83,20 +115,7 @@ func copyFileContent(srcFd int, destFile string, dirfd int, mode os.FileMode, us
 		if err == nil {
 			defer destDir.Close()
 
-			doLink := func() error {
-				// Using unix.AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH while this variant that uses
-				// /proc/self/fd doesn't and can be used with rootless.
-				srcPath := fmt.Sprintf("/proc/self/fd/%d", srcFd)
-				return unix.Linkat(unix.AT_FDCWD, srcPath, int(destDir.Fd()), destBase, unix.AT_SYMLINK_FOLLOW)
-			}
-
-			err := doLink()
-
-			// if the destination exists, unlink it first and try again
-			if err != nil && os.IsExist(err) {
-				unix.Unlinkat(int(destDir.Fd()), destBase, 0)
-				err = doLink()
-			}
+			err := doHardLink(srcFd, int(destDir.Fd()), destBase)
 			if err == nil {
 				return nil, st.Size(), nil
 			}
@@ -106,63 +125,15 @@ func copyFileContent(srcFd int, destFile string, dirfd int, mode os.FileMode, us
 	// If the destination file already exists, we shouldn't blow it away
 	dstFile, err := openFileUnderRoot(destFile, dirfd, newFileFlags, mode)
 	if err != nil {
-		return nil, -1, err
+		return nil, -1, fmt.Errorf("open file %q under rootfs for copy: %w", destFile, err)
 	}
 
 	err = driversCopy.CopyRegularToFile(src, dstFile, st, &copyWithFileRange, &copyWithFileClone)
 	if err != nil {
 		dstFile.Close()
-		return nil, -1, err
-	}
-	return dstFile, st.Size(), err
-}
-
-func prepareOtherLayersCache(layersMetadata map[string][]internal.FileMetadata) map[string]map[string][]*internal.FileMetadata {
-	maps := make(map[string]map[string][]*internal.FileMetadata)
-
-	for layerID, v := range layersMetadata {
-		r := make(map[string][]*internal.FileMetadata)
-		for i := range v {
-			if v[i].Digest != "" {
-				r[v[i].Digest] = append(r[v[i].Digest], &v[i])
-			}
-		}
-		maps[layerID] = r
-	}
-	return maps
-}
-
-func getLayersCache(store storage.Store) (map[string][]internal.FileMetadata, map[string]string, error) {
-	allLayers, err := store.Layers()
-	if err != nil {
-		return nil, nil, err
-	}
-
-	layersMetadata := make(map[string][]internal.FileMetadata)
-	layersTarget := make(map[string]string)
-	for _, r := range allLayers {
-		manifestReader, err := store.LayerBigData(r.ID, bigDataKey)
-		if err != nil {
-			continue
-		}
-		defer manifestReader.Close()
-		manifest, err := ioutil.ReadAll(manifestReader)
-		if err != nil {
-			return nil, nil, err
-		}
-		var toc internal.TOC
-		if err := json.Unmarshal(manifest, &toc); err != nil {
-			continue
-		}
-		layersMetadata[r.ID] = toc.Entries
-		target, err := store.DifferTarget(r.ID)
-		if err != nil {
-			return nil, nil, err
-		}
-		layersTarget[r.ID] = target
+		return nil, -1, fmt.Errorf("copy to file %q under rootfs: %w", destFile, err)
 	}
-
-	return layersMetadata, layersTarget, nil
+	return dstFile, st.Size(), nil
 }
 
 // GetDiffer returns a differ than can be used with ApplyDiffWithDiffer.
@@ -179,67 +150,71 @@ func GetDiffer(ctx context.Context, store storage.Store, blobSize int64, annotat
 func makeZstdChunkedDiffer(ctx context.Context, store storage.Store, blobSize int64, annotations map[string]string, iss ImageSourceSeekable) (*chunkedDiffer, error) {
 	manifest, tocOffset, err := readZstdChunkedManifest(iss, blobSize, annotations)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("read zstd:chunked manifest: %w", err)
 	}
-	layersMetadata, layersTarget, err := getLayersCache(store)
+	layersCache, err := getLayersCache(store)
 	if err != nil {
 		return nil, err
 	}
 
 	return &chunkedDiffer{
-		stream:         iss,
-		manifest:       manifest,
-		layersMetadata: layersMetadata,
-		layersTarget:   layersTarget,
-		tocOffset:      tocOffset,
-		fileType:       fileTypeZstdChunked,
+		copyBuffer:  makeCopyBuffer(),
+		stream:      iss,
+		manifest:    manifest,
+		layersCache: layersCache,
+		tocOffset:   tocOffset,
+		fileType:    fileTypeZstdChunked,
 	}, nil
 }
 
 func makeEstargzChunkedDiffer(ctx context.Context, store storage.Store, blobSize int64, annotations map[string]string, iss ImageSourceSeekable) (*chunkedDiffer, error) {
 	manifest, tocOffset, err := readEstargzChunkedManifest(iss, blobSize, annotations)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("read zstd:chunked manifest: %w", err)
 	}
-	layersMetadata, layersTarget, err := getLayersCache(store)
+	layersCache, err := getLayersCache(store)
 	if err != nil {
 		return nil, err
 	}
 
 	return &chunkedDiffer{
-		stream:         iss,
-		manifest:       manifest,
-		layersMetadata: layersMetadata,
-		layersTarget:   layersTarget,
-		tocOffset:      tocOffset,
-		fileType:       fileTypeEstargz,
+		copyBuffer:  makeCopyBuffer(),
+		stream:      iss,
+		manifest:    manifest,
+		layersCache: layersCache,
+		tocOffset:   tocOffset,
+		fileType:    fileTypeEstargz,
 	}, nil
 }
 
+func makeCopyBuffer() []byte {
+	return make([]byte, 2<<20)
+}
+
 // copyFileFromOtherLayer copies a file from another layer
 // file is the file to look for.
 // source is the path to the source layer checkout.
-// otherFile contains the metadata for the file.
+// name is the path to the file to copy in source.
 // dirfd is an open file descriptor to the destination root directory.
 // useHardLinks defines whether the deduplication can be performed using hard links.
-func copyFileFromOtherLayer(file *internal.FileMetadata, source string, otherFile *internal.FileMetadata, dirfd int, useHardLinks bool) (bool, *os.File, int64, error) {
+func copyFileFromOtherLayer(file *internal.FileMetadata, source string, name string, dirfd int, useHardLinks bool) (bool, *os.File, int64, error) {
 	srcDirfd, err := unix.Open(source, unix.O_RDONLY, 0)
 	if err != nil {
-		return false, nil, 0, err
+		return false, nil, 0, fmt.Errorf("open source file: %w", err)
 	}
 	defer unix.Close(srcDirfd)
 
-	srcFile, err := openFileUnderRoot(otherFile.Name, srcDirfd, unix.O_RDONLY, 0)
+	srcFile, err := openFileUnderRoot(name, srcDirfd, unix.O_RDONLY, 0)
 	if err != nil {
-		return false, nil, 0, err
+		return false, nil, 0, fmt.Errorf("open source file under target rootfs: %w", err)
 	}
 	defer srcFile.Close()
 
 	dstFile, written, err := copyFileContent(int(srcFile.Fd()), file.Name, dirfd, 0, useHardLinks)
 	if err != nil {
-		return false, nil, 0, err
+		return false, nil, 0, fmt.Errorf("copy content to %q: %w", file.Name, err)
 	}
-	return true, dstFile, written, err
+	return true, dstFile, written, nil
 }
 
 // canDedupMetadataWithHardLink says whether it is possible to deduplicate file with otherFile.
@@ -275,10 +250,6 @@ func canDedupFileWithHardLink(file *internal.FileMetadata, fd int, s os.FileInfo
 		return false
 	}
 
-	xattrsToIgnore := map[string]interface{}{
-		"security.selinux": true,
-	}
-
 	xattrs := make(map[string]string)
 	for _, x := range listXattrs {
 		v, err := system.Lgetxattr(path, x)
@@ -301,45 +272,9 @@ func canDedupFileWithHardLink(file *internal.FileMetadata, fd int, s os.FileInfo
 	return canDedupMetadataWithHardLink(file, &otherFile)
 }
 
-// findFileInOtherLayers finds the specified file in other layers.
-// file is the file to look for.
-// dirfd is an open file descriptor to the checkout root directory.
-// layersMetadata contains the metadata for each layer in the storage.
-// layersTarget maps each layer to its checkout on disk.
-// useHardLinks defines whether the deduplication can be performed using hard links.
-func findFileInOtherLayers(file *internal.FileMetadata, dirfd int, layersMetadata map[string]map[string][]*internal.FileMetadata, layersTarget map[string]string, useHardLinks bool) (bool, *os.File, int64, error) {
-	// this is ugly, needs to be indexed
-	for layerID, checksums := range layersMetadata {
-		source, ok := layersTarget[layerID]
-		if !ok {
-			continue
-		}
-		files, found := checksums[file.Digest]
-		if !found {
-			continue
-		}
-		for _, candidate := range files {
-			// check if it is a valid candidate to dedup file
-			if useHardLinks && !canDedupMetadataWithHardLink(file, candidate) {
-				continue
-			}
-
-			found, dstFile, written, err := copyFileFromOtherLayer(file, source, candidate, dirfd, useHardLinks)
-			if found && err == nil {
-				return found, dstFile, written, err
-			}
-		}
-	}
-	// If hard links deduplication was used and it has failed, try again without hard links.
-	if useHardLinks {
-		return findFileInOtherLayers(file, dirfd, layersMetadata, layersTarget, false)
-	}
-	return false, nil, 0, nil
-}
-
-func getFileDigest(f *os.File) (digest.Digest, error) {
+func getFileDigest(f *os.File, buf []byte) (digest.Digest, error) {
 	digester := digest.Canonical.Digester()
-	if _, err := io.Copy(digester.Hash(), f); err != nil {
+	if _, err := io.CopyBuffer(digester.Hash(), f, buf); err != nil {
 		return "", err
 	}
 	return digester.Digest(), nil
@@ -401,7 +336,7 @@ func findFileInOSTreeRepos(file *internal.FileMetadata, ostreeRepos []string, di
 // file is the file to look for.
 // dirfd is an open fd to the destination checkout.
 // useHardLinks defines whether the deduplication can be performed using hard links.
-func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool) (bool, *os.File, int64, error) {
+func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool, buf []byte) (bool, *os.File, int64, error) {
 	sourceFile := filepath.Clean(filepath.Join("/", file.Name))
 	if !strings.HasPrefix(sourceFile, "/usr/") {
 		// limit host deduplication to files under /usr.
@@ -430,7 +365,7 @@ func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool
 		return false, nil, 0, err
 	}
 
-	checksum, err := getFileDigest(f)
+	checksum, err := getFileDigest(f, buf)
 	if err != nil {
 		return false, nil, 0, err
 	}
@@ -452,7 +387,7 @@ func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool
 		dstFile.Close()
 		return false, nil, 0, err
 	}
-	checksum, err = getFileDigest(f)
+	checksum, err = getFileDigest(f, buf)
 	if err != nil {
 		dstFile.Close()
 		return false, nil, 0, err
@@ -464,6 +399,19 @@ func findFileOnTheHost(file *internal.FileMetadata, dirfd int, useHardLinks bool
 	return true, dstFile, written, nil
 }
 
+// findFileInOtherLayers finds the specified file in other layers.
+// cache is the layers cache to use.
+// file is the file to look for.
+// dirfd is an open file descriptor to the checkout root directory.
+// useHardLinks defines whether the deduplication can be performed using hard links.
+func findFileInOtherLayers(cache *layersCache, file *internal.FileMetadata, dirfd int, useHardLinks bool) (bool, *os.File, int64, error) {
+	target, name, err := cache.findFileInOtherLayers(file, useHardLinks)
+	if err != nil || name == "" {
+		return false, nil, 0, err
+	}
+	return copyFileFromOtherLayer(file, target, name, dirfd, useHardLinks)
+}
+
 func maybeDoIDRemap(manifest []internal.FileMetadata, options *archive.TarOptions) error {
 	if options.ChownOpts == nil && len(options.UIDMaps) == 0 || len(options.GIDMaps) == 0 {
 		return nil
@@ -490,22 +438,50 @@ func maybeDoIDRemap(manifest []internal.FileMetadata, options *archive.TarOption
 	return nil
 }
 
-type missingFile struct {
-	File *internal.FileMetadata
+type originFile struct {
+	Root   string
+	Path   string
+	Offset int64
+}
+
+type missingFileChunk struct {
 	Gap  int64
+	Hole bool
+
+	File *internal.FileMetadata
+
+	CompressedSize   int64
+	UncompressedSize int64
 }
 
-func (m missingFile) Length() int64 {
-	return m.File.EndOffset - m.File.Offset
+type missingPart struct {
+	Hole        bool
+	SourceChunk *ImageSourceChunk
+	OriginFile  *originFile
+	Chunks      []missingFileChunk
 }
 
-type missingChunk struct {
-	RawChunk ImageSourceChunk
-	Files    []missingFile
+func (o *originFile) OpenFile() (io.ReadCloser, error) {
+	srcDirfd, err := unix.Open(o.Root, unix.O_RDONLY, 0)
+	if err != nil {
+		return nil, fmt.Errorf("open source file: %w", err)
+	}
+	defer unix.Close(srcDirfd)
+
+	srcFile, err := openFileUnderRoot(o.Path, srcDirfd, unix.O_RDONLY, 0)
+	if err != nil {
+		return nil, fmt.Errorf("open source file under target rootfs: %w", err)
+	}
+
+	if _, err := srcFile.Seek(o.Offset, 0); err != nil {
+		srcFile.Close()
+		return nil, err
+	}
+	return srcFile, nil
 }
 
 // setFileAttrs sets the file attributes for file given metadata
-func setFileAttrs(file *os.File, mode os.FileMode, metadata *internal.FileMetadata, options *archive.TarOptions) error {
+func setFileAttrs(dirfd int, file *os.File, mode os.FileMode, metadata *internal.FileMetadata, options *archive.TarOptions, usePath bool) error {
 	if file == nil || file.Fd() < 0 {
 		return errors.Errorf("invalid file")
 	}
@@ -515,211 +491,611 @@ func setFileAttrs(file *os.File, mode os.FileMode, metadata *internal.FileMetada
 	if err != nil {
 		return err
 	}
+
+	// If it is a symlink, force the use of the path
 	if t == tar.TypeSymlink {
-		return nil
+		usePath = true
+	}
+
+	baseName := ""
+	if usePath {
+		dirName := filepath.Dir(metadata.Name)
+		if dirName != "" {
+			parentFd, err := openFileUnderRoot(dirName, dirfd, unix.O_PATH|unix.O_DIRECTORY, 0)
+			if err != nil {
+				return err
+			}
+			defer parentFd.Close()
+
+			dirfd = int(parentFd.Fd())
+		}
+		baseName = filepath.Base(metadata.Name)
+	}
+
+	doChown := func() error {
+		if usePath {
+			return unix.Fchownat(dirfd, baseName, metadata.UID, metadata.GID, unix.AT_SYMLINK_NOFOLLOW)
+		}
+		return unix.Fchown(fd, metadata.UID, metadata.GID)
+	}
+
+	doSetXattr := func(k string, v []byte) error {
+		return unix.Fsetxattr(fd, k, v, 0)
+	}
+
+	doUtimes := func() error {
+		ts := []unix.Timespec{timeToTimespec(metadata.AccessTime), timeToTimespec(metadata.ModTime)}
+		if usePath {
+			return unix.UtimesNanoAt(dirfd, baseName, ts, unix.AT_SYMLINK_NOFOLLOW)
+		}
+		return unix.UtimesNanoAt(unix.AT_FDCWD, fmt.Sprintf("/proc/self/fd/%d", fd), ts, 0)
 	}
 
-	if err := unix.Fchown(fd, metadata.UID, metadata.GID); err != nil {
+	doChmod := func() error {
+		if usePath {
+			return unix.Fchmodat(dirfd, baseName, uint32(mode), unix.AT_SYMLINK_NOFOLLOW)
+		}
+		return unix.Fchmod(fd, uint32(mode))
+	}
+
+	if err := doChown(); err != nil {
 		if !options.IgnoreChownErrors {
-			return err
+			return fmt.Errorf("chown %q to %d:%d: %w", metadata.Name, metadata.UID, metadata.GID, err)
 		}
 	}
 
+	canIgnore := func(err error) bool {
+		return err == nil || errors.Is(err, unix.ENOSYS) || errors.Is(err, unix.ENOTSUP)
+	}
+
 	for k, v := range metadata.Xattrs {
+		if _, found := xattrsToIgnore[k]; found {
+			continue
+		}
 		data, err := base64.StdEncoding.DecodeString(v)
 		if err != nil {
-			return err
+			return fmt.Errorf("decode xattr %q: %w", v, err)
 		}
-		if err := unix.Fsetxattr(fd, k, data, 0); err != nil {
-			return err
+		if err := doSetXattr(k, data); !canIgnore(err) {
+			return fmt.Errorf("set xattr %s=%q for %q: %w", k, data, metadata.Name, err)
 		}
 	}
 
-	ts := []unix.Timespec{timeToTimespec(metadata.AccessTime), timeToTimespec(metadata.ModTime)}
-	if err := unix.UtimesNanoAt(fd, "", ts, 0); err != nil && errors.Is(err, unix.ENOSYS) {
-		return err
+	if err := doUtimes(); !canIgnore(err) {
+		return fmt.Errorf("set utimes for %q: %w", metadata.Name, err)
 	}
 
-	if err := unix.Fchmod(fd, uint32(mode)); err != nil {
-		return err
+	if err := doChmod(); !canIgnore(err) {
+		return fmt.Errorf("chmod %q: %w", metadata.Name, err)
 	}
 	return nil
 }
 
-// openFileUnderRoot safely opens a file under the specified root directory using openat2
-// name is the path to open relative to dirfd.
-// dirfd is an open file descriptor to the target checkout directory.
-// flags are the flags top pass to the open syscall.
-// mode specifies the mode to use for newly created files.
-func openFileUnderRoot(name string, dirfd int, flags uint64, mode os.FileMode) (*os.File, error) {
+func openFileUnderRootFallback(dirfd int, name string, flags uint64, mode os.FileMode) (int, error) {
+	root := fmt.Sprintf("/proc/self/fd/%d", dirfd)
+
+	targetRoot, err := os.Readlink(root)
+	if err != nil {
+		return -1, err
+	}
+
+	hasNoFollow := (flags & unix.O_NOFOLLOW) != 0
+
+	fd := -1
+	// If O_NOFOLLOW is specified in the flags, then resolve only the parent directory and use the
+	// last component as the path to openat().
+	if hasNoFollow {
+		dirName := filepath.Dir(name)
+		if dirName != "" {
+			newRoot, err := securejoin.SecureJoin(root, filepath.Dir(name))
+			if err != nil {
+				return -1, err
+			}
+			root = newRoot
+		}
+
+		parentDirfd, err := unix.Open(root, unix.O_PATH, 0)
+		if err != nil {
+			return -1, err
+		}
+		defer unix.Close(parentDirfd)
+
+		fd, err = unix.Openat(parentDirfd, filepath.Base(name), int(flags), uint32(mode))
+		if err != nil {
+			return -1, err
+		}
+	} else {
+		newPath, err := securejoin.SecureJoin(root, name)
+		if err != nil {
+			return -1, err
+		}
+		fd, err = unix.Openat(dirfd, newPath, int(flags), uint32(mode))
+		if err != nil {
+			return -1, err
+		}
+	}
+
+	target, err := os.Readlink(fmt.Sprintf("/proc/self/fd/%d", fd))
+	if err != nil {
+		unix.Close(fd)
+		return -1, err
+	}
+
+	// Add an additional check to make sure the opened fd is inside the rootfs
+	if !strings.HasPrefix(target, targetRoot) {
+		unix.Close(fd)
+		return -1, fmt.Errorf("error while resolving %q.  It resolves outside the root directory", name)
+	}
+
+	return fd, err
+}
+
+func openFileUnderRootOpenat2(dirfd int, name string, flags uint64, mode os.FileMode) (int, error) {
 	how := unix.OpenHow{
 		Flags:   flags,
 		Mode:    uint64(mode & 07777),
 		Resolve: unix.RESOLVE_IN_ROOT,
 	}
+	return unix.Openat2(dirfd, name, &how)
+}
 
-	fd, err := unix.Openat2(dirfd, name, &how)
-	if err != nil {
-		return nil, err
+// skipOpenat2 is set when openat2 is not supported by the underlying kernel, to avoid
+// using it again.
+var skipOpenat2 int32
+
+// openFileUnderRootRaw tries to open a file using openat2 and, if it is not supported, falls back to a
+// userspace lookup.
+func openFileUnderRootRaw(dirfd int, name string, flags uint64, mode os.FileMode) (int, error) {
+	var fd int
+	var err error
+	if atomic.LoadInt32(&skipOpenat2) > 0 {
+		fd, err = openFileUnderRootFallback(dirfd, name, flags, mode)
+	} else {
+		fd, err = openFileUnderRootOpenat2(dirfd, name, flags, mode)
+		// If the function failed with ENOSYS, switch off the support for openat2
+		// and fall back to using securejoin.
+		if err != nil && errors.Is(err, unix.ENOSYS) {
+			atomic.StoreInt32(&skipOpenat2, 1)
+			fd, err = openFileUnderRootFallback(dirfd, name, flags, mode)
+		}
 	}
-	return os.NewFile(uintptr(fd), name), nil
+	return fd, err
 }
 
-func (c *chunkedDiffer) createFileFromCompressedStream(dest string, dirfd int, reader io.Reader, mode os.FileMode, metadata *internal.FileMetadata, options *archive.TarOptions) (err error) {
-	file, err := openFileUnderRoot(metadata.Name, dirfd, newFileFlags, 0)
-	if err != nil {
-		return err
-	}
-	defer func() {
-		err2 := file.Close()
-		if err == nil {
-			err = err2
+// openFileUnderRoot safely opens a file under the specified root directory, using openat2 when available.
+// name is the path to open relative to dirfd.
+// dirfd is an open file descriptor to the target checkout directory.
+// flags are the flags to pass to the open syscall.
+// mode specifies the mode to use for newly created files.
+func openFileUnderRoot(name string, dirfd int, flags uint64, mode os.FileMode) (*os.File, error) {
+	fd, err := openFileUnderRootRaw(dirfd, name, flags, mode)
+	if err == nil {
+		return os.NewFile(uintptr(fd), name), nil
+	}
+
+	hasCreate := (flags & unix.O_CREAT) != 0
+	if errors.Is(err, unix.ENOENT) && hasCreate {
+		parent := filepath.Dir(name)
+		if parent != "" {
+			newDirfd, err2 := openOrCreateDirUnderRoot(parent, dirfd, 0)
+			if err2 == nil {
+				defer newDirfd.Close()
+				fd, err := openFileUnderRootRaw(int(newDirfd.Fd()), filepath.Base(name), flags, mode)
+				if err == nil {
+					return os.NewFile(uintptr(fd), name), nil
+				}
+			}
 		}
-	}()
+	}
+	return nil, fmt.Errorf("open %q under the rootfs: %w", name, err)
+}
 
-	digester := digest.Canonical.Digester()
-	checksum := digester.Hash()
-	to := io.MultiWriter(file, checksum)
+// openOrCreateDirUnderRoot safely opens a directory, or creates it if it is missing.
+// name is the path to open relative to dirfd.
+// dirfd is an open file descriptor to the target checkout directory.
+// mode specifies the mode to use for newly created files.
+func openOrCreateDirUnderRoot(name string, dirfd int, mode os.FileMode) (*os.File, error) {
+	fd, err := openFileUnderRootRaw(dirfd, name, unix.O_DIRECTORY|unix.O_RDONLY, mode)
+	if err == nil {
+		return os.NewFile(uintptr(fd), name), nil
+	}
+
+	if errors.Is(err, unix.ENOENT) {
+		parent := filepath.Dir(name)
+		if parent != "" {
+			pDir, err2 := openOrCreateDirUnderRoot(parent, dirfd, mode)
+			if err2 != nil {
+				return nil, err
+			}
+			defer pDir.Close()
 
-	switch c.fileType {
-	case fileTypeZstdChunked:
-		z, err := zstd.NewReader(reader)
-		if err != nil {
-			return err
-		}
-		defer z.Close()
+			baseName := filepath.Base(name)
 
-		if _, err := io.Copy(to, io.LimitReader(z, metadata.Size)); err != nil {
-			return err
+			if err2 := unix.Mkdirat(int(pDir.Fd()), baseName, 0755); err2 != nil {
+				return nil, err
+			}
+
+			fd, err = openFileUnderRootRaw(int(pDir.Fd()), baseName, unix.O_DIRECTORY|unix.O_RDONLY, mode)
+			if err == nil {
+				return os.NewFile(uintptr(fd), name), nil
+			}
 		}
-		if _, err := io.Copy(ioutil.Discard, reader); err != nil {
-			return err
+	}
+	return nil, err
+}
+
+func (c *chunkedDiffer) prepareCompressedStreamToFile(partCompression compressedFileType, from io.Reader, mf *missingFileChunk) (compressedFileType, error) {
+	switch {
+	case partCompression == fileTypeHole:
+		// The entire part is a hole.  Do not need to read from a file.
+		c.rawReader = nil
+		return fileTypeHole, nil
+	case mf.Hole:
+		// Only the missing chunk in the requested part refers to a hole.
+		// The received data must be discarded.
+		limitReader := io.LimitReader(from, mf.CompressedSize)
+		_, err := io.CopyBuffer(ioutil.Discard, limitReader, c.copyBuffer)
+		return fileTypeHole, err
+	case partCompression == fileTypeZstdChunked:
+		c.rawReader = io.LimitReader(from, mf.CompressedSize)
+		if c.zstdReader == nil {
+			r, err := zstd.NewReader(c.rawReader)
+			if err != nil {
+				return partCompression, err
+			}
+			c.zstdReader = r
+		} else {
+			if err := c.zstdReader.Reset(c.rawReader); err != nil {
+				return partCompression, err
+			}
 		}
-	case fileTypeEstargz:
+	case partCompression == fileTypeEstargz:
+		c.rawReader = io.LimitReader(from, mf.CompressedSize)
 		if c.gzipReader == nil {
-			r, err := pgzip.NewReader(reader)
+			r, err := pgzip.NewReader(c.rawReader)
 			if err != nil {
-				return err
+				return partCompression, err
 			}
 			c.gzipReader = r
 		} else {
-			if err := c.gzipReader.Reset(reader); err != nil {
-				return err
+			if err := c.gzipReader.Reset(c.rawReader); err != nil {
+				return partCompression, err
 			}
 		}
-		defer c.gzipReader.Close()
+	case partCompression == fileTypeNoCompression:
+		c.rawReader = io.LimitReader(from, mf.UncompressedSize)
+	default:
+		return partCompression, fmt.Errorf("unknown file type %q", c.fileType)
+	}
+	return partCompression, nil
+}
 
-		if _, err := io.Copy(to, io.LimitReader(c.gzipReader, metadata.Size)); err != nil {
+// hashHole writes SIZE zeros to the specified hasher
+func hashHole(h hash.Hash, size int64, copyBuffer []byte) error {
+	count := int64(len(copyBuffer))
+	if size < count {
+		count = size
+	}
+	for i := int64(0); i < count; i++ {
+		copyBuffer[i] = 0
+	}
+	for size > 0 {
+		count = int64(len(copyBuffer))
+		if size < count {
+			count = size
+		}
+		if _, err := h.Write(copyBuffer[:count]); err != nil {
 			return err
 		}
-		if _, err := io.Copy(ioutil.Discard, reader); err != nil {
+		size -= count
+	}
+	return nil
+}
+
+// appendHole creates a hole with the specified size at the open fd.
+func appendHole(fd int, size int64) error {
+	off, err := unix.Seek(fd, size, unix.SEEK_CUR)
+	if err != nil {
+		return err
+	}
+	// Make sure the file size is changed.  It might be the last hole and no other data written afterwards.
+	if err := unix.Ftruncate(fd, off); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (c *chunkedDiffer) appendCompressedStreamToFile(compression compressedFileType, destFile *destinationFile, size int64) error {
+	switch compression {
+	case fileTypeZstdChunked:
+		defer c.zstdReader.Reset(nil)
+		if _, err := io.CopyBuffer(destFile.to, io.LimitReader(c.zstdReader, size), c.copyBuffer); err != nil {
+			return err
+		}
+	case fileTypeEstargz:
+		defer c.gzipReader.Close()
+		if _, err := io.CopyBuffer(destFile.to, io.LimitReader(c.gzipReader, size), c.copyBuffer); err != nil {
+			return err
+		}
+	case fileTypeNoCompression:
+		if _, err := io.CopyBuffer(destFile.to, io.LimitReader(c.rawReader, size), c.copyBuffer); err != nil {
+			return err
+		}
+	case fileTypeHole:
+		if err := appendHole(int(destFile.file.Fd()), size); err != nil {
+			return err
+		}
+		if err := hashHole(destFile.hash, size, c.copyBuffer); err != nil {
 			return err
 		}
 	default:
 		return fmt.Errorf("unknown file type %q", c.fileType)
 	}
+	return nil
+}
+
+type destinationFile struct {
+	dirfd    int
+	file     *os.File
+	digester digest.Digester
+	hash     hash.Hash
+	to       io.Writer
+	metadata *internal.FileMetadata
+	options  *archive.TarOptions
+}
+
+func openDestinationFile(dirfd int, metadata *internal.FileMetadata, options *archive.TarOptions) (*destinationFile, error) {
+	file, err := openFileUnderRoot(metadata.Name, dirfd, newFileFlags, 0)
+	if err != nil {
+		return nil, err
+	}
 
-	manifestChecksum, err := digest.Parse(metadata.Digest)
+	digester := digest.Canonical.Digester()
+	hash := digester.Hash()
+	to := io.MultiWriter(file, hash)
+
+	return &destinationFile{
+		file:     file,
+		digester: digester,
+		hash:     hash,
+		to:       to,
+		metadata: metadata,
+		options:  options,
+		dirfd:    dirfd,
+	}, nil
+}
+
+func (d *destinationFile) Close() error {
+	manifestChecksum, err := digest.Parse(d.metadata.Digest)
 	if err != nil {
 		return err
 	}
-	if digester.Digest() != manifestChecksum {
-		return fmt.Errorf("checksum mismatch for %q", dest)
+	if d.digester.Digest() != manifestChecksum {
+		return fmt.Errorf("checksum mismatch for %q (got %q instead of %q)", d.file.Name(), d.digester.Digest(), manifestChecksum)
+	}
+
+	return setFileAttrs(d.dirfd, d.file, os.FileMode(d.metadata.Mode), d.metadata, d.options, false)
+}
+
+func closeDestinationFiles(files chan *destinationFile, errors chan error) {
+	for f := range files {
+		errors <- f.Close()
 	}
-	return setFileAttrs(file, mode, metadata, options)
+	close(errors)
 }
 
-func (c *chunkedDiffer) storeMissingFiles(streams chan io.ReadCloser, errs chan error, dest string, dirfd int, missingChunks []missingChunk, options *archive.TarOptions) error {
-	for mc := 0; ; mc++ {
-		var part io.ReadCloser
-		select {
-		case p := <-streams:
-			part = p
-		case err := <-errs:
-			return err
-		}
-		if part == nil {
-			if mc == len(missingChunks) {
-				break
+func (c *chunkedDiffer) storeMissingFiles(streams chan io.ReadCloser, errs chan error, dest string, dirfd int, missingParts []missingPart, options *archive.TarOptions) (Err error) {
+	var destFile *destinationFile
+
+	filesToClose := make(chan *destinationFile, 3)
+	closeFilesErrors := make(chan error, 2)
+
+	go closeDestinationFiles(filesToClose, closeFilesErrors)
+	defer func() {
+		close(filesToClose)
+		for e := range closeFilesErrors {
+			if e != nil && Err == nil {
+				Err = e
 			}
-			return errors.Errorf("invalid stream returned")
 		}
-		if mc == len(missingChunks) {
-			part.Close()
-			return errors.Errorf("too many chunks returned")
+	}()
+
+	for _, missingPart := range missingParts {
+		var part io.ReadCloser
+		partCompression := c.fileType
+		switch {
+		case missingPart.Hole:
+			partCompression = fileTypeHole
+		case missingPart.OriginFile != nil:
+			var err error
+			part, err = missingPart.OriginFile.OpenFile()
+			if err != nil {
+				return err
+			}
+			partCompression = fileTypeNoCompression
+		case missingPart.SourceChunk != nil:
+			select {
+			case p := <-streams:
+				part = p
+			case err := <-errs:
+				if err == nil {
+					return errors.New("not enough data returned from the server")
+				}
+				return err
+			}
+			if part == nil {
+				return errors.Errorf("invalid stream returned")
+			}
+		default:
+			return errors.Errorf("internal error: missing part misses both local and remote data stream")
 		}
 
-		for _, mf := range missingChunks[mc].Files {
+		for _, mf := range missingPart.Chunks {
 			if mf.Gap > 0 {
 				limitReader := io.LimitReader(part, mf.Gap)
-				_, err := io.Copy(ioutil.Discard, limitReader)
+				_, err := io.CopyBuffer(ioutil.Discard, limitReader, c.copyBuffer)
 				if err != nil {
-					part.Close()
-					return err
+					Err = err
+					goto exit
 				}
 				continue
 			}
 
-			limitReader := io.LimitReader(part, mf.Length())
+			if mf.File.Name == "" {
+				Err = errors.Errorf("file name empty")
+				goto exit
+			}
 
-			if err := c.createFileFromCompressedStream(dest, dirfd, limitReader, os.FileMode(mf.File.Mode), mf.File, options); err != nil {
-				part.Close()
-				return err
+			compression, err := c.prepareCompressedStreamToFile(partCompression, part, &mf)
+			if err != nil {
+				Err = err
+				goto exit
+			}
+
+			// Open the new file if it is different from what is already
+			// opened
+			if destFile == nil || destFile.metadata.Name != mf.File.Name {
+				var err error
+				if destFile != nil {
+				cleanup:
+					for {
+						select {
+						case err = <-closeFilesErrors:
+							if err != nil {
+								Err = err
+								goto exit
+							}
+						default:
+							break cleanup
+						}
+					}
+					filesToClose <- destFile
+				}
+				destFile, err = openDestinationFile(dirfd, mf.File, options)
+				if err != nil {
+					Err = err
+					goto exit
+				}
+			}
+
+			if err := c.appendCompressedStreamToFile(compression, destFile, mf.UncompressedSize); err != nil {
+				Err = err
+				goto exit
+			}
+			if c.rawReader != nil {
+				if _, err := io.CopyBuffer(ioutil.Discard, c.rawReader, c.copyBuffer); err != nil {
+					Err = err
+					goto exit
+				}
 			}
 		}
-		part.Close()
+	exit:
+		if part != nil {
+			part.Close()
+			if Err != nil {
+				break
+			}
+		}
+	}
+
+	if destFile != nil {
+		return destFile.Close()
 	}
+
 	return nil
 }
 
-func mergeMissingChunks(missingChunks []missingChunk, target int) []missingChunk {
-	if len(missingChunks) <= target {
-		return missingChunks
+func mergeMissingChunks(missingParts []missingPart, target int) []missingPart {
+	getGap := func(missingParts []missingPart, i int) int {
+		prev := missingParts[i-1].SourceChunk.Offset + missingParts[i-1].SourceChunk.Length
+		return int(missingParts[i].SourceChunk.Offset - prev)
 	}
+	getCost := func(missingParts []missingPart, i int) int {
+		cost := getGap(missingParts, i)
+		if missingParts[i-1].OriginFile != nil {
+			cost += int(missingParts[i-1].SourceChunk.Length)
+		}
+		if missingParts[i].OriginFile != nil {
+			cost += int(missingParts[i].SourceChunk.Length)
+		}
+		return cost
+	}
+
+	// simple case: merge chunks from the same file.
+	newMissingParts := missingParts[0:1]
+	prevIndex := 0
+	for i := 1; i < len(missingParts); i++ {
+		gap := getGap(missingParts, i)
+		if gap == 0 && missingParts[prevIndex].OriginFile == nil &&
+			missingParts[i].OriginFile == nil &&
+			!missingParts[prevIndex].Hole && !missingParts[i].Hole &&
+			len(missingParts[prevIndex].Chunks) == 1 && len(missingParts[i].Chunks) == 1 &&
+			missingParts[prevIndex].Chunks[0].File.Name == missingParts[i].Chunks[0].File.Name {
+			missingParts[prevIndex].SourceChunk.Length += uint64(gap) + missingParts[i].SourceChunk.Length
+			missingParts[prevIndex].Chunks[0].CompressedSize += missingParts[i].Chunks[0].CompressedSize
+			missingParts[prevIndex].Chunks[0].UncompressedSize += missingParts[i].Chunks[0].UncompressedSize
+		} else {
+			newMissingParts = append(newMissingParts, missingParts[i])
+			prevIndex++
+		}
+	}
+	missingParts = newMissingParts
 
-	getGap := func(missingChunks []missingChunk, i int) int {
-		prev := missingChunks[i-1].RawChunk.Offset + missingChunks[i-1].RawChunk.Length
-		return int(missingChunks[i].RawChunk.Offset - prev)
+	if len(missingParts) <= target {
+		return missingParts
 	}
 
 	// this implementation doesn't account for duplicates, so it could merge
 	// more than necessary to reach the specified target.  Since target itself
 	// is a heuristic value, it doesn't matter.
-	var gaps []int
-	for i := 1; i < len(missingChunks); i++ {
-		gaps = append(gaps, getGap(missingChunks, i))
+	costs := make([]int, len(missingParts)-1)
+	for i := 1; i < len(missingParts); i++ {
+		costs[i-1] = getCost(missingParts, i)
 	}
-	sort.Ints(gaps)
+	sort.Ints(costs)
 
-	toShrink := len(missingChunks) - target
-	targetValue := gaps[toShrink-1]
+	toShrink := len(missingParts) - target
+	if toShrink >= len(costs) {
+		toShrink = len(costs) - 1
+	}
+	targetValue := costs[toShrink]
 
-	newMissingChunks := missingChunks[0:1]
-	for i := 1; i < len(missingChunks); i++ {
-		gap := getGap(missingChunks, i)
-		if gap > targetValue {
-			newMissingChunks = append(newMissingChunks, missingChunks[i])
+	newMissingParts = missingParts[0:1]
+	for i := 1; i < len(missingParts); i++ {
+		if getCost(missingParts, i) > targetValue {
+			newMissingParts = append(newMissingParts, missingParts[i])
 		} else {
-			prev := &newMissingChunks[len(newMissingChunks)-1]
-			prev.RawChunk.Length += uint64(gap) + missingChunks[i].RawChunk.Length
+			gap := getGap(missingParts, i)
+			prev := &newMissingParts[len(newMissingParts)-1]
+			prev.SourceChunk.Length += uint64(gap) + missingParts[i].SourceChunk.Length
+			prev.Hole = false
+			prev.OriginFile = nil
 			if gap > 0 {
-				gapFile := missingFile{
+				gapFile := missingFileChunk{
 					Gap: int64(gap),
 				}
-				prev.Files = append(prev.Files, gapFile)
+				prev.Chunks = append(prev.Chunks, gapFile)
 			}
-			prev.Files = append(prev.Files, missingChunks[i].Files...)
+			prev.Chunks = append(prev.Chunks, missingParts[i].Chunks...)
 		}
 	}
-	return newMissingChunks
+	return newMissingParts
 }
 
-func (c *chunkedDiffer) retrieveMissingFiles(dest string, dirfd int, missingChunks []missingChunk, options *archive.TarOptions) error {
+func (c *chunkedDiffer) retrieveMissingFiles(dest string, dirfd int, missingParts []missingPart, options *archive.TarOptions) error {
 	var chunksToRequest []ImageSourceChunk
-	for _, c := range missingChunks {
-		chunksToRequest = append(chunksToRequest, c.RawChunk)
+
+	calculateChunksToRequest := func() {
+		chunksToRequest = []ImageSourceChunk{}
+		for _, c := range missingParts {
+			if c.OriginFile == nil && !c.Hole {
+				chunksToRequest = append(chunksToRequest, *c.SourceChunk)
+			}
+		}
 	}
 
+	calculateChunksToRequest()
+
 	// There are some missing files.  Prepare a multirange request for the missing chunks.
 	var streams chan io.ReadCloser
 	var err error
@@ -731,32 +1107,33 @@ func (c *chunkedDiffer) retrieveMissingFiles(dest string, dirfd int, missingChun
 		}
 
 		if _, ok := err.(ErrBadRequest); ok {
-			requested := len(missingChunks)
+			requested := len(missingParts)
 			// If the server cannot handle at least 64 chunks in a single request, just give up.
 			if requested < 64 {
 				return err
 			}
 
 			// Merge more chunks to request
-			missingChunks = mergeMissingChunks(missingChunks, requested/2)
+			missingParts = mergeMissingChunks(missingParts, requested/2)
+			calculateChunksToRequest()
 			continue
 		}
 		return err
 	}
 
-	if err := c.storeMissingFiles(streams, errs, dest, dirfd, missingChunks, options); err != nil {
+	if err := c.storeMissingFiles(streams, errs, dest, dirfd, missingParts, options); err != nil {
 		return err
 	}
 	return nil
 }
 
-func safeMkdir(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, options *archive.TarOptions) error {
-	parent := filepath.Dir(metadata.Name)
-	base := filepath.Base(metadata.Name)
+func safeMkdir(dirfd int, mode os.FileMode, name string, metadata *internal.FileMetadata, options *archive.TarOptions) error {
+	parent := filepath.Dir(name)
+	base := filepath.Base(name)
 
 	parentFd := dirfd
 	if parent != "." {
-		parentFile, err := openFileUnderRoot(parent, dirfd, unix.O_DIRECTORY|unix.O_PATH|unix.O_RDONLY, 0)
+		parentFile, err := openOrCreateDirUnderRoot(parent, dirfd, 0)
 		if err != nil {
 			return err
 		}
@@ -766,21 +1143,21 @@ func safeMkdir(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, opt
 
 	if err := unix.Mkdirat(parentFd, base, uint32(mode)); err != nil {
 		if !os.IsExist(err) {
-			return err
+			return fmt.Errorf("mkdir %q: %w", name, err)
 		}
 	}
 
-	file, err := openFileUnderRoot(metadata.Name, dirfd, unix.O_RDONLY, 0)
+	file, err := openFileUnderRoot(base, parentFd, unix.O_DIRECTORY|unix.O_RDONLY, 0)
 	if err != nil {
 		return err
 	}
 	defer file.Close()
 
-	return setFileAttrs(file, mode, metadata, options)
+	return setFileAttrs(dirfd, file, mode, metadata, options, false)
 }
 
 func safeLink(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, options *archive.TarOptions) error {
-	sourceFile, err := openFileUnderRoot(metadata.Linkname, dirfd, unix.O_RDONLY, 0)
+	sourceFile, err := openFileUnderRoot(metadata.Linkname, dirfd, unix.O_PATH|unix.O_RDONLY|unix.O_NOFOLLOW, 0)
 	if err != nil {
 		return err
 	}
@@ -789,7 +1166,7 @@ func safeLink(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, opti
 	destDir, destBase := filepath.Dir(metadata.Name), filepath.Base(metadata.Name)
 	destDirFd := dirfd
 	if destDir != "." {
-		f, err := openFileUnderRoot(destDir, dirfd, unix.O_RDONLY, 0)
+		f, err := openOrCreateDirUnderRoot(destDir, dirfd, 0)
 		if err != nil {
 			return err
 		}
@@ -797,25 +1174,35 @@ func safeLink(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, opti
 		destDirFd = int(f.Fd())
 	}
 
-	err = unix.Linkat(int(sourceFile.Fd()), "", destDirFd, destBase, unix.AT_EMPTY_PATH)
+	err = doHardLink(int(sourceFile.Fd()), destDirFd, destBase)
 	if err != nil {
-		return err
+		return fmt.Errorf("create hardlink %q pointing to %q: %w", metadata.Name, metadata.Linkname, err)
 	}
 
-	newFile, err := openFileUnderRoot(metadata.Name, dirfd, unix.O_WRONLY, 0)
+	newFile, err := openFileUnderRoot(metadata.Name, dirfd, unix.O_WRONLY|unix.O_NOFOLLOW, 0)
 	if err != nil {
+		// If the target is a symlink, open the file with O_PATH.
+		if errors.Is(err, unix.ELOOP) {
+			newFile, err := openFileUnderRoot(metadata.Name, dirfd, unix.O_PATH|unix.O_NOFOLLOW, 0)
+			if err != nil {
+				return err
+			}
+			defer newFile.Close()
+
+			return setFileAttrs(dirfd, newFile, mode, metadata, options, true)
+		}
 		return err
 	}
 	defer newFile.Close()
 
-	return setFileAttrs(newFile, mode, metadata, options)
+	return setFileAttrs(dirfd, newFile, mode, metadata, options, false)
 }
 
 func safeSymlink(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, options *archive.TarOptions) error {
 	destDir, destBase := filepath.Dir(metadata.Name), filepath.Base(metadata.Name)
 	destDirFd := dirfd
 	if destDir != "." {
-		f, err := openFileUnderRoot(destDir, dirfd, unix.O_RDONLY, 0)
+		f, err := openOrCreateDirUnderRoot(destDir, dirfd, 0)
 		if err != nil {
 			return err
 		}
@@ -823,7 +1210,10 @@ func safeSymlink(dirfd int, mode os.FileMode, metadata *internal.FileMetadata, o
 		destDirFd = int(f.Fd())
 	}
 
-	return unix.Symlinkat(metadata.Linkname, destDirFd, destBase)
+	if err := unix.Symlinkat(metadata.Linkname, destDirFd, destBase); err != nil {
+		return fmt.Errorf("create symlink %q pointing to %q: %w", metadata.Name, metadata.Linkname, err)
+	}
+	return nil
 }
 
 type whiteoutHandler struct {
@@ -832,13 +1222,16 @@ type whiteoutHandler struct {
 }
 
 func (d whiteoutHandler) Setxattr(path, name string, value []byte) error {
-	file, err := openFileUnderRoot(path, d.Dirfd, unix.O_RDONLY, 0)
+	file, err := openOrCreateDirUnderRoot(path, d.Dirfd, 0)
 	if err != nil {
 		return err
 	}
 	defer file.Close()
 
-	return unix.Fsetxattr(int(file.Fd()), name, value, 0)
+	if err := unix.Fsetxattr(int(file.Fd()), name, value, 0); err != nil {
+		return fmt.Errorf("set xattr %s=%q for %q: %w", name, value, path, err)
+	}
+	return nil
 }
 
 func (d whiteoutHandler) Mknod(path string, mode uint32, dev int) error {
@@ -847,7 +1240,7 @@ func (d whiteoutHandler) Mknod(path string, mode uint32, dev int) error {
 
 	dirfd := d.Dirfd
 	if dir != "" {
-		dir, err := openFileUnderRoot(dir, d.Dirfd, unix.O_RDONLY, 0)
+		dir, err := openOrCreateDirUnderRoot(dir, d.Dirfd, 0)
 		if err != nil {
 			return err
 		}
@@ -856,12 +1249,16 @@ func (d whiteoutHandler) Mknod(path string, mode uint32, dev int) error {
 		dirfd = int(dir.Fd())
 	}
 
-	return unix.Mknodat(dirfd, base, mode, dev)
+	if err := unix.Mknodat(dirfd, base, mode, dev); err != nil {
+		return fmt.Errorf("mknod %q: %w", path, err)
+	}
+
+	return nil
 }
 
 func checkChownErr(err error, name string, uid, gid int) error {
 	if errors.Is(err, syscall.EINVAL) {
-		return errors.Wrapf(err, "potentially insufficient UIDs or GIDs available in user namespace (requested %d:%d for %s): Check /etc/subuid and /etc/subgid", uid, gid, name)
+		return fmt.Errorf("potentially insufficient UIDs or GIDs available in user namespace (requested %d:%d for %s): Check /etc/subuid and /etc/subgid if configured locally and run podman-system-migrate: %w", uid, gid, name, err)
 	}
 	return err
 }
@@ -899,7 +1296,69 @@ func parseBooleanPullOption(storeOpts *storage.StoreOptions, name string, def bo
 	return def
 }
 
+type findAndCopyFileOptions struct {
+	useHardLinks    bool
+	enableHostDedup bool
+	ostreeRepos     []string
+	options         *archive.TarOptions
+}
+
+func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *internal.FileMetadata, copyOptions *findAndCopyFileOptions, mode os.FileMode) (bool, error) {
+	finalizeFile := func(dstFile *os.File) error {
+		if dstFile != nil {
+			defer dstFile.Close()
+			if err := setFileAttrs(dirfd, dstFile, mode, r, copyOptions.options, false); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+
+	found, dstFile, _, err := findFileInOtherLayers(c.layersCache, r, dirfd, copyOptions.useHardLinks)
+	if err != nil {
+		return false, err
+	}
+	if found {
+		if err := finalizeFile(dstFile); err != nil {
+			return false, err
+		}
+		return true, nil
+	}
+
+	found, dstFile, _, err = findFileInOSTreeRepos(r, copyOptions.ostreeRepos, dirfd, copyOptions.useHardLinks)
+	if err != nil {
+		return false, err
+	}
+	if found {
+		if err := finalizeFile(dstFile); err != nil {
+			return false, err
+		}
+		return true, nil
+	}
+
+	if copyOptions.enableHostDedup {
+		found, dstFile, _, err = findFileOnTheHost(r, dirfd, copyOptions.useHardLinks, c.copyBuffer)
+		if err != nil {
+			return false, err
+		}
+		if found {
+			if err := finalizeFile(dstFile); err != nil {
+				return false, err
+			}
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
 func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (graphdriver.DriverWithDifferOutput, error) {
+	defer c.layersCache.release()
+	defer func() {
+		if c.zstdReader != nil {
+			c.zstdReader.Close()
+		}
+	}()
+
 	bigData := map[string][]byte{
 		bigDataKey: c.manifest,
 	}
@@ -927,14 +1386,14 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
 	ostreeRepos := strings.Split(storeOpts.PullOptions["ostree_repos"], ":")
 
 	// Generate the manifest
-	var toc internal.TOC
-	if err := json.Unmarshal(c.manifest, &toc); err != nil {
+	toc, err := unmarshalToc(c.manifest)
+	if err != nil {
 		return output, err
 	}
 
 	whiteoutConverter := archive.GetWhiteoutConverter(options.WhiteoutFormat, options.WhiteoutData)
 
-	var missingChunks []missingChunk
+	var missingParts []missingPart
 
 	mergedEntries, err := c.mergeTocEntries(c.fileType, toc.Entries)
 	if err != nil {
@@ -956,17 +1415,61 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
 
 	dirfd, err := unix.Open(dest, unix.O_RDONLY|unix.O_PATH, 0)
 	if err != nil {
-		return output, err
+		return output, fmt.Errorf("cannot open %q: %w", dest, err)
 	}
 	defer unix.Close(dirfd)
 
-	otherLayersCache := prepareOtherLayersCache(c.layersMetadata)
-
 	// hardlinks can point to missing files.  So create them after all files
 	// are retrieved
 	var hardLinks []hardLinkToCreate
 
-	missingChunksSize, totalChunksSize := int64(0), int64(0)
+	missingPartsSize, totalChunksSize := int64(0), int64(0)
+
+	copyOptions := findAndCopyFileOptions{
+		useHardLinks:    useHardLinks,
+		enableHostDedup: enableHostDedup,
+		ostreeRepos:     ostreeRepos,
+		options:         options,
+	}
+
+	type copyFileJob struct {
+		njob     int
+		index    int
+		mode     os.FileMode
+		metadata *internal.FileMetadata
+
+		found bool
+		err   error
+	}
+
+	var wg sync.WaitGroup
+
+	copyResults := make([]copyFileJob, len(mergedEntries))
+
+	copyFileJobs := make(chan copyFileJob)
+	defer func() {
+		if copyFileJobs != nil {
+			close(copyFileJobs)
+		}
+		wg.Wait()
+	}()
+
+	for i := 0; i < copyGoRoutines; i++ {
+		wg.Add(1)
+		jobs := copyFileJobs
+
+		go func() {
+			defer wg.Done()
+			for job := range jobs {
+				found, err := c.findAndCopyFile(dirfd, job.metadata, &copyOptions, job.mode)
+				job.err = err
+				job.found = found
+				copyResults[job.njob] = job
+			}
+		}()
+	}
+
+	filesToWaitFor := 0
 	for i, r := range mergedEntries {
 		if options.ForceMask != nil {
 			value := fmt.Sprintf("%d:%d:0%o", r.UID, r.GID, r.Mode&07777)
@@ -1016,7 +1519,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
 						return err
 					}
 					defer file.Close()
-					if err := setFileAttrs(file, mode, &r, options); err != nil {
+					if err := setFileAttrs(dirfd, file, mode, &r, options, false); err != nil {
 						return err
 					}
 					return nil
@@ -1028,7 +1531,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
 			}
 
 		case tar.TypeDir:
-			if err := safeMkdir(dirfd, mode, &r, options); err != nil {
+			if err := safeMkdir(dirfd, mode, r.Name, &r, options); err != nil {
 				return output, err
 			}
 			continue
@@ -1062,74 +1565,95 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
 
 		totalChunksSize += r.Size
 
-		finalizeFile := func(dstFile *os.File) error {
-			if dstFile != nil {
-				defer dstFile.Close()
-				if err := setFileAttrs(dstFile, mode, &r, options); err != nil {
-					return err
-				}
+		if t == tar.TypeReg {
+			index := i
+			njob := filesToWaitFor
+			job := copyFileJob{
+				mode:     mode,
+				metadata: &mergedEntries[index],
+				index:    index,
+				njob:     njob,
 			}
-			return nil
+			copyFileJobs <- job
+			filesToWaitFor++
 		}
+	}
 
-		found, dstFile, _, err := findFileInOtherLayers(&r, dirfd, otherLayersCache, c.layersTarget, useHardLinks)
-		if err != nil {
-			return output, err
-		}
-		if found {
-			if err := finalizeFile(dstFile); err != nil {
-				return output, err
-			}
-			continue
-		}
+	close(copyFileJobs)
+	copyFileJobs = nil
 
-		found, dstFile, _, err = findFileInOSTreeRepos(&r, ostreeRepos, dirfd, useHardLinks)
-		if err != nil {
-			return output, err
+	wg.Wait()
+
+	for _, res := range copyResults[:filesToWaitFor] {
+		r := &mergedEntries[res.index]
+
+		if res.err != nil {
+			return output, res.err
 		}
-		if found {
-			if err := finalizeFile(dstFile); err != nil {
-				return output, err
-			}
+		// the file was already copied to its destination
+		// so nothing left to do.
+		if res.found {
 			continue
 		}
 
-		if enableHostDedup {
-			found, dstFile, _, err = findFileOnTheHost(&r, dirfd, useHardLinks)
-			if err != nil {
-				return output, err
-			}
-			if found {
-				if err := finalizeFile(dstFile); err != nil {
-					return output, err
-				}
-				continue
+		missingPartsSize += r.Size
+
+		remainingSize := r.Size
+
+		// the file is missing, attempt to find individual chunks.
+		for _, chunk := range r.Chunks {
+			compressedSize := int64(chunk.EndOffset - chunk.Offset)
+			size := remainingSize
+			if chunk.ChunkSize > 0 {
+				size = chunk.ChunkSize
 			}
-		}
+			remainingSize = remainingSize - size
 
-		missingChunksSize += r.Size
-		if t == tar.TypeReg {
 			rawChunk := ImageSourceChunk{
-				Offset: uint64(r.Offset),
-				Length: uint64(r.EndOffset - r.Offset),
+				Offset: uint64(chunk.Offset),
+				Length: uint64(compressedSize),
 			}
-
-			file := missingFile{
-				File: &mergedEntries[i],
+			file := missingFileChunk{
+				File:             &mergedEntries[res.index],
+				CompressedSize:   compressedSize,
+				UncompressedSize: size,
 			}
-
-			missingChunks = append(missingChunks, missingChunk{
-				RawChunk: rawChunk,
-				Files: []missingFile{
+			mp := missingPart{
+				SourceChunk: &rawChunk,
+				Chunks: []missingFileChunk{
 					file,
 				},
-			})
+			}
+
+			switch chunk.ChunkType {
+			case internal.ChunkTypeData:
+				root, path, offset, err := c.layersCache.findChunkInOtherLayers(chunk)
+				if err != nil {
+					return output, err
+				}
+				if offset >= 0 && validateChunkChecksum(chunk, root, path, offset, c.copyBuffer) {
+					missingPartsSize -= size
+					mp.OriginFile = &originFile{
+						Root:   root,
+						Path:   path,
+						Offset: offset,
+					}
+				}
+			case internal.ChunkTypeZeros:
+				missingPartsSize -= size
+				mp.Hole = true
+				// Mark all chunks belonging to the missing part as holes
+				for i := range mp.Chunks {
+					mp.Chunks[i].Hole = true
+				}
+			}
+			missingParts = append(missingParts, mp)
 		}
 	}
 	// There are some missing files.  Prepare a multirange request for the missing chunks.
-	if len(missingChunks) > 0 {
-		missingChunks = mergeMissingChunks(missingChunks, maxNumberMissingChunks)
-		if err := c.retrieveMissingFiles(dest, dirfd, missingChunks, options); err != nil {
+	if len(missingParts) > 0 {
+		missingParts = mergeMissingChunks(missingParts, maxNumberMissingChunks)
+		if err := c.retrieveMissingFiles(dest, dirfd, missingParts, options); err != nil {
 			return output, err
 		}
 	}
@@ -1141,31 +1665,69 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions) (gra
 	}
 
 	if totalChunksSize > 0 {
-		logrus.Debugf("Missing %d bytes out of %d (%.2f %%)", missingChunksSize, totalChunksSize, float32(missingChunksSize*100.0)/float32(totalChunksSize))
+		logrus.Debugf("Missing %d bytes out of %d (%.2f %%)", missingPartsSize, totalChunksSize, float32(missingPartsSize*100.0)/float32(totalChunksSize))
 	}
 	return output, nil
 }
 
+func mustSkipFile(fileType compressedFileType, e internal.FileMetadata) bool {
+	// only the estargz format has metadata files to skip; never skip for other formats.
+	if fileType != fileTypeEstargz {
+		return false
+	}
+	switch e.Name {
+	// entries with one of these names are the estargz metadata files: skip them.
+	case estargz.PrefetchLandmark, estargz.NoPrefetchLandmark, estargz.TOCTarName:
+		return true
+	}
+	return false
+}
+
 func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []internal.FileMetadata) ([]internal.FileMetadata, error) {
-	var mergedEntries []internal.FileMetadata
-	var prevEntry *internal.FileMetadata
-	for _, entry := range entries {
-		e := entry
+	countNextChunks := func(start int) int {
+		count := 0
+		for _, e := range entries[start:] {
+			if e.Type != TypeChunk {
+				return count
+			}
+			count++
+		}
+		return count
+	}
 
-		// ignore the metadata files for the estargz format.
-		if fileType == fileTypeEstargz && (e.Name == estargz.PrefetchLandmark || e.Name == estargz.NoPrefetchLandmark || e.Name == estargz.TOCTarName) {
+	size := 0
+	for _, entry := range entries {
+		if mustSkipFile(fileType, entry) {
 			continue
 		}
+		if entry.Type != TypeChunk {
+			size++
+		}
+	}
 
+	mergedEntries := make([]internal.FileMetadata, size)
+	m := 0
+	for i := 0; i < len(entries); i++ {
+		e := entries[i]
+		if mustSkipFile(fileType, e) {
+			continue
+		}
 		if e.Type == TypeChunk {
-			if prevEntry == nil || prevEntry.Type != TypeReg {
-				return nil, errors.New("chunk type without a regular file")
+			return nil, fmt.Errorf("chunk type without a regular file")
+		}
+
+		if e.Type == TypeReg {
+			nChunks := countNextChunks(i + 1)
+
+			e.Chunks = make([]*internal.FileMetadata, nChunks+1)
+			for j := 0; j <= nChunks; j++ {
+				e.Chunks[j] = &entries[i+j]
+				e.EndOffset = entries[i+j].EndOffset
 			}
-			prevEntry.EndOffset = e.EndOffset
-			continue
+			i += nChunks
 		}
-		mergedEntries = append(mergedEntries, e)
-		prevEntry = &e
+		mergedEntries[m] = e
+		m++
 	}
 	// stargz/estargz doesn't store EndOffset so let's calculate it here
 	lastOffset := c.tocOffset
@@ -1176,6 +1738,47 @@ func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []i
 		if mergedEntries[i].Offset != 0 {
 			lastOffset = mergedEntries[i].Offset
 		}
+
+		lastChunkOffset := mergedEntries[i].EndOffset
+		for j := len(mergedEntries[i].Chunks) - 1; j >= 0; j-- {
+			mergedEntries[i].Chunks[j].EndOffset = lastChunkOffset
+			mergedEntries[i].Chunks[j].Size = mergedEntries[i].Chunks[j].EndOffset - mergedEntries[i].Chunks[j].Offset
+			lastChunkOffset = mergedEntries[i].Chunks[j].Offset
+		}
 	}
 	return mergedEntries, nil
 }
+
+// validateChunkChecksum reports whether the (at most) chunk.ChunkSize bytes starting at offset in
+// the file $root/$path have the same digest as chunk.ChunkDigest.  Any error is reported as false.
+func validateChunkChecksum(chunk *internal.FileMetadata, root, path string, offset int64, copyBuffer []byte) bool {
+	parentDirfd, err := unix.Open(root, unix.O_PATH, 0)
+	if err != nil {
+		return false
+	}
+	defer unix.Close(parentDirfd)
+
+	fd, err := openFileUnderRoot(path, parentDirfd, unix.O_RDONLY, 0)
+	if err != nil {
+		return false
+	}
+	defer fd.Close()
+
+	if _, err := unix.Seek(int(fd.Fd()), offset, 0); err != nil {
+		return false
+	}
+
+	r := io.LimitReader(fd, chunk.ChunkSize)
+	digester := digest.Canonical.Digester()
+
+	if _, err := io.CopyBuffer(digester.Hash(), r, copyBuffer); err != nil {
+		return false
+	}
+
+	digest, err := digest.Parse(chunk.ChunkDigest)
+	if err != nil {
+		return false
+	}
+
+	return digester.Digest() == digest
+}
diff --git a/vendor/github.com/containers/storage/pkg/config/config.go b/vendor/github.com/containers/storage/pkg/config/config.go
index e6622cf1466..f6e0cfcfe86 100644
--- a/vendor/github.com/containers/storage/pkg/config/config.go
+++ b/vendor/github.com/containers/storage/pkg/config/config.go
@@ -12,109 +12,109 @@ type ThinpoolOptionsConfig struct {
 	// grown. This is specified in terms of % of pool size. So a value of
 	// 20 means that when threshold is hit, pool will be grown by 20% of
 	// existing pool size.
-	AutoExtendPercent string `toml:"autoextend_percent"`
+	AutoExtendPercent string `toml:"autoextend_percent,omitempty"`
 
 	// AutoExtendThreshold determines the pool extension threshold in terms
 	// of percentage of pool size. For example, if threshold is 60, that
 	// means when pool is 60% full, threshold has been hit.
-	AutoExtendThreshold string `toml:"autoextend_threshold"`
+	AutoExtendThreshold string `toml:"autoextend_threshold,omitempty"`
 
 	// BaseSize specifies the size to use when creating the base device,
 	// which limits the size of images and containers.
-	BaseSize string `toml:"basesize"`
+	BaseSize string `toml:"basesize,omitempty"`
 
 	// BlockSize specifies a custom blocksize to use for the thin pool.
-	BlockSize string `toml:"blocksize"`
+	BlockSize string `toml:"blocksize,omitempty"`
 
 	// DirectLvmDevice specifies a custom block storage device to use for
 	// the thin pool.
-	DirectLvmDevice string `toml:"directlvm_device"`
+	DirectLvmDevice string `toml:"directlvm_device,omitempty"`
 
 	// DirectLvmDeviceForcewipes device even if device already has a
 	// filesystem
-	DirectLvmDeviceForce string `toml:"directlvm_device_force"`
+	DirectLvmDeviceForce string `toml:"directlvm_device_force,omitempty"`
 
 	// Fs specifies the filesystem type to use for the base device.
-	Fs string `toml:"fs"`
+	Fs string `toml:"fs,omitempty"`
 
 	// log_level sets the log level of devicemapper.
-	LogLevel string `toml:"log_level"`
+	LogLevel string `toml:"log_level,omitempty"`
 
 	// MetadataSize specifies the size of the metadata for the thinpool
 	// It will be used with the `pvcreate --metadata` option.
-	MetadataSize string `toml:"metadatasize"`
+	MetadataSize string `toml:"metadatasize,omitempty"`
 
 	// MinFreeSpace specifies the min free space percent in a thin pool
 	// require for new device creation to
-	MinFreeSpace string `toml:"min_free_space"`
+	MinFreeSpace string `toml:"min_free_space,omitempty"`
 
 	// MkfsArg specifies extra mkfs arguments to be used when creating the
 	// basedevice.
-	MkfsArg string `toml:"mkfsarg"`
+	MkfsArg string `toml:"mkfsarg,omitempty"`
 
 	// MountOpt specifies extra mount options used when mounting the thin
 	// devices.
-	MountOpt string `toml:"mountopt"`
+	MountOpt string `toml:"mountopt,omitempty"`
 
 	// Size
-	Size string `toml:"size"`
+	Size string `toml:"size,omitempty"`
 
 	// UseDeferredDeletion marks device for deferred deletion
-	UseDeferredDeletion string `toml:"use_deferred_deletion"`
+	UseDeferredDeletion string `toml:"use_deferred_deletion,omitempty"`
 
 	// UseDeferredRemoval marks device for deferred removal
-	UseDeferredRemoval string `toml:"use_deferred_removal"`
+	UseDeferredRemoval string `toml:"use_deferred_removal,omitempty"`
 
 	// XfsNoSpaceMaxRetriesFreeSpace specifies the maximum number of
 	// retries XFS should attempt to complete IO when ENOSPC (no space)
 	// error is returned by underlying storage device.
-	XfsNoSpaceMaxRetries string `toml:"xfs_nospace_max_retries"`
+	XfsNoSpaceMaxRetries string `toml:"xfs_nospace_max_retries,omitempty"`
 }
 
 type AufsOptionsConfig struct {
 	// MountOpt specifies extra mount options used when mounting
-	MountOpt string `toml:"mountopt"`
+	MountOpt string `toml:"mountopt,omitempty"`
 }
 
 type BtrfsOptionsConfig struct {
 	// MinSpace is the minimal spaces allocated to the device
-	MinSpace string `toml:"min_space"`
+	MinSpace string `toml:"min_space,omitempty"`
 	// Size
-	Size string `toml:"size"`
+	Size string `toml:"size,omitempty"`
 }
 
 type OverlayOptionsConfig struct {
 	// IgnoreChownErrors is a flag for whether chown errors should be
 	// ignored when building an image.
-	IgnoreChownErrors string `toml:"ignore_chown_errors"`
+	IgnoreChownErrors string `toml:"ignore_chown_errors,omitempty"`
 	// MountOpt specifies extra mount options used when mounting
-	MountOpt string `toml:"mountopt"`
+	MountOpt string `toml:"mountopt,omitempty"`
 	// Alternative program to use for the mount of the file system
-	MountProgram string `toml:"mount_program"`
+	MountProgram string `toml:"mount_program,omitempty"`
 	// Size
-	Size string `toml:"size"`
+	Size string `toml:"size,omitempty"`
 	// Inodes is used to set a maximum inodes of the container image.
-	Inodes string `toml:"inodes"`
+	Inodes string `toml:"inodes,omitempty"`
 	// Do not create a bind mount on the storage home
-	SkipMountHome string `toml:"skip_mount_home"`
+	SkipMountHome string `toml:"skip_mount_home,omitempty"`
 	// ForceMask indicates the permissions mask (e.g. "0755") to use for new
 	// files and directories
-	ForceMask string `toml:"force_mask"`
+	ForceMask string `toml:"force_mask,omitempty"`
 }
 
 type VfsOptionsConfig struct {
 	// IgnoreChownErrors is a flag for whether chown errors should be
 	// ignored when building an image.
-	IgnoreChownErrors string `toml:"ignore_chown_errors"`
+	IgnoreChownErrors string `toml:"ignore_chown_errors,omitempty"`
 }
 
 type ZfsOptionsConfig struct {
 	// MountOpt specifies extra mount options used when mounting
-	MountOpt string `toml:"mountopt"`
+	MountOpt string `toml:"mountopt,omitempty"`
 	// Name is the File System name of the ZFS File system
-	Name string `toml:"fsname"`
+	Name string `toml:"fsname,omitempty"`
 	// Size
-	Size string `toml:"size"`
+	Size string `toml:"size,omitempty"`
 }
 
 // OptionsConfig represents the "storage.options" TOML config table.
@@ -122,82 +122,82 @@ type OptionsConfig struct {
 	// AdditionalImagesStores is the location of additional read/only
 	// Image stores.  Usually used to access Networked File System
 	// for shared image content
-	AdditionalImageStores []string `toml:"additionalimagestores"`
+	AdditionalImageStores []string `toml:"additionalimagestores,omitempty"`
 
 	// AdditionalLayerStores is the location of additional read/only
 	// Layer stores.  Usually used to access Networked File System
 	// for shared image content
 	// This API is experimental and can be changed without bumping the
 	// major version number.
-	AdditionalLayerStores []string `toml:"additionallayerstores"`
+	AdditionalLayerStores []string `toml:"additionallayerstores,omitempty"`
 
 	// Size
-	Size string `toml:"size"`
+	Size string `toml:"size,omitempty"`
 
 	// RemapUIDs is a list of default UID mappings to use for layers.
-	RemapUIDs string `toml:"remap-uids"`
+	RemapUIDs string `toml:"remap-uids,omitempty"`
 	// RemapGIDs is a list of default GID mappings to use for layers.
-	RemapGIDs string `toml:"remap-gids"`
+	RemapGIDs string `toml:"remap-gids,omitempty"`
 	// IgnoreChownErrors is a flag for whether chown errors should be
 	// ignored when building an image.
-	IgnoreChownErrors string `toml:"ignore_chown_errors"`
+	IgnoreChownErrors string `toml:"ignore_chown_errors,omitempty"`
 
 	// ForceMask indicates the permissions mask (e.g. "0755") to use for new
 	// files and directories.
-	ForceMask os.FileMode `toml:"force_mask"`
+	ForceMask os.FileMode `toml:"force_mask,omitempty"`
 
 	// RemapUser is the name of one or more entries in /etc/subuid which
 	// should be used to set up default UID mappings.
-	RemapUser string `toml:"remap-user"`
+	RemapUser string `toml:"remap-user,omitempty"`
 	// RemapGroup is the name of one or more entries in /etc/subgid which
 	// should be used to set up default GID mappings.
-	RemapGroup string `toml:"remap-group"`
+	RemapGroup string `toml:"remap-group,omitempty"`
 
 	// RootAutoUsernsUser is the name of one or more entries in /etc/subuid and
 	// /etc/subgid which should be used to set up automatically a userns.
-	RootAutoUsernsUser string `toml:"root-auto-userns-user"`
+	RootAutoUsernsUser string `toml:"root-auto-userns-user,omitempty"`
 
 	// AutoUsernsMinSize is the minimum size for a user namespace that is
 	// created automatically.
-	AutoUsernsMinSize uint32 `toml:"auto-userns-min-size"`
+	AutoUsernsMinSize uint32 `toml:"auto-userns-min-size,omitempty"`
 
 	// AutoUsernsMaxSize is the maximum size for a user namespace that is
 	// created automatically.
-	AutoUsernsMaxSize uint32 `toml:"auto-userns-max-size"`
+	AutoUsernsMaxSize uint32 `toml:"auto-userns-max-size,omitempty"`
 
 	// Aufs container options to be handed to aufs drivers
-	Aufs struct{ AufsOptionsConfig } `toml:"aufs"`
+	Aufs struct{ AufsOptionsConfig } `toml:"aufs,omitempty"`
 
 	// Btrfs container options to be handed to btrfs drivers
-	Btrfs struct{ BtrfsOptionsConfig } `toml:"btrfs"`
+	Btrfs struct{ BtrfsOptionsConfig } `toml:"btrfs,omitempty"`
 
 	// Thinpool container options to be handed to thinpool drivers
-	Thinpool struct{ ThinpoolOptionsConfig } `toml:"thinpool"`
+	Thinpool struct{ ThinpoolOptionsConfig } `toml:"thinpool,omitempty"`
 
 	// Overlay container options to be handed to overlay drivers
-	Overlay struct{ OverlayOptionsConfig } `toml:"overlay"`
+	Overlay struct{ OverlayOptionsConfig } `toml:"overlay,omitempty"`
 
 	// Vfs container options to be handed to VFS drivers
-	Vfs struct{ VfsOptionsConfig } `toml:"vfs"`
+	Vfs struct{ VfsOptionsConfig } `toml:"vfs,omitempty"`
 
 	// Zfs container options to be handed to ZFS drivers
-	Zfs struct{ ZfsOptionsConfig } `toml:"zfs"`
+	Zfs struct{ ZfsOptionsConfig } `toml:"zfs,omitempty"`
 
 	// Do not create a bind mount on the storage home
-	SkipMountHome string `toml:"skip_mount_home"`
+	SkipMountHome string `toml:"skip_mount_home,omitempty"`
 
 	// Alternative program to use for the mount of the file system
-	MountProgram string `toml:"mount_program"`
+	MountProgram string `toml:"mount_program,omitempty"`
 
 	// MountOpt specifies extra mount options used when mounting
-	MountOpt string `toml:"mountopt"`
+	MountOpt string `toml:"mountopt,omitempty"`
 
 	// PullOptions specifies options to be handed to pull managers
 	// This API is experimental and can be changed without bumping the major version number.
-	PullOptions map[string]string `toml:"pull_options"`
+	PullOptions map[string]string `toml:"pull_options,omitempty"`
 
 	// DisableVolatile doesn't allow volatile mounts when it is set.
-	DisableVolatile bool `toml:"disable-volatile"`
+	DisableVolatile bool `toml:"disable-volatile,omitempty"`
 }
 
 // GetGraphDriverOptions returns the driver specific options
diff --git a/vendor/github.com/containers/storage/pkg/directory/directory_unix.go b/vendor/github.com/containers/storage/pkg/directory/directory_unix.go
index 8d58d24cac8..36e1bdd5fc8 100644
--- a/vendor/github.com/containers/storage/pkg/directory/directory_unix.go
+++ b/vendor/github.com/containers/storage/pkg/directory/directory_unix.go
@@ -1,8 +1,10 @@
+//go:build linux || darwin || freebsd || solaris
 // +build linux darwin freebsd solaris
 
 package directory
 
 import (
+	"io/fs"
 	"os"
 	"path/filepath"
 	"syscall"
@@ -21,7 +23,7 @@ func Size(dir string) (size int64, err error) {
 func Usage(dir string) (usage *DiskUsage, err error) {
 	usage = &DiskUsage{}
 	data := make(map[uint64]struct{})
-	err = filepath.Walk(dir, func(d string, fileInfo os.FileInfo, err error) error {
+	err = filepath.WalkDir(dir, func(d string, entry fs.DirEntry, err error) error {
 		if err != nil {
 			// if dir does not exist, Usage() returns the error.
 			// if dir/x disappeared while walking, Usage() ignores dir/x.
@@ -31,8 +33,9 @@ func Usage(dir string) (usage *DiskUsage, err error) {
 			return err
 		}
 
-		if fileInfo == nil {
-			return nil
+		fileInfo, err := entry.Info()
+		if err != nil {
+			return err
 		}
 
 		// Check inode to only count the sizes of files with multiple hard links once.
@@ -44,9 +47,8 @@ func Usage(dir string) (usage *DiskUsage, err error) {
 
 		// inode is not a uint64 on all platforms. Cast it to avoid issues.
 		data[uint64(inode)] = struct{}{}
-
 		// Ignore directory sizes
-		if fileInfo.IsDir() {
+		if entry.IsDir() {
 			return nil
 		}
 
diff --git a/vendor/github.com/containers/storage/pkg/directory/directory_windows.go b/vendor/github.com/containers/storage/pkg/directory/directory_windows.go
index a7a81240bc2..482bc51a26e 100644
--- a/vendor/github.com/containers/storage/pkg/directory/directory_windows.go
+++ b/vendor/github.com/containers/storage/pkg/directory/directory_windows.go
@@ -1,8 +1,10 @@
+//go:build windows
 // +build windows
 
 package directory
 
 import (
+	"io/fs"
 	"os"
 	"path/filepath"
 )
@@ -19,11 +21,11 @@ func Size(dir string) (size int64, err error) {
 // Usage walks a directory tree and returns its total size in bytes and the number of inodes.
 func Usage(dir string) (usage *DiskUsage, err error) {
 	usage = &DiskUsage{}
-	err = filepath.Walk(dir, func(d string, fileInfo os.FileInfo, err error) error {
+	err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			// if dir does not exist, Size() returns the error.
 			// if dir/x disappeared while walking, Size() ignores dir/x.
-			if os.IsNotExist(err) && d != dir {
+			if os.IsNotExist(err) && path != dir {
 				return nil
 			}
 			return err
@@ -32,16 +34,15 @@ func Usage(dir string) (usage *DiskUsage, err error) {
 		usage.InodeCount++
 
 		// Ignore directory sizes
-		if fileInfo == nil {
+		if d.IsDir() {
 			return nil
 		}
 
-		s := fileInfo.Size()
-		if fileInfo.IsDir() || s == 0 {
-			return nil
+		fileInfo, err := d.Info()
+		if err != nil {
+			return err
 		}
-
-		usage.Size += s
+		usage.Size += fileInfo.Size()
 
 		return nil
 	})
diff --git a/vendor/github.com/containers/storage/pkg/homedir/homedir.go b/vendor/github.com/containers/storage/pkg/homedir/homedir.go
new file mode 100644
index 00000000000..85c5e76c844
--- /dev/null
+++ b/vendor/github.com/containers/storage/pkg/homedir/homedir.go
@@ -0,0 +1,52 @@
+package homedir
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+)
+
+// GetConfigHome returns $XDG_CONFIG_HOME when that variable is set to a non-empty value.
+// Otherwise it returns the ".config" directory under Get(), or an error if Get() returns "".
+//
+// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
+func GetConfigHome() (string, error) {
+	if xdgConfigHome := os.Getenv("XDG_CONFIG_HOME"); xdgConfigHome != "" {
+		return xdgConfigHome, nil
+	}
+	home := Get()
+	if home == "" {
+		return "", errors.New("could not get either XDG_CONFIG_HOME or HOME")
+	}
+	return filepath.Join(home, ".config"), nil
+}
+
+// GetDataHome returns $XDG_DATA_HOME when that variable is set to a non-empty value.
+// Otherwise it returns the ".local/share" directory under Get(), or an error if Get() returns "".
+//
+// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
+func GetDataHome() (string, error) {
+	if xdgDataHome := os.Getenv("XDG_DATA_HOME"); xdgDataHome != "" {
+		return xdgDataHome, nil
+	}
+	home := Get()
+	if home == "" {
+		return "", errors.New("could not get either XDG_DATA_HOME or HOME")
+	}
+	return filepath.Join(home, ".local", "share"), nil
+}
+
+// GetCacheHome returns $XDG_CACHE_HOME when that variable is set to a non-empty value.
+// Otherwise it returns the ".cache" directory under Get(), or an error if Get() returns "".
+//
+// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
+func GetCacheHome() (string, error) {
+	if xdgCacheHome := os.Getenv("XDG_CACHE_HOME"); xdgCacheHome != "" {
+		return xdgCacheHome, nil
+	}
+	home := Get()
+	if home == "" {
+		return "", errors.New("could not get either XDG_CACHE_HOME or HOME")
+	}
+	return filepath.Join(home, ".cache"), nil
+}
diff --git a/vendor/github.com/containers/storage/pkg/homedir/homedir_others.go b/vendor/github.com/containers/storage/pkg/homedir/homedir_others.go
index 06b53854b93..027db259c19 100644
--- a/vendor/github.com/containers/storage/pkg/homedir/homedir_others.go
+++ b/vendor/github.com/containers/storage/pkg/homedir/homedir_others.go
@@ -18,18 +18,3 @@ func GetRuntimeDir() (string, error) {
 func StickRuntimeDirContents(files []string) ([]string, error) {
 	return nil, errors.New("homedir.StickRuntimeDirContents() is not supported on this system")
 }
-
-// GetDataHome is unsupported on non-linux system.
-func GetDataHome() (string, error) {
-	return "", errors.New("homedir.GetDataHome() is not supported on this system")
-}
-
-// GetConfigHome is unsupported on non-linux system.
-func GetConfigHome() (string, error) {
-	return "", errors.New("homedir.GetConfigHome() is not supported on this system")
-}
-
-// GetCacheHome is unsupported on non-linux system.
-func GetCacheHome() (string, error) {
-	return "", errors.New("homedir.GetCacheHome() is not supported on this system")
-}
diff --git a/vendor/github.com/containers/storage/pkg/homedir/homedir_unix.go b/vendor/github.com/containers/storage/pkg/homedir/homedir_unix.go
index 2475e351bb5..33177bdf306 100644
--- a/vendor/github.com/containers/storage/pkg/homedir/homedir_unix.go
+++ b/vendor/github.com/containers/storage/pkg/homedir/homedir_unix.go
@@ -93,48 +93,3 @@ func stick(f string) error {
 	m |= os.ModeSticky
 	return os.Chmod(f, m)
 }
-
-// GetDataHome returns XDG_DATA_HOME.
-// GetDataHome returns $HOME/.local/share and nil error if XDG_DATA_HOME is not set.
-//
-// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
-func GetDataHome() (string, error) {
-	if xdgDataHome := os.Getenv("XDG_DATA_HOME"); xdgDataHome != "" {
-		return xdgDataHome, nil
-	}
-	home := Get()
-	if home == "" {
-		return "", errors.New("could not get either XDG_DATA_HOME or HOME")
-	}
-	return filepath.Join(home, ".local", "share"), nil
-}
-
-// GetConfigHome returns XDG_CONFIG_HOME.
-// GetConfigHome returns $HOME/.config and nil error if XDG_CONFIG_HOME is not set.
-//
-// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
-func GetConfigHome() (string, error) {
-	if xdgConfigHome := os.Getenv("XDG_CONFIG_HOME"); xdgConfigHome != "" {
-		return xdgConfigHome, nil
-	}
-	home := Get()
-	if home == "" {
-		return "", errors.New("could not get either XDG_CONFIG_HOME or HOME")
-	}
-	return filepath.Join(home, ".config"), nil
-}
-
-// GetCacheHome returns XDG_CACHE_HOME.
-// GetCacheHome returns $HOME/.cache and nil error if XDG_CACHE_HOME is not set.
-//
-// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
-func GetCacheHome() (string, error) {
-	if xdgCacheHome := os.Getenv("XDG_CACHE_HOME"); xdgCacheHome != "" {
-		return xdgCacheHome, nil
-	}
-	home := Get()
-	if home == "" {
-		return "", errors.New("could not get either XDG_CACHE_HOME or HOME")
-	}
-	return filepath.Join(home, ".cache"), nil
-}
diff --git a/vendor/github.com/containers/storage/pkg/homedir/homedir_windows.go b/vendor/github.com/containers/storage/pkg/homedir/homedir_windows.go
index 4f2615ed32f..af65f2c03de 100644
--- a/vendor/github.com/containers/storage/pkg/homedir/homedir_windows.go
+++ b/vendor/github.com/containers/storage/pkg/homedir/homedir_windows.go
@@ -17,7 +17,12 @@ func Key() string {
 // environment variables depending on the target operating system.
 // Returned path should be used with "path/filepath" to form new paths.
 func Get() string {
-	return os.Getenv(Key())
+	home := os.Getenv(Key())
+	if home != "" {
+		return home
+	}
+	home, _ = os.UserHomeDir()
+	return home
 }
 
 // GetShortcutString returns the string that is shortcut to user's home directory
diff --git a/vendor/github.com/containers/storage/pkg/idtools/idtools.go b/vendor/github.com/containers/storage/pkg/idtools/idtools.go
index 83bc8c34ff4..7a8fec0ce5f 100644
--- a/vendor/github.com/containers/storage/pkg/idtools/idtools.go
+++ b/vendor/github.com/containers/storage/pkg/idtools/idtools.go
@@ -3,15 +3,18 @@ package idtools
 import (
 	"bufio"
 	"fmt"
+	"io/ioutil"
 	"os"
 	"os/user"
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 	"syscall"
 
 	"github.com/containers/storage/pkg/system"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 )
 
 // IDMap contains a single entry for user namespace range remapping. An array
@@ -82,7 +85,7 @@ func GetRootUIDGID(uidMap, gidMap []IDMap) (int, int, error) {
 	if len(uidMap) == 1 && uidMap[0].Size == 1 {
 		uid = uidMap[0].HostID
 	} else {
-		uid, err = toHost(0, uidMap)
+		uid, err = RawToHost(0, uidMap)
 		if err != nil {
 			return -1, -1, err
 		}
@@ -90,7 +93,7 @@ func GetRootUIDGID(uidMap, gidMap []IDMap) (int, int, error) {
 	if len(gidMap) == 1 && gidMap[0].Size == 1 {
 		gid = gidMap[0].HostID
 	} else {
-		gid, err = toHost(0, gidMap)
+		gid, err = RawToHost(0, gidMap)
 		if err != nil {
 			return -1, -1, err
 		}
@@ -98,10 +101,14 @@ func GetRootUIDGID(uidMap, gidMap []IDMap) (int, int, error) {
 	return uid, gid, nil
 }
 
-// toContainer takes an id mapping, and uses it to translate a
-// host ID to the remapped ID. If no map is provided, then the translation
-// assumes a 1-to-1 mapping and returns the passed in id
-func toContainer(hostID int, idMap []IDMap) (int, error) {
+// RawToContainer takes an id mapping, and uses it to translate a host ID to
+// the remapped ID. If no map is provided, then the translation assumes a
+// 1-to-1 mapping and returns the passed in id.
+//
+// If you wish to map a (uid,gid) combination you should use the corresponding
+// IDMappings methods, which ensure that you are mapping the correct ID against
+// the correct mapping.
+func RawToContainer(hostID int, idMap []IDMap) (int, error) {
 	if idMap == nil {
 		return hostID, nil
 	}
@@ -114,10 +121,14 @@ func toContainer(hostID int, idMap []IDMap) (int, error) {
 	return -1, fmt.Errorf("Host ID %d cannot be mapped to a container ID", hostID)
 }
 
-// toHost takes an id mapping and a remapped ID, and translates the
-// ID to the mapped host ID. If no map is provided, then the translation
-// assumes a 1-to-1 mapping and returns the passed in id #
-func toHost(contID int, idMap []IDMap) (int, error) {
+// RawToHost takes an id mapping and a remapped ID, and translates the ID to
+// the mapped host ID. If no map is provided, then the translation assumes a
+// 1-to-1 mapping and returns the passed in id.
+//
+// If you wish to map a (uid,gid) combination you should use the corresponding
+// IDMappings methods, which ensure that you are mapping the correct ID against
+// the correct mapping.
+func RawToHost(contID int, idMap []IDMap) (int, error) {
 	if idMap == nil {
 		return contID, nil
 	}
@@ -182,31 +193,87 @@ func (i *IDMappings) RootPair() IDPair {
 }
 
 // ToHost returns the host UID and GID for the container uid, gid.
-// Remapping is only performed if the ids aren't already the remapped root ids
 func (i *IDMappings) ToHost(pair IDPair) (IDPair, error) {
+	var err error
+	var target IDPair
+
+	target.UID, err = RawToHost(pair.UID, i.uids)
+	if err != nil {
+		return target, err
+	}
+
+	target.GID, err = RawToHost(pair.GID, i.gids)
+	return target, err
+}
+
+var (
+	overflowUIDOnce, overflowGIDOnce sync.Once
+	overflowUID, overflowGID         int
+)
+
+// getOverflowUID returns the UID mapped to the overflow user. The value is computed on the first call and cached.
+func getOverflowUID() int {
+	overflowUIDOnce.Do(func() {
+		// 65534 is the value on older kernels where /proc/sys/kernel/overflowuid is not present
+		overflowUID = 65534
+		if content, err := ioutil.ReadFile("/proc/sys/kernel/overflowuid"); err == nil {
+			// The file content ends with a newline, which Atoi rejects unless it is trimmed.
+			if tmp, err := strconv.Atoi(strings.TrimSpace(string(content))); err == nil {
+				overflowUID = tmp
+			}
+		}
+	})
+	return overflowUID
+}
+
+// getOverflowGID returns the GID mapped to the overflow group. The value is computed on the first call and cached.
+func getOverflowGID() int {
+	overflowGIDOnce.Do(func() {
+		// 65534 is the value on older kernels where /proc/sys/kernel/overflowgid is not present
+		overflowGID = 65534
+		if content, err := ioutil.ReadFile("/proc/sys/kernel/overflowgid"); err == nil {
+			// The file content ends with a newline, which Atoi rejects unless it is trimmed.
+			if tmp, err := strconv.Atoi(strings.TrimSpace(string(content))); err == nil {
+				overflowGID = tmp
+			}
+		}
+	})
+	return overflowGID
+}
+
+// ToHostOverflow returns the host UID and GID for the container uid, gid.
+// Remapping is only performed if the ids aren't already the remapped root ids.
+// If the mapping is not possible because the target ID is not mapped into
+// the namespace, then the overflow ID is used.
+func (i *IDMappings) ToHostOverflow(pair IDPair) (IDPair, error) {
 	var err error
 	target := i.RootPair()
 
 	if pair.UID != target.UID {
-		target.UID, err = toHost(pair.UID, i.uids)
+		target.UID, err = RawToHost(pair.UID, i.uids)
 		if err != nil {
-			return target, err
+			target.UID = getOverflowUID()
+			logrus.Debugf("Failed to map UID %v to the target mapping, using the overflow ID %v", pair.UID, target.UID)
 		}
 	}
 
 	if pair.GID != target.GID {
-		target.GID, err = toHost(pair.GID, i.gids)
+		target.GID, err = RawToHost(pair.GID, i.gids)
+		if err != nil {
+			target.GID = getOverflowGID()
+			logrus.Debugf("Failed to map GID %v to the target mapping, using the overflow ID %v", pair.GID, target.GID)
+		}
 	}
-	return target, err
+	return target, nil
 }
 
 // ToContainer returns the container UID and GID for the host uid and gid
 func (i *IDMappings) ToContainer(pair IDPair) (int, int, error) {
-	uid, err := toContainer(pair.UID, i.uids)
+	uid, err := RawToContainer(pair.UID, i.uids)
 	if err != nil {
 		return -1, -1, err
 	}
-	gid, err := toContainer(pair.GID, i.gids)
+	gid, err := RawToContainer(pair.GID, i.gids)
 	return uid, gid, err
 }
 
@@ -293,7 +360,7 @@ func parseSubidFile(path, username string) (ranges, error) {
 
 func checkChownErr(err error, name string, uid, gid int) error {
 	if e, ok := err.(*os.PathError); ok && e.Err == syscall.EINVAL {
-		return errors.Wrapf(err, "potentially insufficient UIDs or GIDs available in user namespace (requested %d:%d for %s): Check /etc/subuid and /etc/subgid", uid, gid, name)
+		return errors.Wrapf(err, "potentially insufficient UIDs or GIDs available in user namespace (requested %d:%d for %s): Check /etc/subuid and /etc/subgid if configured locally and run podman-system-migrate", uid, gid, name)
 	}
 	return err
 }
diff --git a/vendor/github.com/containers/storage/pkg/idtools/idtools_supported.go b/vendor/github.com/containers/storage/pkg/idtools/idtools_supported.go
index db50a62e4c0..6e6e3b22bc9 100644
--- a/vendor/github.com/containers/storage/pkg/idtools/idtools_supported.go
+++ b/vendor/github.com/containers/storage/pkg/idtools/idtools_supported.go
@@ -12,11 +12,21 @@ import (
 #cgo LDFLAGS: -l subid
 #include <shadow/subid.h>
 #include <stdlib.h>
+#include <stdio.h>
 const char *Prog = "storage";
+FILE *shadow_logfd = NULL;
+
 struct subid_range get_range(struct subid_range *ranges, int i)
 {
-    return ranges[i];
+	shadow_logfd = stderr;
+	return ranges[i];
 }
+
+#if !defined(SUBID_ABI_MAJOR) || (SUBID_ABI_MAJOR < 4)
+# define subid_get_uid_ranges get_subuid_ranges
+# define subid_get_gid_ranges get_subgid_ranges
+#endif
+
 */
 import "C"
 
@@ -32,9 +42,9 @@ func readSubid(username string, isUser bool) (ranges, error) {
 	var nRanges C.int
 	var cRanges *C.struct_subid_range
 	if isUser {
-		nRanges = C.get_subuid_ranges(cUsername, &cRanges)
+		nRanges = C.subid_get_uid_ranges(cUsername, &cRanges)
 	} else {
-		nRanges = C.get_subgid_ranges(cUsername, &cRanges)
+		nRanges = C.subid_get_gid_ranges(cUsername, &cRanges)
 	}
 	if nRanges < 0 {
 		return nil, errors.New("cannot read subids")
diff --git a/vendor/github.com/containers/storage/pkg/idtools/idtools_unix.go b/vendor/github.com/containers/storage/pkg/idtools/idtools_unix.go
index 9776b2a1287..7f270c61f82 100644
--- a/vendor/github.com/containers/storage/pkg/idtools/idtools_unix.go
+++ b/vendor/github.com/containers/storage/pkg/idtools/idtools_unix.go
@@ -46,6 +46,9 @@ func mkdirAs(path string, mode os.FileMode, ownerUID, ownerGID int, mkAll, chown
 		// walk back to "/" looking for directories which do not exist
 		// and add them to the paths array for chown after creation
 		dirPath := path
+		if !filepath.IsAbs(dirPath) {
+			return fmt.Errorf("path: %s should be absolute", dirPath)
+		}
 		for {
 			dirPath = filepath.Dir(dirPath)
 			if dirPath == "/" {
diff --git a/vendor/github.com/containers/storage/pkg/mount/flags_freebsd.go b/vendor/github.com/containers/storage/pkg/mount/flags_freebsd.go
new file mode 100644
index 00000000000..3ba99cf9351
--- /dev/null
+++ b/vendor/github.com/containers/storage/pkg/mount/flags_freebsd.go
@@ -0,0 +1,48 @@
+package mount
+
+import (
+	"golang.org/x/sys/unix"
+)
+
+const ( // mount flag values for FreeBSD; the non-zero ones come from golang.org/x/sys/unix
+	// RDONLY will mount the file system read-only.
+	RDONLY = unix.MNT_RDONLY
+
+	// NOSUID will not allow set-user-identifier or set-group-identifier bits to
+	// take effect.
+	NOSUID = unix.MNT_NOSUID
+
+	// NOEXEC will not allow execution of any binaries on the mounted file system.
+	NOEXEC = unix.MNT_NOEXEC
+
+	// SYNCHRONOUS will allow I/O to the file system to be done synchronously.
+	SYNCHRONOUS = unix.MNT_SYNCHRONOUS
+
+	// REMOUNT will attempt to remount an already-mounted file system. This is
+	// commonly used to change the mount flags for a file system, especially to
+	// make a readonly file system writeable. It does not change device or mount
+	// point.
+	REMOUNT = unix.MNT_UPDATE
+
+	// NOATIME will not update the file access time when reading from a file.
+	NOATIME = unix.MNT_NOATIME
+
+	mntDetach = unix.MNT_FORCE // NOTE(review): unexported; presumably the flag used for a detach-style unmount — confirm at the call site
+
+	NODIRATIME  = 0 // this and every constant below is 0 here, so OR-ing it into a flag word changes nothing
+	NODEV       = 0
+	DIRSYNC     = 0
+	MANDLOCK    = 0
+	BIND        = 0
+	RBIND       = 0
+	UNBINDABLE  = 0
+	RUNBINDABLE = 0
+	PRIVATE     = 0
+	RPRIVATE    = 0
+	SLAVE       = 0
+	RSLAVE      = 0
+	SHARED      = 0
+	RSHARED     = 0
+	RELATIME    = 0
+	STRICTATIME = 0
+)
diff --git a/vendor/github.com/containers/storage/pkg/mount/flags_unsupported.go b/vendor/github.com/containers/storage/pkg/mount/flags_unsupported.go
index 9afd26d4c06..ee0f593a50a 100644
--- a/vendor/github.com/containers/storage/pkg/mount/flags_unsupported.go
+++ b/vendor/github.com/containers/storage/pkg/mount/flags_unsupported.go
@@ -1,4 +1,5 @@
-// +build !linux
+//go:build !linux && !freebsd
+// +build !linux,!freebsd
 
 package mount
 
diff --git a/vendor/github.com/containers/storage/pkg/mount/mounter_freebsd.go b/vendor/github.com/containers/storage/pkg/mount/mounter_freebsd.go
index b31cf99d0ff..2404e331dee 100644
--- a/vendor/github.com/containers/storage/pkg/mount/mounter_freebsd.go
+++ b/vendor/github.com/containers/storage/pkg/mount/mounter_freebsd.go
@@ -1,3 +1,6 @@
+//go:build freebsd && cgo
+// +build freebsd,cgo
+
 package mount
 
 /*
@@ -28,14 +31,25 @@ func allocateIOVecs(options []string) []C.struct_iovec {
 func mount(device, target, mType string, flag uintptr, data string) error {
 	isNullFS := false
 
-	xs := strings.Split(data, ",")
-	for _, x := range xs {
-		if x == "bind" {
-			isNullFS = true
+	options := []string{"fspath", target}
+
+	if data != "" {
+		xs := strings.Split(data, ",")
+		for _, x := range xs {
+			if x == "bind" {
+				isNullFS = true
+				continue
+			}
+			opt := strings.SplitN(x, "=", 2)
+			options = append(options, opt[0])
+			if len(opt) == 2 {
+				options = append(options, opt[1])
+			} else {
+				options = append(options, "")
+			}
 		}
 	}
 
-	options := []string{"fspath", target}
 	if isNullFS {
 		options = append(options, "fstype", "nullfs", "target", device)
 	} else {
diff --git a/vendor/github.com/containers/storage/pkg/mount/mounter_unsupported.go b/vendor/github.com/containers/storage/pkg/mount/mounter_unsupported.go
index 9d20cfbf869..74fe666090f 100644
--- a/vendor/github.com/containers/storage/pkg/mount/mounter_unsupported.go
+++ b/vendor/github.com/containers/storage/pkg/mount/mounter_unsupported.go
@@ -1,4 +1,6 @@
-// +build !linux,!freebsd
+//go:build !linux && !(freebsd && cgo)
+// +build !linux
+// +build !freebsd !cgo
 
 package mount
 
diff --git a/vendor/github.com/containers/storage/pkg/reexec/command_freebsd.go b/vendor/github.com/containers/storage/pkg/reexec/command_freebsd.go
new file mode 100644
index 00000000000..6f63ae99170
--- /dev/null
+++ b/vendor/github.com/containers/storage/pkg/reexec/command_freebsd.go
@@ -0,0 +1,37 @@
+// +build freebsd
+
+package reexec
+
+import (
+	"context"
+	"os"
+	"os/exec"
+
+	"golang.org/x/sys/unix"
+)
+
+// Self returns the path to the current process's binary from the kern.proc.pathname sysctl, or os.Args[0] if that lookup fails.
+func Self() string {
+	if path, err := unix.SysctlArgs("kern.proc.pathname", -1); err == nil {
+		return path
+	}
+	return os.Args[0]
+}
+
+// Command returns *exec.Cmd which has Path as current binary.
+// For example if current binary is "docker" at "/usr/bin/", then cmd.Path will
+// be set to "/usr/bin/docker". It panics if reexec.Init() has not been called.
+func Command(args ...string) *exec.Cmd {
+	panicIfNotInitialized()
+	cmd := exec.Command(Self())
+	cmd.Args = args // args replaces the whole argv; args[0] is the name Init() looks up in the child
+	return cmd
+}
+
+// CommandContext returns *exec.Cmd which has Path as current binary and is bound to ctx. It panics if reexec.Init() has not been called.
+func CommandContext(ctx context.Context, args ...string) *exec.Cmd {
+	panicIfNotInitialized()
+	cmd := exec.CommandContext(ctx, Self())
+	cmd.Args = args
+	return cmd
+}
diff --git a/vendor/github.com/containers/storage/pkg/reexec/command_linux.go b/vendor/github.com/containers/storage/pkg/reexec/command_linux.go
index 372bee7321f..d3dd86d349f 100644
--- a/vendor/github.com/containers/storage/pkg/reexec/command_linux.go
+++ b/vendor/github.com/containers/storage/pkg/reexec/command_linux.go
@@ -17,6 +17,7 @@ func Self() string {
 // This will use the in-memory version (/proc/self/exe) of the current binary,
 // it is thus safe to delete or replace the on-disk binary (os.Args[0]).
 func Command(args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	cmd := exec.Command(Self())
 	cmd.Args = args
 	return cmd
@@ -26,6 +27,7 @@ func Command(args ...string) *exec.Cmd {
 // This will use the in-memory version (/proc/self/exe) of the current binary,
 // it is thus safe to delete or replace the on-disk binary (os.Args[0]).
 func CommandContext(ctx context.Context, args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	cmd := exec.CommandContext(ctx, Self())
 	cmd.Args = args
 	return cmd
diff --git a/vendor/github.com/containers/storage/pkg/reexec/command_unix.go b/vendor/github.com/containers/storage/pkg/reexec/command_unix.go
index 1ecaa906fed..a56ada2161e 100644
--- a/vendor/github.com/containers/storage/pkg/reexec/command_unix.go
+++ b/vendor/github.com/containers/storage/pkg/reexec/command_unix.go
@@ -1,4 +1,5 @@
-// +build freebsd solaris darwin
+//go:build solaris || darwin
+// +build solaris darwin
 
 package reexec
 
@@ -17,6 +18,7 @@ func Self() string {
 // For example if current binary is "docker" at "/usr/bin/", then cmd.Path will
 // be set to "/usr/bin/docker".
 func Command(args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	cmd := exec.Command(Self())
 	cmd.Args = args
 	return cmd
@@ -24,6 +26,7 @@ func Command(args ...string) *exec.Cmd {
 
 // CommandContext returns *exec.Cmd which has Path as current binary.
 func CommandContext(ctx context.Context, args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	cmd := exec.CommandContext(ctx, Self())
 	cmd.Args = args
 	return cmd
diff --git a/vendor/github.com/containers/storage/pkg/reexec/command_unsupported.go b/vendor/github.com/containers/storage/pkg/reexec/command_unsupported.go
index 9d937426854..5b3605f319c 100644
--- a/vendor/github.com/containers/storage/pkg/reexec/command_unsupported.go
+++ b/vendor/github.com/containers/storage/pkg/reexec/command_unsupported.go
@@ -9,10 +9,12 @@ import (
 
 // Command is unsupported on operating systems apart from Linux, Windows, Solaris and Darwin.
 func Command(args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	return nil
 }
 
 // CommandContext is unsupported on operating systems apart from Linux, Windows, Solaris and Darwin.
 func CommandContext(ctx context.Context, args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	return nil
 }
diff --git a/vendor/github.com/containers/storage/pkg/reexec/command_windows.go b/vendor/github.com/containers/storage/pkg/reexec/command_windows.go
index 673ab476abd..d868564767f 100644
--- a/vendor/github.com/containers/storage/pkg/reexec/command_windows.go
+++ b/vendor/github.com/containers/storage/pkg/reexec/command_windows.go
@@ -17,6 +17,7 @@ func Self() string {
 // For example if current binary is "docker.exe" at "C:\", then cmd.Path will
 // be set to "C:\docker.exe".
 func Command(args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	cmd := exec.Command(Self())
 	cmd.Args = args
 	return cmd
@@ -26,6 +27,7 @@ func Command(args ...string) *exec.Cmd {
 // For example if current binary is "docker.exe" at "C:\", then cmd.Path will
 // be set to "C:\docker.exe".
 func CommandContext(ctx context.Context, args ...string) *exec.Cmd {
+	panicIfNotInitialized()
 	cmd := exec.CommandContext(ctx, Self())
 	cmd.Args = args
 	return cmd
diff --git a/vendor/github.com/containers/storage/pkg/reexec/reexec.go b/vendor/github.com/containers/storage/pkg/reexec/reexec.go
index c56671d9192..a1938cd4f34 100644
--- a/vendor/github.com/containers/storage/pkg/reexec/reexec.go
+++ b/vendor/github.com/containers/storage/pkg/reexec/reexec.go
@@ -7,7 +7,10 @@ import (
 	"path/filepath"
 )
 
-var registeredInitializers = make(map[string]func())
+var (
+	registeredInitializers = make(map[string]func())
+	initWasCalled          = false
+)
 
 // Register adds an initialization func under the specified name
 func Register(name string, initializer func()) {
@@ -22,6 +25,7 @@ func Register(name string, initializer func()) {
 // initialization function was called.
 func Init() bool {
 	initializer, exists := registeredInitializers[os.Args[0]]
+	initWasCalled = true
 	if exists {
 		initializer()
 
@@ -30,6 +34,21 @@ func Init() bool {
 	return false
 }
 
+// panicIfNotInitialized panics unless Init has already been called in this process.
+func panicIfNotInitialized() {
+	if initWasCalled {
+		return
+	}
+	// reexec runs subroutines in subprocesses because they would otherwise
+	// have unacceptable side effects on the main thread.  Reaching this
+	// point means some package in the program needs that mechanism, so
+	// main() must begin with this boilerplate, or an equivalent:
+	//     if reexec.Init() {
+	//         return
+	//     }
+	panic("a library subroutine needed to run a subprocess, but reexec.Init() was not called in main()")
+}
+
 func naiveSelf() string {
 	name := os.Args[0]
 	if filepath.Base(name) == name {
diff --git a/vendor/github.com/containers/storage/pkg/system/xattrs_linux.go b/vendor/github.com/containers/storage/pkg/system/xattrs_linux.go
index 10355848bdb..6b47c4e717f 100644
--- a/vendor/github.com/containers/storage/pkg/system/xattrs_linux.go
+++ b/vendor/github.com/containers/storage/pkg/system/xattrs_linux.go
@@ -13,6 +13,9 @@ const (
 
 	// Operation not supported
 	EOPNOTSUPP unix.Errno = unix.EOPNOTSUPP
+
+	// Value is too small or too large for maximum size allowed
+	EOVERFLOW unix.Errno = unix.EOVERFLOW
 )
 
 // Lgetxattr retrieves the value of the extended attribute identified by attr
diff --git a/vendor/github.com/containers/storage/pkg/system/xattrs_unsupported.go b/vendor/github.com/containers/storage/pkg/system/xattrs_unsupported.go
index bc8b8e3a5fe..3fc27f0b139 100644
--- a/vendor/github.com/containers/storage/pkg/system/xattrs_unsupported.go
+++ b/vendor/github.com/containers/storage/pkg/system/xattrs_unsupported.go
@@ -10,6 +10,9 @@ const (
 
 	// Operation not supported
 	EOPNOTSUPP syscall.Errno = syscall.Errno(0)
+
+	// Value is too small or too large for maximum size allowed
+	EOVERFLOW syscall.Errno = syscall.Errno(0)
 )
 
 // Lgetxattr is not supported on platforms other than linux.
diff --git a/vendor/github.com/containers/storage/pkg/unshare/unshare_linux.go b/vendor/github.com/containers/storage/pkg/unshare/unshare_linux.go
index 6d351ce80a9..baeb8f1aab5 100644
--- a/vendor/github.com/containers/storage/pkg/unshare/unshare_linux.go
+++ b/vendor/github.com/containers/storage/pkg/unshare/unshare_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
 // +build linux
 
 package unshare
@@ -9,6 +10,7 @@ import (
 	"io"
 	"os"
 	"os/exec"
+	"os/signal"
 	"os/user"
 	"runtime"
 	"strconv"
@@ -75,6 +77,28 @@ func getRootlessGID() int {
 	return os.Getegid()
 }
 
+// IsSetID reports whether the file at path has all of the modeid bits set in
+// its FileMode (Setuid|Setgid) or, failing that, has the matching file capability capid as effective.
+func IsSetID(path string, modeid os.FileMode, capid capability.Cap) (bool, error) {
+	st, err := os.Stat(path)
+	if err != nil {
+		return false, err
+	}
+	// The mode bits alone are enough; file capabilities are only loaded when they are absent.
+	if st.Mode()&modeid == modeid {
+		return true, nil
+	}
+
+	fileCaps, err := capability.NewFile2(path)
+	if err != nil {
+		return false, err
+	}
+	if loadErr := fileCaps.Load(); loadErr != nil {
+		return false, loadErr
+	}
+	return fileCaps.Get(capability.EFFECTIVE, capid), nil
+}
+
 func (c *Cmd) Start() error {
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
@@ -214,15 +238,26 @@ func (c *Cmd) Start() error {
 			gidmapSet := false
 			// Set the GID map.
 			if c.UseNewgidmap {
-				cmd := exec.Command("newgidmap", append([]string{pidString}, strings.Fields(strings.Replace(g.String(), "\n", " ", -1))...)...)
+				path, err := exec.LookPath("newgidmap")
+				if err != nil {
+					return errors.Wrapf(err, "error finding newgidmap")
+				}
+				cmd := exec.Command(path, append([]string{pidString}, strings.Fields(strings.Replace(g.String(), "\n", " ", -1))...)...)
 				g.Reset()
 				cmd.Stdout = g
 				cmd.Stderr = g
-				err := cmd.Run()
-				if err == nil {
+				if err := cmd.Run(); err == nil {
 					gidmapSet = true
 				} else {
 					logrus.Warnf("Error running newgidmap: %v: %s", err, g.String())
+					isSetgid, err := IsSetID(path, os.ModeSetgid, capability.CAP_SETGID)
+					if err != nil {
+						logrus.Warnf("Failed to check for setgid on %s: %v", path, err)
+					} else {
+						if !isSetgid {
+							logrus.Warnf("%s should be setgid or have filecaps setgid", path)
+						}
+					}
 					logrus.Warnf("Falling back to single mapping")
 					g.Reset()
 					g.Write([]byte(fmt.Sprintf("0 %d 1\n", os.Getegid())))
@@ -261,17 +296,29 @@ func (c *Cmd) Start() error {
 				fmt.Fprintf(u, "%d %d %d\n", m.ContainerID, m.HostID, m.Size)
 			}
 			uidmapSet := false
-			// Set the GID map.
+			// Set the UID map.
 			if c.UseNewuidmap {
-				cmd := exec.Command("newuidmap", append([]string{pidString}, strings.Fields(strings.Replace(u.String(), "\n", " ", -1))...)...)
+				path, err := exec.LookPath("newuidmap")
+				if err != nil {
+					return errors.Wrapf(err, "error finding newuidmap")
+				}
+				cmd := exec.Command(path, append([]string{pidString}, strings.Fields(strings.Replace(u.String(), "\n", " ", -1))...)...)
 				u.Reset()
 				cmd.Stdout = u
 				cmd.Stderr = u
-				err := cmd.Run()
-				if err == nil {
+				if err := cmd.Run(); err == nil {
 					uidmapSet = true
 				} else {
 					logrus.Warnf("Error running newuidmap: %v: %s", err, u.String())
+					isSetuid, err := IsSetID(path, os.ModeSetuid, capability.CAP_SETUID)
+					if err != nil {
+						logrus.Warnf("Failed to check for setuid on %s: %v", path, err)
+					} else {
+						if !isSetuid {
+							logrus.Warnf("%s should be setuid or have filecaps setuid", path)
+						}
+					}
+
 					logrus.Warnf("Falling back to single mapping")
 					u.Reset()
 					u.Write([]byte(fmt.Sprintf("0 %d 1\n", os.Geteuid())))
@@ -484,6 +531,30 @@ func MaybeReexecUsingUserNamespace(evenForRoot bool) {
 
 	// Finish up.
 	logrus.Debugf("Running %+v with environment %+v, UID map %+v, and GID map %+v", cmd.Cmd.Args, os.Environ(), cmd.UidMappings, cmd.GidMappings)
+
+	// Forward SIGHUP, SIGINT, and SIGTERM to our child process.
+	interrupted := make(chan os.Signal, 100)
+	defer func() {
+		signal.Stop(interrupted)
+		close(interrupted)
+	}()
+	cmd.Hook = func(int) error {
+		go func() {
+			for receivedSignal := range interrupted {
+				cmd.Cmd.Process.Signal(receivedSignal)
+			}
+		}()
+		return nil
+	}
+	signal.Notify(interrupted, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
+
+	// Make sure our child process gets SIGKILLed if we exit, for whatever
+	// reason, before it does.
+	if cmd.Cmd.SysProcAttr == nil {
+		cmd.Cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+	cmd.Cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
+
 	ExecRunnable(cmd, nil)
 }
 
@@ -501,11 +572,11 @@ func ExecRunnable(cmd Runnable, cleanup func()) {
 			if exitError.ProcessState.Exited() {
 				if waitStatus, ok := exitError.ProcessState.Sys().(syscall.WaitStatus); ok {
 					if waitStatus.Exited() {
-						logrus.Errorf("%v", exitError)
+						logrus.Debugf("%v", exitError)
 						exit(waitStatus.ExitStatus())
 					}
 					if waitStatus.Signaled() {
-						logrus.Errorf("%v", exitError)
+						logrus.Debugf("%v", exitError)
 						exit(int(waitStatus.Signal()) + 128)
 					}
 				}
diff --git a/vendor/github.com/containers/storage/storage.conf b/vendor/github.com/containers/storage/storage.conf
index 722750c0cca..c17dd6d37ea 100644
--- a/vendor/github.com/containers/storage/storage.conf
+++ b/vendor/github.com/containers/storage/storage.conf
@@ -1,5 +1,14 @@
 # This file is is the configuration file for all tools
-# that use the containers/storage library.
+# that use the containers/storage library. The storage.conf file
+# overrides all other storage.conf files. Container engines using the
+# containers/storage library do not inherit fields from other storage.conf
+# files.
+#
+#  Note: The storage.conf file overrides other storage.conf files based on this precedence:
+#      /usr/share/containers/storage.conf
+#      /etc/containers/storage.conf
+#      $HOME/.config/containers/storage.conf
+#      $XDG_CONFIG_HOME/containers/storage.conf (If XDG_CONFIG_HOME is set)
 # See man 5 containers-storage.conf for more information
 # The "container storage" table contains all of the server options.
 [storage]
@@ -11,8 +20,14 @@ driver = "overlay"
 runroot = "/run/containers/storage"
 
 # Primary Read/Write location of container storage
+# When changing the graphroot location on an SELinux system, you must
+# ensure the labeling matches the default location's labels with the
+# following commands:
+# semanage fcontext -a -e /var/lib/containers/storage /NEWSTORAGEPATH
+# restorecon -R -v /NEWSTORAGEPATH
 graphroot = "/var/lib/containers/storage"
 
+
 # Storage path for rootless users
 #
 # rootless_storage_path = "$HOME/.local/share/containers/storage"
diff --git a/vendor/github.com/containers/storage/storage.conf-freebsd b/vendor/github.com/containers/storage/storage.conf-freebsd
new file mode 100644
index 00000000000..34d80152c02
--- /dev/null
+++ b/vendor/github.com/containers/storage/storage.conf-freebsd
@@ -0,0 +1,205 @@
+# This file is the configuration file for all tools
+# that use the containers/storage library. The storage.conf file
+# overrides all other storage.conf files. Container engines using the
+# containers/storage library do not inherit fields from other storage.conf
+# files.
+#
+#  Note: The storage.conf file overrides other storage.conf files based on this precedence:
+#      /usr/local/share/containers/storage.conf
+#      /usr/local/etc/containers/storage.conf
+#      $HOME/.config/containers/storage.conf
+#      $XDG_CONFIG_HOME/containers/storage.conf (If XDG_CONFIG_HOME is set)
+# See man 5 containers-storage.conf for more information
+# The "container storage" table contains all of the server options.
+[storage]
+
+# Default Storage Driver, Must be set for proper operation.
+driver = "zfs"
+
+# Temporary storage location
+runroot = "/var/run/containers/storage"
+
+# Primary Read/Write location of container storage
+graphroot = "/var/db/containers/storage"
+
+
+# Storage path for rootless users
+#
+# rootless_storage_path = "$HOME/.local/share/containers/storage"
+
+[storage.options]
+# Storage options to be passed to underlying storage drivers
+
+# AdditionalImageStores is used to pass paths to additional Read/Only image stores
+# Must be comma separated list.
+additionalimagestores = [
+]
+
+# Remap-UIDs/GIDs is the mapping from UIDs/GIDs as they should appear inside of
+# a container, to the UIDs/GIDs as they should appear outside of the container,
+# and the length of the range of UIDs/GIDs.  Additional mapped sets can be
+# listed and will be heeded by libraries, but there are limits to the number of
+# mappings which the kernel will allow when you later attempt to run a
+# container.
+#
+# remap-uids = 0:1668442479:65536
+# remap-gids = 0:1668442479:65536
+
+# Remap-User/Group is a user name which can be used to look up one or more UID/GID
+# ranges in the /etc/subuid or /etc/subgid file.  Mappings are set up starting
+# with an in-container ID of 0 and then a host-level ID taken from the lowest
+# range that matches the specified name, and using the length of that range.
+# Additional ranges are then assigned, using the ranges which specify the
+# lowest host-level IDs first, to the lowest not-yet-mapped in-container ID,
+# until all of the entries have been used for maps.
+#
+# remap-user = "containers"
+# remap-group = "containers"
+
+# Root-auto-userns-user is a user name which can be used to look up one or more UID/GID
+# ranges in the /etc/subuid and /etc/subgid files.  These ranges will be partitioned
+# to containers configured to automatically create a user namespace.  Containers
+# configured to automatically create a user namespace can still overlap with containers
+# having an explicit mapping set.
+# This setting is ignored when running as rootless.
+# root-auto-userns-user = "storage"
+#
+# Auto-userns-min-size is the minimum size for a user namespace created automatically.
+# auto-userns-min-size=1024
+#
+# Auto-userns-max-size is the maximum size for a user namespace created automatically.
+# auto-userns-max-size=65536
+
+[storage.options.overlay]
+# ignore_chown_errors can be set to allow a non privileged user running with
+# a single UID within a user namespace to run containers. The user can pull
+# and use any image even those with multiple uids.  Note multiple UIDs will be
+# squashed down to the default uid in the container.  These images will have no
+# separation between the users in the container. Only supported for the overlay
+# and vfs drivers.
+#ignore_chown_errors = "false"
+
+# Inodes is used to set a maximum inodes of the container image.
+# inodes = ""
+
+# Path to a helper program to use for mounting the file system instead of mounting it
+# directly.
+#mount_program = "/usr/bin/fuse-overlayfs"
+
+# mountopt specifies comma separated list of extra mount options
+mountopt = "nodev"
+
+# Set to skip a PRIVATE bind mount on the storage home directory.
+# skip_mount_home = "false"
+
+# Size is used to set a maximum size of the container image.
+# size = ""
+
+# ForceMask specifies the permissions mask that is used for new files and
+# directories.
+#
+# The values "shared" and "private" are accepted.
+# Octal permission masks are also accepted.
+#
+#  "": No value specified.
+#     All files/directories, get set with the permissions identified within the
+#     image.
+#  "private": it is equivalent to 0700.
+#     All files/directories get set with 0700 permissions.  The owner has rwx
+#     access to the files. No other users on the system can access the files.
+#     This setting could be used with networked based homedirs.
+#  "shared": it is equivalent to 0755.
+#     The owner has rwx access to the files and everyone else can read, access
+#     and execute them. This setting is useful for sharing containers storage
+#     with other users.  For instance have a storage owned by root but shared
+#     to rootless users as an additional store.
+#     NOTE:  All files within the image are made readable and executable by any
+#     user on the system. Even /etc/shadow within your image is now readable by
+#     any user.
+#
+#   OCTAL: Users can experiment with other OCTAL Permissions.
+#
+#  Note: The force_mask Flag is an experimental feature, it could change in the
+#  future.  When "force_mask" is set the original permission mask is stored in
+#  the "user.containers.override_stat" xattr and the "mount_program" option must
+#  be specified. Mount programs like "/usr/bin/fuse-overlayfs" present the
+#  extended attribute permissions to processes within containers rather than the
+#  "force_mask" permissions.
+#
+# force_mask = ""
+
+[storage.options.thinpool]
+# Storage Options for thinpool
+
+# autoextend_percent determines the amount by which pool needs to be
+# grown. This is specified in terms of % of pool size. So a value of 20 means
+# that when threshold is hit, pool will be grown by 20% of existing
+# pool size.
+# autoextend_percent = "20"
+
+# autoextend_threshold determines the pool extension threshold in terms
+# of percentage of pool size. For example, if threshold is 60, that means when
+# pool is 60% full, threshold has been hit.
+# autoextend_threshold = "80"
+
+# basesize specifies the size to use when creating the base device, which
+# limits the size of images and containers.
+# basesize = "10G"
+
+# blocksize specifies a custom blocksize to use for the thin pool.
+# blocksize="64k"
+
+# directlvm_device specifies a custom block storage device to use for the
+# thin pool. Required if you setup devicemapper.
+# directlvm_device = ""
+
+# directlvm_device_force wipes device even if device already has a filesystem.
+# directlvm_device_force = "True"
+
+# fs specifies the filesystem type to use for the base device.
+# fs="xfs"
+
+# log_level sets the log level of devicemapper.
+# 0: LogLevelSuppress 0 (Default)
+# 2: LogLevelFatal
+# 3: LogLevelErr
+# 4: LogLevelWarn
+# 5: LogLevelNotice
+# 6: LogLevelInfo
+# 7: LogLevelDebug
+# log_level = "7"
+
+# min_free_space specifies the min free space percent in a thin pool required for
+# new device creation to succeed. Valid values are from 0% - 99%.
+# Value 0% disables
+# min_free_space = "10%"
+
+# mkfsarg specifies extra mkfs arguments to be used when creating the base
+# device.
+# mkfsarg = ""
+
+# metadata_size is used to set the `pvcreate --metadatasize` options when
+# creating thin devices. Default is 128k
+# metadata_size = ""
+
+# Size is used to set a maximum size of the container image.
+# size = ""
+
+# use_deferred_removal marks devicemapper block device for deferred removal.
+# If the thinpool is in use when the driver attempts to remove it, the driver
+# tells the kernel to remove it as soon as possible. Note this does not free
+# up the disk space, use deferred deletion to fully remove the thinpool.
+# use_deferred_removal = "True"
+
+# use_deferred_deletion marks thinpool device for deferred deletion.
+# If the device is busy when the driver attempts to delete it, the driver
+# will attempt to delete device every 30 seconds until successful.
+# If the program using the driver exits, the driver will continue attempting
+# to cleanup the next time the driver is used. Deferred deletion permanently
+# deletes the device and all data stored in device will be lost.
+# use_deferred_deletion = "True"
+
+# xfs_nospace_max_retries specifies the maximum number of retries XFS should
+# attempt to complete IO when ENOSPC (no space) error is returned by
+# underlying storage device.
+# xfs_nospace_max_retries = "0"
diff --git a/vendor/github.com/containers/storage/store.go b/vendor/github.com/containers/storage/store.go
index 169c7d1513e..45912d0ca59 100644
--- a/vendor/github.com/containers/storage/store.go
+++ b/vendor/github.com/containers/storage/store.go
@@ -31,6 +31,14 @@ import (
 	"github.com/pkg/errors"
 )
 
+type updateNameOperation int
+
+const (
+	setNames updateNameOperation = iota
+	addNames
+	removeNames
+)
+
 var (
 	stores     []*store
 	storesLock sync.Mutex
@@ -368,8 +376,17 @@ type Store interface {
 
 	// SetNames changes the list of names for a layer, image, or container.
 	// Duplicate names are removed from the list automatically.
+	// Deprecated: Prone to race conditions, suggested alternatives are `AddNames` and `RemoveNames`.
 	SetNames(id string, names []string) error
 
+	// AddNames adds the given names to a layer, image, or container.
+	// Duplicate names are removed from the list automatically.
+	AddNames(id string, names []string) error
+
+	// RemoveNames removes the given names from a layer, image, or container.
+	// Duplicate names are removed from the list automatically.
+	RemoveNames(id string, names []string) error
+
 	// ListImageBigData retrieves a list of the (possibly large) chunks of
 	// named data associated with an image.
 	ListImageBigData(id string) ([]string, error)
@@ -575,10 +592,11 @@ type ContainerOptions struct {
 	// container's layer will inherit settings from the image's top layer
 	// or, if it is not being created based on an image, the Store object.
 	types.IDMappingOptions
-	LabelOpts []string
-	Flags     map[string]interface{}
-	MountOpts []string
-	Volatile  bool
+	LabelOpts  []string
+	Flags      map[string]interface{}
+	MountOpts  []string
+	Volatile   bool
+	StorageOpt map[string]string
 }
 
 type store struct {
@@ -646,17 +664,21 @@ func GetStore(options types.StoreOptions) (Store, error) {
 	storesLock.Lock()
 	defer storesLock.Unlock()
 
+	// return if BOTH run and graph root are matched, otherwise our run-root can be overridden if the graph is found first
 	for _, s := range stores {
-		if s.graphRoot == options.GraphRoot && (options.GraphDriverName == "" || s.graphDriverName == options.GraphDriverName) {
+		if (s.graphRoot == options.GraphRoot) && (s.runRoot == options.RunRoot) && (options.GraphDriverName == "" || s.graphDriverName == options.GraphDriverName) {
 			return s, nil
 		}
 	}
 
-	if options.GraphRoot == "" {
-		return nil, errors.Wrap(ErrIncompleteOptions, "no storage root specified")
-	}
-	if options.RunRoot == "" {
-		return nil, errors.Wrap(ErrIncompleteOptions, "no storage runroot specified")
+	// If only one of run-root or graph-root is passed, the other is defaulted; only error if we have neither.
+	switch {
+	case options.RunRoot == "" && options.GraphRoot == "":
+		return nil, errors.Wrap(ErrIncompleteOptions, "no storage runroot or graphroot specified")
+	case options.GraphRoot == "":
+		options.GraphRoot = types.Options().GraphRoot
+	case options.RunRoot == "":
+		options.RunRoot = types.Options().RunRoot
 	}
 
 	if err := os.MkdirAll(options.RunRoot, 0700); err != nil {
@@ -1173,6 +1195,11 @@ func (s *store) imageTopLayerForMapping(image *Image, ristore ROImageStore, crea
 				if layer == nil {
 					layer = cLayer
 					parentLayer = cParentLayer
+					if store != rlstore {
+						// The layer is in another store, so we cannot
+						// create a mapped version of it to the image.
+						createMappedLayer = false
+					}
 				}
 			}
 		}
@@ -1384,7 +1411,7 @@ func (s *store) CreateContainer(id string, names []string, image, layer, metadat
 		options.Flags["MountLabel"] = mountLabel
 	}
 
-	clayer, err := rlstore.Create(layer, imageTopLayer, nil, options.Flags["MountLabel"].(string), nil, layerOptions, true)
+	clayer, err := rlstore.Create(layer, imageTopLayer, nil, options.Flags["MountLabel"].(string), options.StorageOpt, layerOptions, true)
 	if err != nil {
 		return nil, err
 	}
@@ -1608,7 +1635,7 @@ func (s *store) ImageBigData(id, key string) ([]byte, error) {
 		}
 	}
 	if foundImage {
-		return nil, errors.Wrapf(os.ErrNotExist, "error locating item named %q for image with ID %q", key, id)
+		return nil, errors.Wrapf(os.ErrNotExist, "error locating item named %q for image with ID %q (consider removing the image to resolve the issue)", key, id)
 	}
 	return nil, errors.Wrapf(ErrImageUnknown, "error locating image with ID %q", id)
 }
@@ -2045,7 +2072,20 @@ func dedupeNames(names []string) []string {
 	return deduped
 }
 
+// Deprecated: Prone to race conditions, suggested alternatives are `AddNames` and `RemoveNames`.
 func (s *store) SetNames(id string, names []string) error {
+	return s.updateNames(id, names, setNames)
+}
+
+func (s *store) AddNames(id string, names []string) error {
+	return s.updateNames(id, names, addNames)
+}
+
+func (s *store) RemoveNames(id string, names []string) error {
+	return s.updateNames(id, names, removeNames)
+}
+
+func (s *store) updateNames(id string, names []string, op updateNameOperation) error {
 	deduped := dedupeNames(names)
 
 	rlstore, err := s.LayerStore()
@@ -2058,7 +2098,16 @@ func (s *store) SetNames(id string, names []string) error {
 		return err
 	}
 	if rlstore.Exists(id) {
-		return rlstore.SetNames(id, deduped)
+		switch op {
+		case setNames:
+			return rlstore.SetNames(id, deduped)
+		case removeNames:
+			return rlstore.RemoveNames(id, deduped)
+		case addNames:
+			return rlstore.AddNames(id, deduped)
+		default:
+			return errInvalidUpdateNameOperation
+		}
 	}
 
 	ristore, err := s.ImageStore()
@@ -2071,7 +2120,16 @@ func (s *store) SetNames(id string, names []string) error {
 		return err
 	}
 	if ristore.Exists(id) {
-		return ristore.SetNames(id, deduped)
+		switch op {
+		case setNames:
+			return ristore.SetNames(id, deduped)
+		case removeNames:
+			return ristore.RemoveNames(id, deduped)
+		case addNames:
+			return ristore.AddNames(id, deduped)
+		default:
+			return errInvalidUpdateNameOperation
+		}
 	}
 
 	// Check is id refers to a RO Store
@@ -2109,7 +2167,16 @@ func (s *store) SetNames(id string, names []string) error {
 		return err
 	}
 	if rcstore.Exists(id) {
-		return rcstore.SetNames(id, deduped)
+		switch op {
+		case setNames:
+			return rcstore.SetNames(id, deduped)
+		case removeNames:
+			return rcstore.RemoveNames(id, deduped)
+		case addNames:
+			return rcstore.AddNames(id, deduped)
+		default:
+			return errInvalidUpdateNameOperation
+		}
 	}
 	return ErrLayerUnknown
 }
@@ -2370,22 +2437,16 @@ func (s *store) DeleteImage(id string, commit bool) (layers []string, err error)
 		if err != nil {
 			return nil, err
 		}
-		childrenByParent := make(map[string]*[]string)
+		childrenByParent := make(map[string][]string)
 		for _, layer := range layers {
-			parent := layer.Parent
-			if list, ok := childrenByParent[parent]; ok {
-				newList := append(*list, layer.ID)
-				childrenByParent[parent] = &newList
-			} else {
-				childrenByParent[parent] = &([]string{layer.ID})
-			}
+			childrenByParent[layer.Parent] = append(childrenByParent[layer.Parent], layer.ID)
 		}
-		otherImagesByTopLayer := make(map[string]string)
+		otherImagesTopLayers := make(map[string]struct{})
 		for _, img := range images {
 			if img.ID != id {
-				otherImagesByTopLayer[img.TopLayer] = img.ID
+				otherImagesTopLayers[img.TopLayer] = struct{}{}
 				for _, layerID := range img.MappedTopLayers {
-					otherImagesByTopLayer[layerID] = img.ID
+					otherImagesTopLayers[layerID] = struct{}{}
 				}
 			}
 		}
@@ -2395,43 +2456,44 @@ func (s *store) DeleteImage(id string, commit bool) (layers []string, err error)
 			}
 		}
 		layer := image.TopLayer
-		lastRemoved := ""
+		layersToRemoveMap := make(map[string]struct{})
+		layersToRemove = append(layersToRemove, image.MappedTopLayers...)
+		for _, mappedTopLayer := range image.MappedTopLayers {
+			layersToRemoveMap[mappedTopLayer] = struct{}{}
+		}
 		for layer != "" {
 			if rcstore.Exists(layer) {
 				break
 			}
-			if _, ok := otherImagesByTopLayer[layer]; ok {
+			if _, used := otherImagesTopLayers[layer]; used {
 				break
 			}
 			parent := ""
 			if l, err := rlstore.Get(layer); err == nil {
 				parent = l.Parent
 			}
-			hasOtherRefs := func() bool {
+			hasChildrenNotBeingRemoved := func() bool {
 				layersToCheck := []string{layer}
 				if layer == image.TopLayer {
 					layersToCheck = append(layersToCheck, image.MappedTopLayers...)
 				}
 				for _, layer := range layersToCheck {
-					if childList, ok := childrenByParent[layer]; ok && childList != nil {
-						children := *childList
-						for _, child := range children {
-							if child != lastRemoved {
-								return true
+					if childList := childrenByParent[layer]; len(childList) > 0 {
+						for _, child := range childList {
+							if _, childIsSlatedForRemoval := layersToRemoveMap[child]; childIsSlatedForRemoval {
+								continue
 							}
+							return true
 						}
 					}
 				}
 				return false
 			}
-			if hasOtherRefs() {
+			if hasChildrenNotBeingRemoved() {
 				break
 			}
-			lastRemoved = layer
-			if layer == image.TopLayer {
-				layersToRemove = append(layersToRemove, image.MappedTopLayers...)
-			}
-			layersToRemove = append(layersToRemove, lastRemoved)
+			layersToRemove = append(layersToRemove, layer)
+			layersToRemoveMap[layer] = struct{}{}
 			layer = parent
 		}
 	} else {
@@ -2499,23 +2561,29 @@ func (s *store) DeleteContainer(id string) error {
 			gcpath := filepath.Join(s.GraphRoot(), middleDir, container.ID)
 			wg.Add(1)
 			go func() {
-				var err error
-				for attempts := 0; attempts < 50; attempts++ {
-					err = os.RemoveAll(gcpath)
-					if err == nil || !system.IsEBUSY(err) {
-						break
-					}
-					time.Sleep(time.Millisecond * 100)
+				defer wg.Done()
+				// attempt a simple rm -rf first
+				err := os.RemoveAll(gcpath)
+				if err == nil {
+					errChan <- nil
+					return
 				}
-				errChan <- err
-				wg.Done()
+				// and if it fails get to the more complicated cleanup
+				errChan <- system.EnsureRemoveAll(gcpath)
 			}()
 
 			rcpath := filepath.Join(s.RunRoot(), middleDir, container.ID)
 			wg.Add(1)
 			go func() {
-				errChan <- os.RemoveAll(rcpath)
-				wg.Done()
+				defer wg.Done()
+				// attempt a simple rm -rf first
+				err := os.RemoveAll(rcpath)
+				if err == nil {
+					errChan <- nil
+					return
+				}
+				// and if it fails get to the more complicated cleanup
+				errChan <- system.EnsureRemoveAll(rcpath)
 			}()
 
 			go func() {
@@ -2524,17 +2592,12 @@ func (s *store) DeleteContainer(id string) error {
 			}()
 
 			var errors []error
-			for {
-				select {
-				case err, ok := <-errChan:
-					if !ok {
-						return multierror.Append(nil, errors...).ErrorOrNil()
-					}
-					if err != nil {
-						errors = append(errors, err)
-					}
+			for err := range errChan {
+				if err != nil {
+					errors = append(errors, err)
 				}
 			}
+			return multierror.Append(nil, errors...).ErrorOrNil()
 		}
 	}
 	return ErrNotAContainer
@@ -2830,10 +2893,33 @@ func (s *store) Diff(from, to string, options *DiffOptions) (io.ReadCloser, erro
 	if err != nil {
 		return nil, err
 	}
+
+	// NaiveDiff could cause mounts to happen without a lock, so be safe
+	// and treat the .Diff operation as a Mount.
+	s.graphLock.Lock()
+	defer s.graphLock.Unlock()
+
+	modified, err := s.graphLock.Modified()
+	if err != nil {
+		return nil, err
+	}
+
+	// We need to make sure the home mount is present when the Mount is done.
+	if modified {
+		s.graphDriver = nil
+		s.layerStore = nil
+		s.graphDriver, err = s.getGraphDriver()
+		if err != nil {
+			return nil, err
+		}
+		s.lastLoaded = time.Now()
+	}
+
 	for _, s := range append([]ROLayerStore{lstore}, lstores...) {
 		store := s
 		store.RLock()
 		if err := store.ReloadIfChanged(); err != nil {
+			store.Unlock()
 			return nil, err
 		}
 		if store.Exists(to) {
diff --git a/vendor/github.com/containers/storage/types/errors.go b/vendor/github.com/containers/storage/types/errors.go
index d920d12eb50..ad12ffdbf2d 100644
--- a/vendor/github.com/containers/storage/types/errors.go
+++ b/vendor/github.com/containers/storage/types/errors.go
@@ -55,4 +55,6 @@ var (
 	ErrStoreIsReadOnly = errors.New("called a write method on a read-only store")
 	// ErrNotSupported is returned when the requested functionality is not supported.
 	ErrNotSupported = errors.New("not supported")
+	// ErrInvalidMappings is returned when the specified mappings are invalid.
+	ErrInvalidMappings = errors.New("invalid mappings specified")
 )
diff --git a/vendor/github.com/containers/storage/types/options.go b/vendor/github.com/containers/storage/types/options.go
index f9bf7e6b658..d318421a44b 100644
--- a/vendor/github.com/containers/storage/types/options.go
+++ b/vendor/github.com/containers/storage/types/options.go
@@ -3,44 +3,57 @@ package types
 import (
 	"fmt"
 	"os"
-	"os/exec"
 	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 
 	"github.com/BurntSushi/toml"
-	"github.com/containers/storage/drivers/overlay"
 	cfg "github.com/containers/storage/pkg/config"
 	"github.com/containers/storage/pkg/idtools"
 	"github.com/sirupsen/logrus"
 )
 
 // TOML-friendly explicit tables used for conversions.
-type tomlConfig struct {
+type TomlConfig struct {
 	Storage struct {
-		Driver              string            `toml:"driver"`
-		RunRoot             string            `toml:"runroot"`
-		GraphRoot           string            `toml:"graphroot"`
-		RootlessStoragePath string            `toml:"rootless_storage_path"`
-		Options             cfg.OptionsConfig `toml:"options"`
+		Driver              string            `toml:"driver,omitempty"`
+		RunRoot             string            `toml:"runroot,omitempty"`
+		GraphRoot           string            `toml:"graphroot,omitempty"`
+		RootlessStoragePath string            `toml:"rootless_storage_path,omitempty"`
+		Options             cfg.OptionsConfig `toml:"options,omitempty"`
 	} `toml:"storage"`
 }
 
-// defaultConfigFile path to the system wide storage.conf file
-var (
-	defaultConfigFile    = "/etc/containers/storage.conf"
-	defaultConfigFileSet = false
-	// DefaultStoreOptions is a reasonable default set of options.
-	defaultStoreOptions StoreOptions
+const (
+	overlayDriver = "overlay"
+	overlay2      = "overlay2"
 )
 
 func init() {
-	defaultStoreOptions.RunRoot = "/run/containers/storage"
-	defaultStoreOptions.GraphRoot = "/var/lib/containers/storage"
+	defaultStoreOptions.RunRoot = defaultRunRoot
+	defaultStoreOptions.GraphRoot = defaultGraphRoot
 	defaultStoreOptions.GraphDriverName = ""
 
-	ReloadConfigurationFileIfNeeded(defaultConfigFile, &defaultStoreOptions)
+	if _, err := os.Stat(defaultOverrideConfigFile); err == nil {
+		// The DefaultConfigFile(rootless) function reports the path of
+		// the storage.conf file in use by returning defaultConfigFile,
+		// so point it at the override file, which is used by default when it exists.
+		defaultConfigFile = defaultOverrideConfigFile
+		ReloadConfigurationFileIfNeeded(defaultOverrideConfigFile, &defaultStoreOptions)
+	} else {
+		if !os.IsNotExist(err) {
+			logrus.Warningf("Attempting to use %s, %v", defaultConfigFile, err)
+		}
+		ReloadConfigurationFileIfNeeded(defaultConfigFile, &defaultStoreOptions)
+	}
+	// reload could set values to empty for run and graph root if config does not contain anything
+	if defaultStoreOptions.RunRoot == "" {
+		defaultStoreOptions.RunRoot = defaultRunRoot
+	}
+	if defaultStoreOptions.GraphRoot == "" {
+		defaultStoreOptions.GraphRoot = defaultGraphRoot
+	}
 }
 
 // defaultStoreOptionsIsolated is an internal implementation detail of DefaultStoreOptions to allow testing.
@@ -168,7 +181,6 @@ func isRootlessDriver(driver string) bool {
 // getRootlessStorageOpts returns the storage opts for containers running as non root
 func getRootlessStorageOpts(rootlessUID int, systemOpts StoreOptions) (StoreOptions, error) {
 	var opts StoreOptions
-	const overlayDriver = "overlay"
 
 	dataDir, rootlessRuntime, err := getRootlessDirInfo(rootlessUID)
 	if err != nil {
@@ -190,25 +202,16 @@ func getRootlessStorageOpts(rootlessUID int, systemOpts StoreOptions) (StoreOpti
 	if driver := os.Getenv("STORAGE_DRIVER"); driver != "" {
 		opts.GraphDriverName = driver
 	}
-	if opts.GraphDriverName == "" || opts.GraphDriverName == overlayDriver {
-		supported, err := overlay.SupportsNativeOverlay(opts.GraphRoot, rootlessRuntime)
-		if err != nil {
-			return opts, err
-		}
-		if supported {
-			opts.GraphDriverName = overlayDriver
-		} else {
-			if path, err := exec.LookPath("fuse-overlayfs"); err == nil {
-				opts.GraphDriverName = overlayDriver
-				opts.GraphDriverOptions = []string{fmt.Sprintf("overlay.mount_program=%s", path)}
-			}
-		}
-		if opts.GraphDriverName == overlayDriver {
-			for _, o := range systemOpts.GraphDriverOptions {
-				if strings.Contains(o, "ignore_chown_errors") {
-					opts.GraphDriverOptions = append(opts.GraphDriverOptions, o)
-					break
-				}
+	if opts.GraphDriverName == overlay2 {
+		logrus.Warnf("Switching default driver from overlay2 to the equivalent overlay driver.")
+		opts.GraphDriverName = overlayDriver
+	}
+
+	if opts.GraphDriverName == overlayDriver {
+		for _, o := range systemOpts.GraphDriverOptions {
+			if strings.Contains(o, "ignore_chown_errors") {
+				opts.GraphDriverOptions = append(opts.GraphDriverOptions, o)
+				break
 			}
 		}
 	}
@@ -271,7 +274,7 @@ func ReloadConfigurationFileIfNeeded(configFile string, storeOptions *StoreOptio
 // ReloadConfigurationFile parses the specified configuration file and overrides
 // the configuration in storeOptions.
 func ReloadConfigurationFile(configFile string, storeOptions *StoreOptions) {
-	config := new(tomlConfig)
+	config := new(TomlConfig)
 
 	meta, err := toml.DecodeFile(configFile, &config)
 	if err == nil {
@@ -286,7 +289,7 @@ func ReloadConfigurationFile(configFile string, storeOptions *StoreOptions) {
 		}
 	}
 
-	// Clear storeOptions of previos settings
+	// Clear storeOptions of previous settings
 	*storeOptions = StoreOptions{}
 	if config.Storage.Driver != "" {
 		storeOptions.GraphDriverName = config.Storage.Driver
@@ -295,6 +298,10 @@ func ReloadConfigurationFile(configFile string, storeOptions *StoreOptions) {
 		config.Storage.Driver = os.Getenv("STORAGE_DRIVER")
 		storeOptions.GraphDriverName = config.Storage.Driver
 	}
+	if storeOptions.GraphDriverName == overlay2 {
+		logrus.Warnf("Switching default driver from overlay2 to the equivalent overlay driver.")
+		storeOptions.GraphDriverName = overlayDriver
+	}
 	if storeOptions.GraphDriverName == "" {
 		logrus.Errorf("The storage 'driver' option must be set in %s, guarantee proper operation.", configFile)
 	}
@@ -385,3 +392,39 @@ func ReloadConfigurationFile(configFile string, storeOptions *StoreOptions) {
 func Options() StoreOptions {
 	return defaultStoreOptions
 }
+
+// Save overwrites the TomlConfig stored in storage.conf with the given conf.
+func Save(conf TomlConfig, rootless bool) error {
+	configFile, err := DefaultConfigFile(rootless)
+	if err != nil {
+		return err
+	}
+	// Drop any existing file first; a file that does not exist is not an error.
+	if err = os.Remove(configFile); !os.IsNotExist(err) && err != nil {
+		return err
+	}
+	// Recreate the file and make sure its descriptor is released on every return path.
+	f, err := os.Create(configFile)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	return toml.NewEncoder(f).Encode(conf)
+}
+
+// StorageConfig decodes the storage.conf located via DefaultConfigFile(rootless) so that callers can overwrite it.
+func StorageConfig(rootless bool) (*TomlConfig, error) {
+	config := new(TomlConfig)
+	// Locate the config file; a lookup error is returned unchanged with a nil config.
+	configFile, err := DefaultConfigFile(rootless)
+	if err != nil {
+		return nil, err
+	}
+	// NOTE(review): config is already a *TomlConfig, so &config hands the decoder a **TomlConfig; confirm this is intended.
+	_, err = toml.DecodeFile(configFile, &config)
+	if err != nil {
+		return nil, err
+	}
+
+	return config, nil
+}
diff --git a/vendor/github.com/containers/storage/types/options_darwin.go b/vendor/github.com/containers/storage/types/options_darwin.go
new file mode 100644
index 00000000000..d5ad50bc0bd
--- /dev/null
+++ b/vendor/github.com/containers/storage/types/options_darwin.go
@@ -0,0 +1,17 @@
+package types
+
+const (
+	// Default run root and graph root paths for rootful users; for rootless
+	// users the paths are constructed via getRootlessStorageOpts.
+	defaultRunRoot   string = "/run/containers/storage"
+	defaultGraphRoot string = "/var/lib/containers/storage"
+)
+
+// defaultConfigFile is the path to the system-wide storage.conf file.
+var (
+	defaultConfigFile         = "/usr/share/containers/storage.conf"
+	defaultOverrideConfigFile = "/etc/containers/storage.conf"
+	defaultConfigFileSet      = false
+	// defaultStoreOptions is a reasonable default set of options.
+	defaultStoreOptions StoreOptions
+)
diff --git a/vendor/github.com/containers/storage/types/options_freebsd.go b/vendor/github.com/containers/storage/types/options_freebsd.go
new file mode 100644
index 00000000000..d5976b6d581
--- /dev/null
+++ b/vendor/github.com/containers/storage/types/options_freebsd.go
@@ -0,0 +1,17 @@
+package types
+
+const (
+	// Default run root and graph root paths for rootful users; for rootless
+	// users the paths are constructed via getRootlessStorageOpts.
+	defaultRunRoot   string = "/var/run/containers/storage"
+	defaultGraphRoot string = "/var/db/containers/storage"
+)
+
+// defaultConfigFile is the path to the system-wide storage.conf file.
+var (
+	defaultConfigFile         = "/usr/local/share/containers/storage.conf"
+	defaultOverrideConfigFile = "/usr/local/etc/containers/storage.conf"
+	defaultConfigFileSet      = false
+	// defaultStoreOptions is a reasonable default set of options.
+	defaultStoreOptions StoreOptions
+)
diff --git a/vendor/github.com/containers/storage/types/options_linux.go b/vendor/github.com/containers/storage/types/options_linux.go
new file mode 100644
index 00000000000..d5ad50bc0bd
--- /dev/null
+++ b/vendor/github.com/containers/storage/types/options_linux.go
@@ -0,0 +1,17 @@
+package types
+
+const (
+	// Default run root and graph root paths for rootful users; for rootless
+	// users the paths are constructed via getRootlessStorageOpts.
+	defaultRunRoot   string = "/run/containers/storage"
+	defaultGraphRoot string = "/var/lib/containers/storage"
+)
+
+// defaultConfigFile is the path to the system-wide storage.conf file.
+var (
+	defaultConfigFile         = "/usr/share/containers/storage.conf"
+	defaultOverrideConfigFile = "/etc/containers/storage.conf"
+	defaultConfigFileSet      = false
+	// defaultStoreOptions is a reasonable default set of options.
+	defaultStoreOptions StoreOptions
+)
diff --git a/vendor/github.com/containers/storage/types/options_windows.go b/vendor/github.com/containers/storage/types/options_windows.go
new file mode 100644
index 00000000000..d5ad50bc0bd
--- /dev/null
+++ b/vendor/github.com/containers/storage/types/options_windows.go
@@ -0,0 +1,17 @@
+package types
+
+const (
+	// Default run root and graph root paths for rootful users; for rootless
+	// users the paths are constructed via getRootlessStorageOpts.
+	defaultRunRoot   string = "/run/containers/storage"
+	defaultGraphRoot string = "/var/lib/containers/storage"
+)
+
+// defaultConfigFile is the path to the system-wide storage.conf file.
+var (
+	defaultConfigFile         = "/usr/share/containers/storage.conf"
+	defaultOverrideConfigFile = "/etc/containers/storage.conf"
+	defaultConfigFileSet      = false
+	// defaultStoreOptions is a reasonable default set of options.
+	defaultStoreOptions StoreOptions
+)
diff --git a/vendor/github.com/containers/storage/utils.go b/vendor/github.com/containers/storage/utils.go
index 80d56041b06..37d4b79b01b 100644
--- a/vendor/github.com/containers/storage/utils.go
+++ b/vendor/github.com/containers/storage/utils.go
@@ -40,3 +40,35 @@ func validateMountOptions(mountOptions []string) error {
 	}
 	return nil
 }
+
+// applyNameOperation returns the deduplicated name list produced by applying
+// op with opParameters to oldNames; an unrecognized op yields an error.
+func applyNameOperation(oldNames []string, opParameters []string, op updateNameOperation) ([]string, error) {
+	var updated []string
+	switch op {
+	case setNames:
+		// setNames discards the old names and keeps only the new ones.
+		updated = opParameters
+	case removeNames:
+		// Keep only old names absent from opParameters (oldNames - opParameters).
+		updated = make([]string, 0, len(oldNames))
+		for _, existing := range oldNames {
+			keep := true
+			for _, unwanted := range opParameters {
+				if existing == unwanted {
+					keep = false
+				}
+			}
+			if keep {
+				updated = append(updated, existing)
+			}
+		}
+	case addNames:
+		// The new names come first, followed by the old ones.
+		updated = make([]string, 0, len(opParameters)+len(oldNames))
+		updated = append(append(updated, opParameters...), oldNames...)
+	default:
+		return updated, errInvalidUpdateNameOperation
+	}
+	return dedupeNames(updated), nil
+}
diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore
index b35f8449bf2..d31b3781527 100644
--- a/vendor/github.com/klauspost/compress/.gitignore
+++ b/vendor/github.com/klauspost/compress/.gitignore
@@ -23,3 +23,10 @@ _testmain.go
 *.test
 *.prof
 /s2/cmd/_s2sx/sfx-exe
+
+# Linux perf files
+perf.data
+perf.data.old
+
+# gdb history
+.gdb_history
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index e8ff994f8bc..c3ec9d8a78c 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,59 @@ This package provides various compression algorithms.
 
 # changelog
 
+* May 5, 2022 (v1.15.3)
+	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
+	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+
+* Apr 26, 2022 (v1.15.2)
+	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
+	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
+	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
+	* Minimum version is Go 1.16, added CI test on 1.18.
+
+* Mar 11, 2022 (v1.15.1)
+	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
+	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
+	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
+	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
+	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+
+* Mar 3, 2022 (v1.15.0)
+	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
+	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
+	* huff0: Prevent single blocks exceeding 16 bits by @klauspost in [#507](https://github.com/klauspost/compress/pull/507)
+	* flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509)
+	* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
+	* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
+
+<details>
+	<summary>See Details</summary>
+Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
+
+Asynchronous stream decompression is now faster, since the goroutine allocation much more effectively splits the workload. On typical streams this will use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
+
+While the release has been extensively tested, it is recommended to test when upgrading.
+</details>
+
+* Feb 22, 2022 (v1.14.4)
+	* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
+	* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
+	* zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501)
+	* huff0: Use static decompression buffer up to 30% faster by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)
+
+* Feb 17, 2022 (v1.14.3)
+	* flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494)  [#478](https://github.com/klauspost/compress/pull/478)
+	* flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483)
+	* s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486)
+
+* Jan 25, 2022 (v1.14.2)
+	* zstd: improve header decoder by @dsnet  [#476](https://github.com/klauspost/compress/pull/476)
+	* zstd: Add bigger default blocks  [#469](https://github.com/klauspost/compress/pull/469)
+	* zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470)
+	* zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472)
+	* flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473)
+	* zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475)
+
 * Jan 11, 2022 (v1.14.1)
 	* s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
 	* flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
@@ -53,6 +106,9 @@ This package provides various compression algorithms.
 	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
 	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
 
+<details>
+	<summary>See changes to v1.12.x</summary>
+	
 * May 25, 2021 (v1.12.3)
 	* deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374)
 	* deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375)
@@ -74,9 +130,10 @@ This package provides various compression algorithms.
 	* s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352)
 	* zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346)
 	* s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349)
+</details>
 
 <details>
-	<summary>See changes prior to v1.12.1</summary>
+	<summary>See changes to v1.11.x</summary>
 	
 * Mar 26, 2021 (v1.11.13)
 	* zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345)
@@ -135,7 +192,7 @@ This package provides various compression algorithms.
 </details>
 
 <details>
-	<summary>See changes prior to v1.11.0</summary>
+	<summary>See changes to v1.10.x</summary>
  
 * July 8, 2020 (v1.10.11) 
 	* zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278)
@@ -297,11 +354,6 @@ This package provides various compression algorithms.
 
 # deflate usage
 
-* [High Throughput Benchmark](http://blog.klauspost.com/go-gzipdeflate-benchmarks/).
-* [Small Payload/Webserver Benchmarks](http://blog.klauspost.com/gzip-performance-for-go-webservers/).
-* [Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
-* [Re-balancing Deflate Compression Levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/)
-
 The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:
 
 | old import         | new import                              | Documentation
@@ -323,6 +375,8 @@ Memory usage is typically 1MB for a Writer. stdlib is in the same range.
 If you expect to have a lot of concurrently allocated Writers consider using 
 the stateless compress described below.
 
+For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).
+
 # Stateless compression
 
 This package offers stateless compression as a special option for gzip/deflate. 
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
index 0b2e54972cd..d55ea2a7759 100644
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
 func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
-	if debugDecode {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index fd49efd75b1..25f6d1108fc 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -8,6 +8,7 @@ import (
 	"encoding/binary"
 	"fmt"
 	"io"
+	"math"
 )
 
 const (
@@ -24,6 +25,10 @@ const (
 	codegenCodeCount = 19
 	badCode          = 255
 
+	// maxPredefinedTokens is the maximum number of tokens
+	// where we check if fixed size is smaller.
+	maxPredefinedTokens = 250
+
 	// bufferFlushSize indicates the buffer size
 	// after which bytes are flushed to the writer.
 	// Should preferably be a multiple of 6, since
@@ -36,8 +41,11 @@ const (
 	bufferSize = bufferFlushSize + 8
 )
 
+// Minimum length code that emits bits.
+const lengthExtraBitsMinCode = 8
+
 // The number of extra bits needed by length code X - LENGTH_CODES_START.
-var lengthExtraBits = [32]int8{
+var lengthExtraBits = [32]uint8{
 	/* 257 */ 0, 0, 0,
 	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
 	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
@@ -51,6 +59,9 @@ var lengthBase = [32]uint8{
 	64, 80, 96, 112, 128, 160, 192, 224, 255,
 }
 
+// Minimum offset code that emits bits.
+const offsetExtraBitsMinCode = 4
+
 // offset code word extra bits.
 var offsetExtraBits = [32]int8{
 	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
@@ -78,10 +89,10 @@ func init() {
 
 	for i := range offsetCombined[:] {
 		// Don't use extended window values...
-		if offsetBase[i] > 0x006000 {
+		if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 {
 			continue
 		}
-		offsetCombined[i] = uint32(offsetExtraBits[i])<<16 | (offsetBase[i])
+		offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8)
 	}
 }
 
@@ -97,7 +108,7 @@ type huffmanBitWriter struct {
 	// Data waiting to be written is bytes[0:nbytes]
 	// and then the low nbits of bits.
 	bits            uint64
-	nbits           uint16
+	nbits           uint8
 	nbytes          uint8
 	lastHuffMan     bool
 	literalEncoding *huffmanEncoder
@@ -215,7 +226,7 @@ func (w *huffmanBitWriter) write(b []byte) {
 	_, w.err = w.writer.Write(b)
 }
 
-func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
+func (w *huffmanBitWriter) writeBits(b int32, nb uint8) {
 	w.bits |= uint64(b) << (w.nbits & 63)
 	w.nbits += nb
 	if w.nbits >= 48 {
@@ -571,7 +582,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	// Fixed Huffman baseline.
 	var literalEncoding = fixedLiteralEncoding
 	var offsetEncoding = fixedOffsetEncoding
-	var size = w.fixedSize(extraBits)
+	var size = math.MaxInt32
+	if tokens.n < maxPredefinedTokens {
+		size = w.fixedSize(extraBits)
+	}
 
 	// Dynamic Huffman?
 	var numCodegens int
@@ -672,19 +686,21 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 			size = reuseSize
 		}
 
-		if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
-			// Check if we get a reasonable size decrease.
-			if storable && ssize <= size {
-				w.writeStoredHeader(len(input), eof)
-				w.writeBytes(input)
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+				// Check if we get a reasonable size decrease.
+				if storable && ssize <= size {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
 				return
 			}
-			w.writeFixedHeader(eof)
-			if !sync {
-				tokens.AddEOB()
-			}
-			w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
-			return
 		}
 		// Check if we get a reasonable size decrease.
 		if storable && ssize <= size {
@@ -717,19 +733,21 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
 
 		// Store predefined, if we don't get a reasonable improvement.
-		if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
-			// Store bytes, if we don't get an improvement.
-			if storable && ssize <= preSize {
-				w.writeStoredHeader(len(input), eof)
-				w.writeBytes(input)
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+				// Store bytes, if we don't get an improvement.
+				if storable && ssize <= preSize {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
 				return
 			}
-			w.writeFixedHeader(eof)
-			if !sync {
-				tokens.AddEOB()
-			}
-			w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
-			return
 		}
 
 		if storable && ssize <= size {
@@ -833,9 +851,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
 
 	for _, t := range tokens {
-		if t < matchType {
+		if t < 256 {
 			//w.writeCode(lits[t.literal()])
-			c := lits[t.literal()]
+			c := lits[t]
 			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {
@@ -858,12 +876,12 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 
 		// Write the length
 		length := t.length()
-		lengthCode := lengthCode(length)
+		lengthCode := lengthCode(length) & 31
 		if false {
-			w.writeCode(lengths[lengthCode&31])
+			w.writeCode(lengths[lengthCode])
 		} else {
 			// inlined
-			c := lengths[lengthCode&31]
+			c := lengths[lengthCode]
 			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {
@@ -883,10 +901,10 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			}
 		}
 
-		extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
-		if extraLengthBits > 0 {
+		if lengthCode >= lengthExtraBitsMinCode {
+			extraLengthBits := lengthExtraBits[lengthCode]
 			//w.writeBits(extraLength, extraLengthBits)
-			extraLength := int32(length - lengthBase[lengthCode&31])
+			extraLength := int32(length - lengthBase[lengthCode])
 			bits |= uint64(extraLength) << (nbits & 63)
 			nbits += extraLengthBits
 			if nbits >= 48 {
@@ -907,10 +925,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		}
 		// Write the offset
 		offset := t.offset()
-		offsetCode := offset >> 16
-		offset &= matchOffsetOnlyMask
+		offsetCode := (offset >> 16) & 31
 		if false {
-			w.writeCode(offs[offsetCode&31])
+			w.writeCode(offs[offsetCode])
 		} else {
 			// inlined
 			c := offs[offsetCode]
@@ -932,11 +949,12 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 				}
 			}
 		}
-		offsetComb := offsetCombined[offsetCode]
-		if offsetComb > 1<<16 {
+
+		if offsetCode >= offsetExtraBitsMinCode {
+			offsetComb := offsetCombined[offsetCode]
 			//w.writeBits(extraOffset, extraOffsetBits)
-			bits |= uint64(offset-(offsetComb&0xffff)) << (nbits & 63)
-			nbits += uint16(offsetComb >> 16)
+			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
+			nbits += uint8(offsetComb)
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -1002,6 +1020,29 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
 	histogram(input, w.literalFreq[:numLiterals], fill)
+	ssize, storable := w.storedSize(input)
+	if storable && len(input) > 1024 {
+		// Quick check for incompressible content.
+		abs := float64(0)
+		avg := float64(len(input)) / 256
+		max := float64(len(input) * 2)
+		for _, v := range w.literalFreq[:256] {
+			diff := float64(v) - avg
+			abs += diff * diff
+			if abs > max {
+				break
+			}
+		}
+		if abs < max {
+			if debugDeflate {
+				fmt.Println("stored", abs, "<", max)
+			}
+			// No chance we can compress this...
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+	}
 	w.literalFreq[endBlockMarker] = 1
 	w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
 	if fill {
@@ -1019,8 +1060,10 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	estBits += estBits >> w.logNewTablePenalty
 
 	// Store bytes, if we don't get a reasonable improvement.
-	ssize, storable := w.storedSize(input)
 	if storable && ssize <= estBits {
+		if debugDeflate {
+			fmt.Println("stored,", ssize, "<=", estBits)
+		}
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -1031,7 +1074,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 
 		if estBits < reuseSize {
 			if debugDeflate {
-				//fmt.Println("not reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
+				fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes")
 			}
 			// We owe an EOB
 			w.writeCode(w.literalEncoding.codes[endBlockMarker])
@@ -1065,6 +1108,9 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// Go 1.16 LOVES having these on stack. At least 1.5x the speed.
 	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
 
+	if debugDeflate {
+		count -= int(nbytes)*8 + int(nbits)
+	}
 	// Unroll, write 3 codes/loop.
 	// Fastest number of unrolls.
 	for len(input) > 3 {
@@ -1074,13 +1120,16 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 			bits >>= (n * 8) & 63
 			nbits -= n * 8
-			nbytes += uint8(n)
+			nbytes += n
 		}
 		if nbytes >= bufferFlushSize {
 			if w.err != nil {
 				nbytes = 0
 				return
 			}
+			if debugDeflate {
+				count += int(nbytes) * 8
+			}
 			_, w.err = w.writer.Write(w.bytes[:nbytes])
 			nbytes = 0
 		}
@@ -1096,13 +1145,6 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 
 	// Remaining...
 	for _, t := range input {
-		// Bitwriting inlined, ~30% speedup
-		c := encoding[t]
-		bits |= uint64(c.code) << (nbits & 63)
-		nbits += c.len
-		if debugDeflate {
-			count += int(c.len)
-		}
 		if nbits >= 48 {
 			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -1114,17 +1156,33 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 					nbytes = 0
 					return
 				}
+				if debugDeflate {
+					count += int(nbytes) * 8
+				}
 				_, w.err = w.writer.Write(w.bytes[:nbytes])
 				nbytes = 0
 			}
 		}
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		if debugDeflate {
+			count += int(c.len)
+		}
 	}
 	// Restore...
 	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
 
 	if debugDeflate {
-		fmt.Println("wrote", count/8, "bytes")
+		nb := count + int(nbytes)*8 + int(nbits)
+		fmt.Println("wrote", nb, "bits,", nb/8, "bytes.")
+	}
+	// Flush if needed to have space.
+	if w.nbits >= 48 {
+		w.writeOutBits()
 	}
+
 	if eof || sync {
 		w.writeCode(w.literalEncoding.codes[endBlockMarker])
 		w.lastHeader = 0
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
index f35e00261d3..9ab497c275b 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -17,7 +17,8 @@ const (
 
 // hcode is a huffman code with a bit code and bit length.
 type hcode struct {
-	code, len uint16
+	code uint16
+	len  uint8
 }
 
 type huffmanEncoder struct {
@@ -56,7 +57,7 @@ type levelInfo struct {
 }
 
 // set sets the code and length of an hcode.
-func (h *hcode) set(code uint16, length uint16) {
+func (h *hcode) set(code uint16, length uint8) {
 	h.len = length
 	h.code = code
 }
@@ -80,7 +81,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 	var ch uint16
 	for ch = 0; ch < literalCount; ch++ {
 		var bits uint16
-		var size uint16
+		var size uint8
 		switch {
 		case ch < 144:
 			// size 8, 000110000  .. 10111111
@@ -99,7 +100,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 			bits = ch + 192 - 280
 			size = 8
 		}
-		codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size}
+		codes[ch] = hcode{code: reverseBits(bits, size), len: size}
 	}
 	return h
 }
@@ -187,14 +188,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// of the level j ancestor.
 	var leafCounts [maxBitsLimit][maxBitsLimit]int32
 
+	// Descending to only have 1 bounds check.
+	l2f := int32(list[2].freq)
+	l1f := int32(list[1].freq)
+	l0f := int32(list[0].freq) + int32(list[1].freq)
+
 	for level := int32(1); level <= maxBits; level++ {
 		// For every level, the first two items are the first two characters.
 		// We initialize the levels as if we had already figured this out.
 		levels[level] = levelInfo{
 			level:        level,
-			lastFreq:     int32(list[1].freq),
-			nextCharFreq: int32(list[2].freq),
-			nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
+			lastFreq:     l1f,
+			nextCharFreq: l2f,
+			nextPairFreq: l0f,
 		}
 		leafCounts[level][level] = 2
 		if level == 1 {
@@ -205,8 +211,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// We need a total of 2*n - 2 items at top level and have already generated 2.
 	levels[maxBits].needed = 2*n - 4
 
-	level := maxBits
-	for {
+	level := uint32(maxBits)
+	for level < 16 {
 		l := &levels[level]
 		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
 			// We've run out of both leafs and pairs.
@@ -238,7 +244,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 			// more values in the level below
 			l.lastFreq = l.nextPairFreq
 			// Take leaf counts from the lower level, except counts[level] remains the same.
-			copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			if true {
+				save := leafCounts[level][level]
+				leafCounts[level] = leafCounts[level-1]
+				leafCounts[level][level] = save
+			} else {
+				copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			}
 			levels[l.level-1].needed = 2
 		}
 
@@ -296,7 +308,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 
 		sortByLiteral(chunk)
 		for _, node := range chunk {
-			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)}
 			code++
 		}
 		list = list[0 : len(list)-int(bits)]
@@ -309,6 +321,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 // maxBits  The maximum number of bits to use for any literal.
 func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 	list := h.freqcache[:len(freq)+1]
+	codes := h.codes[:len(freq)]
 	// Number of non-zero literals
 	count := 0
 	// Set list to be the set of all non-zero literals and their frequencies
@@ -317,11 +330,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 			list[count] = literalNode{uint16(i), f}
 			count++
 		} else {
-			list[count] = literalNode{}
-			h.codes[i].len = 0
+			codes[i].len = 0
 		}
 	}
-	list[len(freq)] = literalNode{}
+	list[count] = literalNode{}
 
 	list = list[:count]
 	if count <= 2 {
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
index d5f62f6a2ca..414c0bea9fa 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -36,6 +36,13 @@ type lengthExtra struct {
 
 var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
 
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // up to 32 bits
+
 // Initialize the fixedHuffmanDecoder only once upon first use.
 var fixedOnce sync.Once
 var fixedHuffmanDecoder huffmanDecoder
@@ -559,221 +566,6 @@ func (f *decompressor) readHuffman() error {
 	return nil
 }
 
-// Decode a single Huffman block from f.
-// hl and hd are the Huffman states for the lit/length values
-// and the distance values, respectively. If hd == nil, using the
-// fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlockGeneric() {
-	const (
-		stateInit = iota // Zero value must be stateInit
-		stateDict
-	)
-
-	switch f.stepState {
-	case stateInit:
-		goto readLiteral
-	case stateDict:
-		goto copyHistory
-	}
-
-readLiteral:
-	// Read literal and/or (length, distance) according to RFC section 3.2.3.
-	{
-		var v int
-		{
-			// Inlined v, err := f.huffSym(f.hl)
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := f.r.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var n uint // number of bits extra
-		var length int
-		var err error
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlockGeneric
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
-		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits n>0:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
-		}
-
-		var dist uint32
-		if f.hd == nil {
-			for f.nb < 5 {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<5:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			sym, err := f.huffSym(f.hd)
-			if err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
-				}
-				f.err = err
-				return
-			}
-			dist = uint32(sym)
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<nb:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
-			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
-		default:
-			if debugDecode {
-				fmt.Println("dist too big:", dist, maxNumDist)
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
-			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, int(dist)
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
 // Copy a single uncompressed data block from input to output.
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
index cc6db27925c..61342b6b88f 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -21,6 +21,11 @@ func (f *decompressor) huffmanBytesBuffer() {
 	)
 	fr := f.r.(*bytes.Buffer)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -39,41 +44,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -83,15 +82,17 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesBuffer
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -101,9 +102,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -111,25 +113,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -137,12 +141,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -152,38 +156,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -197,9 +198,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -207,14 +209,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -223,9 +227,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -238,20 +243,22 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -265,6 +272,11 @@ func (f *decompressor) huffmanBytesReader() {
 	)
 	fr := f.r.(*bytes.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -283,41 +295,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -327,15 +333,17 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -345,9 +353,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -355,25 +364,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -381,12 +392,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -396,38 +407,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -441,9 +449,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -451,14 +460,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -467,9 +478,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -482,20 +494,22 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -509,6 +523,11 @@ func (f *decompressor) huffmanBufioReader() {
 	)
 	fr := f.r.(*bufio.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -527,41 +546,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -571,15 +584,17 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBufioReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -589,9 +604,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -599,25 +615,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -625,12 +643,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -640,38 +658,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -685,9 +700,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -695,14 +711,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -711,9 +729,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -726,20 +745,22 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -753,6 +774,11 @@ func (f *decompressor) huffmanStringsReader() {
 	)
 	fr := f.r.(*strings.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -771,41 +797,286 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = (*decompressor).huffmanStringsReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanGenericReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -815,15 +1086,17 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanStringsReader
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = (*decompressor).huffmanGenericReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -833,9 +1106,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -843,25 +1117,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -869,12 +1145,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -884,38 +1160,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -929,9 +1202,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -939,14 +1213,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -955,9 +1231,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -970,20 +1247,22 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 func (f *decompressor) huffmanBlockDecoder() func() {
@@ -996,7 +1275,9 @@ func (f *decompressor) huffmanBlockDecoder() func() {
 		return f.huffmanBufioReader
 	case *strings.Reader:
 		return f.huffmanStringsReader
+	case Reader:
+		return f.huffmanGenericReader
 	default:
-		return f.huffmanBlockGeneric
+		return f.huffmanGenericReader
 	}
 }
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
index 1e5eea3968a..0f14f8d63b4 100644
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -1,6 +1,10 @@
 package flate
 
-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+	"math/bits"
+)
 
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
@@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			var l = int32(4)
+			if false {
+				l = e.matchlenLong(s+4, t+4, src) + 4
+			} else {
+				// inlined:
+				a := src[s+4:]
+				b := src[t+4:]
+				for len(a) >= 8 {
+					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+						l += int32(bits.TrailingZeros64(diff) >> 3)
+						break
+					}
+					l += 8
+					a = a[8:]
+					b = b[8:]
+				}
+				if len(a) < 8 {
+					b = b[:len(a)]
+					for i := range a {
+						if a[i] != b[i] {
+							break
+						}
+						l++
+					}
+				}
+			}
 
 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
@@ -125,11 +154,43 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
-			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						if xl > 258+baseMatchLength {
+							xl = 258
+						} else {
+							xl = 258 - baseMatchLength
+						}
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
 			s += l
 			nextEmit = s
 			if nextS >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
index 234c4389ab3..8603fbd55ad 100644
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -134,7 +134,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
index c22b4244a5c..039639f8989 100644
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -5,7 +5,7 @@ import "fmt"
 // fastEncL3
 type fastEncL3 struct {
 	fastGen
-	table [tableSize]tableEntryPrev
+	table [1 << 16]tableEntryPrev
 }
 
 // Encode uses a similar algorithm to level 2, will check up to two candidates.
@@ -13,6 +13,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 8 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		tableBits              = 16
+		tableSize              = 1 << tableBits
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -73,7 +75,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash(cv)
+			nextHash := hash4u(cv, tableBits)
 			s = nextS
 			nextS = s + 1 + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -141,7 +143,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
@@ -156,7 +166,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				// Index first pair after match end.
 				if int(t+4) < len(src) && t > 0 {
 					cv := load3232(src, t)
-					nextHash := hash(cv)
+					nextHash := hash4u(cv, tableBits)
 					e.table[nextHash] = tableEntryPrev{
 						Prev: e.table[nextHash].Cur,
 						Cur:  tableEntry{offset: e.cur + t},
@@ -165,30 +175,31 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				goto emitRemainder
 			}
 
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-3 to s.
-			x := load6432(src, s-3)
-			prevHash := hash(uint32(x))
-			e.table[prevHash] = tableEntryPrev{
-				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 3},
+			// Store every 5th hash in-between.
+			for i := s - l + 2; i < s-5; i += 5 {
+				nextHash := hash4u(load3232(src, i), tableBits)
+				e.table[nextHash] = tableEntryPrev{
+					Prev: e.table[nextHash].Cur,
+					Cur:  tableEntry{offset: e.cur + i}}
 			}
-			x >>= 8
-			prevHash = hash(uint32(x))
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s.
+			x := load6432(src, s-2)
+			prevHash := hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 2},
 			}
 			x >>= 8
-			prevHash = hash(uint32(x))
+			prevHash = hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 1},
 			}
 			x >>= 8
-			currHash := hash(uint32(x))
+			currHash := hash4u(uint32(x), tableBits)
 			candidates := e.table[currHash]
 			cv = uint32(x)
 			e.table[currHash] = tableEntryPrev{
@@ -200,15 +211,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			candidate = candidates.Cur
 			minOffset := e.cur + s - (maxMatchOffset - 4)
 
-			if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
+			if candidate.offset > minOffset {
+				if cv == load3232(src, candidate.offset-e.cur) {
+					// Found a match...
+					continue
+				}
 				candidate = candidates.Prev
 				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
-					offset := s - (candidate.offset - e.cur)
-					if offset <= maxMatchOffset {
-						continue
-					}
+					// Match at prev...
+					continue
 				}
 			}
 			cv = uint32(x >> 8)
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
index e62f0c02b1e..1cbffa1aefe 100644
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -135,7 +135,15 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
index 293a3a320b7..4b97576bd38 100644
--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -210,7 +210,15 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
index a709977ec49..62888edf3cd 100644
--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -243,7 +243,15 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if false {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
index 53e89912463..544162a4318 100644
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -249,7 +249,15 @@ func statelessEnc(dst *tokens, src []byte, startAt int16) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
index 3a9618ee193..d818790c132 100644
--- a/vendor/github.com/klauspost/compress/flate/token.go
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -13,11 +13,10 @@ import (
 )
 
 const (
-	// From top
-	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
-	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
-	// 5 bits    offsetcode
-	// 16 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
+	// bits 0-16  	xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
+	// bits 16-22	offsetcode - 5 bits
+	// bits 22-30   xlength = length - MIN_MATCH_LENGTH - 8 bits
+	// bits 30-32   type   0 = literal  1=EOF  2=Match   3=Unused - 2 bits
 	lengthShift         = 22
 	offsetMask          = 1<<lengthShift - 1
 	typeMask            = 3 << 30
@@ -196,12 +195,11 @@ func (t *tokens) indexTokens(in []token) {
 
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 func emitLiteral(dst *tokens, lit []byte) {
-	ol := int(dst.n)
-	for i, v := range lit {
-		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+	for _, v := range lit {
+		dst.tokens[dst.n] = token(v)
 		dst.litHist[v]++
+		dst.n++
 	}
-	dst.n += uint16(len(lit))
 }
 
 func (t *tokens) AddLiteral(lit byte) {
@@ -276,7 +274,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 	xoffset |= oCode << 16
 
 	t.extraHist[lengthCodes1[uint8(xlength)]]++
-	t.offHist[oCode]++
+	t.offHist[oCode&31]++
 	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
 	t.n++
 }
@@ -295,12 +293,16 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 		xl := xlength
 		if xl > 258 {
 			// We need to have at least baseMatchLength left over for next loop.
-			xl = 258 - baseMatchLength
+			if xl > 258+baseMatchLength {
+				xl = 258
+			} else {
+				xl = 258 - baseMatchLength
+			}
 		}
 		xlength -= xl
 		xl -= baseMatchLength
 		t.extraHist[lengthCodes1[uint8(xl)]]++
-		t.offHist[oc]++
+		t.offHist[oc&31]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
 		t.n++
 	}
@@ -356,8 +358,8 @@ func (t token) offset() uint32 { return uint32(t) & offsetMask }
 
 func (t token) length() uint8 { return uint8(t >> lengthShift) }
 
-// The code is never more than 8 bits, but is returned as uint32 for convenience.
-func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
+// Convert length to code.
+func lengthCode(len uint8) uint8 { return lengthCodes[len] }
 
 // Returns the offset code corresponding to a specific offset
 func offsetCode(off uint32) uint32 {
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index a4979e8868a..451160edda3 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -8,115 +8,10 @@ package huff0
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )
 
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReader struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
-	return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) peekBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-func (b *bitReader) advance(n uint8) {
-	b.bitsRead += n
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value = (b.value << 8) | uint64(b.in[b.off-1])
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
-	// Release reference.
-	b.in = nil
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
 // bitReader reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
@@ -213,10 +108,17 @@ func (b *bitReaderBytes) finished() bool {
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderBytes) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderBytes) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
@@ -263,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 
+// peekTopBits(n) is equivalent to peekBitsFast(64 - n)
+func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
+	return uint16(b.value >> n)
+}
+
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
@@ -318,10 +225,17 @@ func (b *bitReaderShifted) finished() bool {
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderShifted) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderShifted) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 8323dc05389..bc95ac623bd 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -2,6 +2,7 @@ package huff0
 
 import (
 	"fmt"
+	"math"
 	"runtime"
 	"sync"
 )
@@ -289,6 +290,10 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) {
 		if err != nil {
 			return nil, err
 		}
+		if len(s.Out)-idx > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
@@ -332,6 +337,10 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
 			return nil, errs[i]
 		}
 		o := s.tmpOut[i]
+		if len(o) > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 2a06bd1a7e5..04f6529955e 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -216,6 +217,7 @@ func (s *Scratch) Decoder() *Decoder {
 	return &Decoder{
 		dt:             s.dt,
 		actualTableLog: s.actualTableLog,
+		bufs:           &s.decPool,
 	}
 }
 
@@ -223,6 +225,15 @@ func (s *Scratch) Decoder() *Decoder {
 type Decoder struct {
 	dt             dTable
 	actualTableLog uint8
+	bufs           *sync.Pool
+}
+
+func (d *Decoder) buffer() *[4][256]byte {
+	buf, ok := d.bufs.Get().(*[4][256]byte)
+	if ok {
+		return buf
+	}
+	return &[4][256]byte{}
 }
 
 // Decompress1X will decompress a 1X encoded stream.
@@ -249,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	for br.off >= 8 {
@@ -277,6 +289,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
+				d.bufs.Put(bufs)
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
@@ -284,6 +297,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -310,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
@@ -319,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -341,7 +357,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	switch d.actualTableLog {
@@ -369,6 +386,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
 					br.close()
+					d.bufs.Put(bufs)
 					return nil, ErrMaxDecodedSizeExceeded
 				}
 				dst = append(dst, buf[:]...)
@@ -398,6 +416,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
 					br.close()
+					d.bufs.Put(bufs)
 					return nil, ErrMaxDecodedSizeExceeded
 				}
 				dst = append(dst, buf[:]...)
@@ -426,6 +445,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -455,6 +475,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -484,6 +505,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -513,6 +535,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -542,6 +565,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -571,6 +595,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -578,10 +603,12 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			}
 		}
 	default:
+		d.bufs.Put(bufs)
 		return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -601,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
+			d.bufs.Put(bufs)
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
@@ -609,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -628,7 +657,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	const shift = 56
@@ -655,6 +685,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
+				d.bufs.Put(bufs)
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
@@ -663,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -679,6 +711,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
@@ -688,195 +721,10 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256 / 4
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == bufoff {
-			if bufoff > dstEvery {
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
-			out = out[bufoff:]
-			decoded += 256
-			// There must at least be 3 buffers left.
-			if len(out) < dstEvery*3 {
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	for i := range br {
-		offset := dstEvery * i
-		br := &br[i]
-		bitsLeft := br.off*8 + uint(64-br.bitsRead)
-		for bitsLeft > 0 {
-			br.fill()
-			if false && br.bitsRead >= 32 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value = (br.value << 32) | uint64(low)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value = (br.value << 8) | uint64(br.in[br.off-1])
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= len(out) {
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@@ -916,12 +764,12 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -942,8 +790,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
 
 			v = single[uint8(br1.value>>shift)].entry
 			v2 = single[uint8(br2.value>>shift)].entry
@@ -951,8 +799,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
 
 			v = single[uint8(br1.value>>shift)].entry
 			v2 = single[uint8(br2.value>>shift)].entry
@@ -960,8 +808,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
 
 			v = single[uint8(br1.value>>shift)].entry
 			v2 = single[uint8(br2.value>>shift)].entry
@@ -969,8 +817,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
@@ -987,8 +835,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
 
 			v = single[uint8(br1.value>>shift)].entry
 			v2 = single[uint8(br2.value>>shift)].entry
@@ -996,8 +844,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
 
 			v = single[uint8(br1.value>>shift)].entry
 			v2 = single[uint8(br2.value>>shift)].entry
@@ -1005,8 +853,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
 
 			v = single[uint8(br1.value>>shift)].entry
 			v2 = single[uint8(br2.value>>shift)].entry
@@ -1014,25 +862,26 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			br1.value <<= v & 63
 			br2.bitsRead += uint8(v2)
 			br2.value <<= v2 & 63
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -1040,23 +889,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	// remainBytes is the output each stream still has to produce (endsAt is clamped to len(out) below).
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -1076,7 +933,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
@@ -1084,16 +942,22 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			v := single[uint8(br.value>>shift)].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
@@ -1135,12 +999,12 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -1150,104 +1014,109 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[uint8(br[stream].value>>shift)].entry
-			v2 := single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-
-			v = single[uint8(br[stream].value>>shift)].entry
-			v2 = single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br[stream].value>>shift)].entry
-			v2 = single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br[stream].value>>shift)].entry
-			v2 = single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[uint8(br[stream].value>>shift)].entry
-			v2 := single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-
-			v = single[uint8(br[stream].value>>shift)].entry
-			v2 = single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br[stream].value>>shift)].entry
-			v2 = single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br[stream].value>>shift)].entry
-			v2 = single[uint8(br[stream2].value>>shift)].entry
-			br[stream].bitsRead += uint8(v)
-			br[stream].value <<= v & 63
-			br[stream2].bitsRead += uint8(v2)
-			br[stream2].value <<= v2 & 63
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -1257,21 +1126,27 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -1291,7 +1166,8 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
@@ -1299,16 +1175,23 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 			v := single[br.peekByteFast()].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 00000000000..3415e5da226
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,148 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// that uses an asm implementation of its main loop.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// decompress4x_main_loop_amd64 is an amd64 assembler implementation
+// of Decompress4X when tablelog > 8.
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+
+// decompress4x_8b_main_loop_amd64 is an amd64 assembler implementation
+// of Decompress4X when tablelog <= 8 which decodes 4 entries
+// per loop.
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+
+// fallback8BitSize is the destination capacity below which using the Go version is faster.
+const fallback8BitSize = 800
+
+type decompress4xContext struct {
+	pbr0     *bitReaderShifted
+	pbr1     *bitReaderShifted
+	pbr2     *bitReaderShifted
+	pbr3     *bitReaderShifted
+	peekBits uint8
+	out      *byte
+	dstEvery int
+	tbl      *dEntrySingle
+	decoded  int
+	limit    *byte
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+
+	use8BitTables := d.actualTableLog <= 8
+	if cap(dst) < fallback8BitSize && use8BitTables {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	var decoded int
+
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+		ctx := decompress4xContext{
+			pbr0:     &br[0],
+			pbr1:     &br[1],
+			pbr2:     &br[2],
+			pbr3:     &br[3],
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			out:      &out[0],
+			dstEvery: dstEvery,
+			tbl:      &single[0],
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
+		}
+		if use8BitTables {
+			decompress4x_8b_main_loop_amd64(&ctx)
+		} else {
+			decompress4x_main_loop_amd64(&ctx)
+		}
+
+		decoded = ctx.decoded
+		out = out[decoded/4:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 00000000000..06287f56859
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,662 @@
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), AX
+	MOVBQZX 32(AX), SI
+	MOVQ    40(AX), DI
+	MOVQ    DI, BX
+	MOVQ    72(AX), CX
+	MOVQ    CX, (SP)
+	MOVQ    48(AX), R8
+	MOVQ    56(AX), R9
+	MOVQ    (AX), R10
+	MOVQ    8(AX), R11
+	MOVQ    16(AX), R12
+	MOVQ    24(AX), R13
+
+	// Main loop
+main_loop:
+	MOVQ  BX, DI
+	CMPQ  DI, (SP)
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R10), R14
+	MOVBQZX 40(R10), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R10), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R10), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R10)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill0:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br0.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitreader structure
+	MOVQ R14, 32(R10)
+	MOVB R15, 40(R10)
+	ADDQ R8, DI
+
+	// br1.fillFast32()
+	MOVQ    32(R11), R14
+	MOVBQZX 40(R11), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill1
+	MOVQ    24(R11), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R11), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R11)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br1.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitreader structure
+	MOVQ R14, 32(R11)
+	MOVB R15, 40(R11)
+	ADDQ R8, DI
+
+	// br2.fillFast32()
+	MOVQ    32(R12), R14
+	MOVBQZX 40(R12), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill2
+	MOVQ    24(R12), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R12), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R12)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill2:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br2.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitreader structure
+	MOVQ R14, 32(R12)
+	MOVB R15, 40(R12)
+	ADDQ R8, DI
+
+	// br3.fillFast32()
+	MOVQ    32(R13), R14
+	MOVBQZX 40(R13), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill3
+	MOVQ    24(R13), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R13), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R13)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill3:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br3.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitreader structure
+	MOVQ  R14, 32(R13)
+	MOVB  R15, 40(R13)
+	ADDQ  $0x02, BX
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  BX, DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
+
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), CX
+	MOVBQZX 32(CX), BX
+	MOVQ    40(CX), SI
+	MOVQ    SI, (SP)
+	MOVQ    72(CX), DX
+	MOVQ    DX, 8(SP)
+	MOVQ    48(CX), DI
+	MOVQ    56(CX), R8
+	MOVQ    (CX), R9
+	MOVQ    8(CX), R10
+	MOVQ    16(CX), R11
+	MOVQ    24(CX), R12
+
+	// Main loop
+main_loop:
+	MOVQ  (SP), SI
+	CMPQ  SI, 8(SP)
+	SETGE DL
+
+	// br1000.fillFast32()
+	MOVQ    32(R9), R13
+	MOVBQZX 40(R9), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1000
+	MOVQ    24(R9), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R9), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R9)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1000.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1000:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader structure
+	MOVQ R13, 32(R9)
+	MOVB R14, 40(R9)
+	ADDQ DI, SI
+
+	// br1001.fillFast32()
+	MOVQ    32(R10), R13
+	MOVBQZX 40(R10), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1001
+	MOVQ    24(R10), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R10), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R10)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1001.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1001:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader structure
+	MOVQ R13, 32(R10)
+	MOVB R14, 40(R10)
+	ADDQ DI, SI
+
+	// br1002.fillFast32()
+	MOVQ    32(R11), R13
+	MOVBQZX 40(R11), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1002
+	MOVQ    24(R11), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R11), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R11)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1002.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1002:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader structure
+	MOVQ R13, 32(R11)
+	MOVB R14, 40(R11)
+	ADDQ DI, SI
+
+	// br1003.fillFast32()
+	MOVQ    32(R12), R13
+	MOVBQZX 40(R12), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1003
+	MOVQ    24(R12), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R12), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R12)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1003.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1003:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader structure
+	MOVQ  R13, 32(R12)
+	MOVB  R14, 40(R12)
+	ADDQ  $0x04, (SP)
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  (SP), DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 00000000000..126b4d68a94
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,193 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream: a 6-byte jump table followed by four bit streams.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly; on success dst resliced to that capacity is returned.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) { // 6-byte jump table plus at least one byte for each of the four streams
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table": three little-endian 16-bit lengths; the fourth stream takes the rest of src.
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// Destination: use the full capacity of dst; stream k writes to out[dstEvery*k:].
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8 // write index into each buf row; being a uint8 it wraps to 0 after 256 entries
+	var decoded int
+
+	// Fast loop: decode 2 values from each of the four decoders per iteration.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { // fillFast below is only used while every reader has off >= 4
+			break
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == 0 { // off wrapped around: each row now holds bufoff decoded bytes, flush them to out
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 { // flush the partially filled rows left over from the fast loop
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining symbols one at a time, finishing each stream in turn.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i] // shadows the reader array for the rest of this iteration
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v) // low byte of the entry: the amount passed to advance
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8) // high byte of the entry: the output byte
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err // NOTE(review): buf is not returned to d.bufs on this path, unlike the other error returns
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 3ee00ecb470..e8ad17ad08e 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"math"
 	"math/bits"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -116,6 +117,7 @@ type Scratch struct {
 	nodes          []nodeElt
 	tmpOut         [4][]byte
 	fse            *fse.Scratch
+	decPool        sync.Pool // *[4][256]byte buffers.
 	huffWeight     [maxSymbolValue + 1]byte
 }
 
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
new file mode 100644
index 00000000000..3954c51219b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
@@ -0,0 +1,34 @@
+// Package cpuinfo gives runtime info about the current CPU.
+//
+// This is a very limited module meant for use internally
+// in this project. For more versatile solution check
+// https://github.com/klauspost/cpuid.
+package cpuinfo
+
+// HasBMI1 reports the package-level hasBMI1 flag, i.e. whether BMI1 support was detected on an x86 CPU.
+func HasBMI1() bool {
+	return hasBMI1
+}
+
+// HasBMI2 reports the package-level hasBMI2 flag, i.e. whether BMI2 support was detected and not disabled via DisableBMI2.
+func HasBMI2() bool {
+	return hasBMI2
+}
+
+// DisableBMI2 will disable BMI2, for testing purposes, by clearing the package-level hasBMI2 flag.
+// Call returned function to restore the state that was in effect when DisableBMI2 was called.
+func DisableBMI2() func() {
+	old := hasBMI2 // remember the current value so the returned closure can put it back
+	hasBMI2 = false // plain assignment to a package variable; no synchronization is done here
+	return func() {
+		hasBMI2 = old
+	}
+}
+
+// HasBMI reports whether both HasBMI1 and HasBMI2 return true, i.e. the CPU supports both BMI1 and BMI2.
+func HasBMI() bool {
+	return HasBMI1() && HasBMI2()
+}
+
+var hasBMI1 bool // assigned by init in cpuinfo_amd64.go; presumably stays false on builds that exclude that file
+var hasBMI2 bool // assigned by init in cpuinfo_amd64.go; also cleared and restored by DisableBMI2
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
new file mode 100644
index 00000000000..e802579c4f9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
@@ -0,0 +1,11 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package cpuinfo
+
+//go:noescape
+func x86extensions() (bmi1, bmi2 bool) // implemented in cpuinfo_amd64.s; reports the BMI1 and BMI2 feature bits read via CPUID
+
+func init() {
+	hasBMI1, hasBMI2 = x86extensions() // query the CPU once during package initialization and cache both flags
+}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
new file mode 100644
index 00000000000..4465fbe9e90
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
@@ -0,0 +1,36 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+TEXT ·x86extensions(SB), NOSPLIT, $0 // Go declaration: func x86extensions() (bmi1, bmi2 bool)
+	// 1. determine max EAX value (CPUID with EAX = 0 returns the highest supported basic leaf in EAX)
+	XORQ AX, AX
+	CPUID
+
+	CMPQ AX, $7
+	JB   unsupported // leaf 7 is not available: report both extensions as absent
+
+	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
+	MOVQ $7, AX
+	MOVQ $0, CX
+	CPUID
+
+	BTQ   $3, BX // bit 3 = BMI1
+	SETCS AL // AL = 1 if the tested bit was set, else 0
+
+	BTQ   $8, BX // bit 8 = BMI2
+	SETCS AH // AH = 1 if the tested bit was set, else 0
+
+	MOVB AL, bmi1+0(FP) // store the first result (bmi1)
+	MOVB AH, bmi2+1(FP) // store the second result (bmi2)
+	RET
+
+unsupported:
+	XORQ AX, AX
+	MOVB AL, bmi1+0(FP) // bmi1 = false
+	MOVB AL, bmi2+1(FP) // bmi2 = false
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index c8f0f16fc1e..beb7fa87201 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -78,6 +78,9 @@ of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is
 in the future. So if you want to limit concurrency for future updates, specify the concurrency
 you would like.
 
+If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)`
+which will compress input as each block is completed, blocking on writes until each has completed.
+
 You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined 
 compression settings can be specified.
 
@@ -104,7 +107,8 @@ and seems to ignore concatenated streams, even though [it is part of the spec](h
 For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
 
 `EncodeAll` will encode all input in src and append it to dst.
-This function can be called concurrently, but each call will only run on a single goroutine.
+This function can be called concurrently. 
+Each call will only run on the same goroutine as the caller.
 
 Encoded blocks can be concatenated and the result will be the combined input stream.
 Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
@@ -149,10 +153,10 @@ http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 
 This package:
 file    out     level   insize      outsize     millis  mb/s
-silesia.tar zskp    1   211947520   73101992    643     313.87
-silesia.tar zskp    2   211947520   67504318    969     208.38
-silesia.tar zskp    3   211947520   64595893    2007    100.68
-silesia.tar zskp    4   211947520   60995370    8825    22.90
+silesia.tar zskp    1   211947520   73821326    634     318.47
+silesia.tar zskp    2   211947520   67655404    1508    133.96
+silesia.tar zskp    3   211947520   64746933    3000    67.37
+silesia.tar zskp    4   211947520   60073508    16926   11.94
 
 cgo zstd:
 silesia.tar zstd    1   211947520   73605392    543     371.56
@@ -161,94 +165,94 @@ silesia.tar zstd    6   211947520   62916450    1913    105.66
 silesia.tar zstd    9   211947520   60212393    5063    39.92
 
 gzip, stdlib/this package:
-silesia.tar gzstd   1   211947520   80007735    1654    122.21
-silesia.tar gzkp    1   211947520   80136201    1152    175.45
+silesia.tar gzstd   1   211947520   80007735    1498    134.87
+silesia.tar gzkp    1   211947520   80088272    1009    200.31
 
 GOB stream of binary data. Highly compressible.
 https://files.klauspost.com/compress/gob-stream.7z
 
 file        out     level   insize  outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  235022249   3088    590.30
-gob-stream  zskp    2   1911399616  205669791   3786    481.34
-gob-stream  zskp    3   1911399616  175034659   9636    189.17
-gob-stream  zskp    4   1911399616  165609838   50369   36.19
+gob-stream  zskp    1   1911399616  233948096   3230    564.34
+gob-stream  zskp    2   1911399616  203997694   4997    364.73
+gob-stream  zskp    3   1911399616  173526523   13435   135.68
+gob-stream  zskp    4   1911399616  162195235   47559   38.33
 
 gob-stream  zstd    1   1911399616  249810424   2637    691.26
 gob-stream  zstd    3   1911399616  208192146   3490    522.31
 gob-stream  zstd    6   1911399616  193632038   6687    272.56
 gob-stream  zstd    9   1911399616  177620386   16175   112.70
 
-gob-stream  gzstd   1   1911399616  357382641   10251   177.82
-gob-stream  gzkp    1   1911399616  359753026   5438    335.20
+gob-stream  gzstd   1   1911399616  357382013   9046    201.49
+gob-stream  gzkp    1   1911399616  359136669   4885    373.08
 
 The test data for the Large Text Compression Benchmark is the first
 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
 http://mattmahoney.net/dc/textdata.html
 
 file    out level   insize      outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343848582   3609    264.18
-enwik9  zskp    2   1000000000  317276632   5746    165.97
-enwik9  zskp    3   1000000000  292243069   12162   78.41
-enwik9  zskp    4   1000000000  262183768   82837   11.51
+enwik9  zskp    1   1000000000  343833605   3687    258.64
+enwik9  zskp    2   1000000000  317001237   7672    124.29
+enwik9  zskp    3   1000000000  291915823   15923   59.89
+enwik9  zskp    4   1000000000  261710291   77697   12.27
 
 enwik9  zstd    1   1000000000  358072021   3110    306.65
 enwik9  zstd    3   1000000000  313734672   4784    199.35
 enwik9  zstd    6   1000000000  295138875   10290   92.68
 enwik9  zstd    9   1000000000  278348700   28549   33.40
 
-enwik9  gzstd   1   1000000000  382578136   9604    99.30
-enwik9  gzkp    1   1000000000  383825945   6544    145.73
+enwik9  gzstd   1   1000000000  382578136   8608    110.78
+enwik9  gzkp    1   1000000000  382781160   5628    169.45
 
 Highly compressible JSON file.
 https://files.klauspost.com/compress/github-june-2days-2019.json.zst
 
 file                        out level   insize      outsize     millis  mb/s
-github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
-github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
-github-june-2days-2019.json zskp    3   6273951764  524340691   34043   175.75
-github-june-2days-2019.json zskp    4   6273951764  470320075   170190  35.16
+github-june-2days-2019.json zskp    1   6273951764  697439532   9789    611.17
+github-june-2days-2019.json zskp    2   6273951764  610876538   18553   322.49
+github-june-2days-2019.json zskp    3   6273951764  517662858   44186   135.41
+github-june-2days-2019.json zskp    4   6273951764  464617114   165373  36.18
 
 github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
 github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
 github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
 github-june-2days-2019.json zstd    9   6273951764  601974523   52413   114.16
 
-github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
-github-june-2days-2019.json gzkp    1   6273951764  1125417694  21788   274.61
+github-june-2days-2019.json gzstd   1   6273951764  1164397768  26793   223.32
+github-june-2days-2019.json gzkp    1   6273951764  1120631856  17693   338.16
 
 VM Image, Linux mint with a few installed applications:
 https://files.klauspost.com/compress/rawstudio-mint14.7z
 
 file                    out level   insize      outsize     millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
-rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
-rawstudio-mint14.tar    zskp    3   8558382592  3158085214  77675   105.08
-rawstudio-mint14.tar    zskp    4   8558382592  2965110639  857750  9.52
+rawstudio-mint14.tar    zskp    1   8558382592  3718400221  18206   448.29
+rawstudio-mint14.tar    zskp    2   8558382592  3326118337  37074   220.15
+rawstudio-mint14.tar    zskp    3   8558382592  3163842361  87306   93.49
+rawstudio-mint14.tar    zskp    4   8558382592  2970480650  783862  10.41
 
 rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
 rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
 rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
 rawstudio-mint14.tar    zstd    9   8558382592  3160778861  140946  57.91
 
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
-rawstudio-mint14.tar    gzkp    1   8558382592  3962605659  45113   180.92
+rawstudio-mint14.tar    gzstd   1   8558382592  3926234992  51345   158.96
+rawstudio-mint14.tar    gzkp    1   8558382592  3960117298  36722   222.26
 
 CSV data:
 https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
 
 file                    out level   insize      outsize     millis  mb/s
-nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
-nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
-nyc-taxi-data-10M.csv   zskp    3   3325605752  530289687   25239   125.66
-nyc-taxi-data-10M.csv   zskp    4   3325605752  476268884   135958  23.33
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641319332   9462    335.17
+nyc-taxi-data-10M.csv   zskp    2   3325605752  588976126   17570   180.50
+nyc-taxi-data-10M.csv   zskp    3   3325605752  529329260   32432   97.79
+nyc-taxi-data-10M.csv   zskp    4   3325605752  474949772   138025  22.98
 
 nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
 nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
 nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
 nyc-taxi-data-10M.csv   zstd    9   3325605752  517554797   64565   49.12
 
-nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
-nyc-taxi-data-10M.csv   gzkp    1   3325605752  922257165   16780   189.00
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928654908   21270   149.11
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  922273214   13929   227.68
 ```
 
 ## Decompressor
@@ -283,8 +287,13 @@ func Decompress(in io.Reader, out io.Writer) error {
 }
 ```
 
-It is important to use the "Close" function when you no longer need the Reader to stop running goroutines. 
-See "Allocation-less operation" below.
+It is important to use the "Close" function when you no longer need the Reader to stop running goroutines, 
+when running with default settings.
+Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream.
+
+Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput.
+However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)` which will decompress data 
+only as it is requested.
 
 For decoding buffers, it could look something like this:
 
@@ -293,7 +302,7 @@ import "github.com/klauspost/compress/zstd"
 
 // Create a reader that caches decompressors.
 // For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil)
+var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
 
 // Decompress a buffer. We don't supply a destination buffer,
 // so it will be allocated by the decoder.
@@ -303,9 +312,12 @@ func Decompress(src []byte) ([]byte, error) {
 ```
 
 Both of these cases should provide the functionality needed. 
-The decoder can be used for *concurrent* decompression of multiple buffers. 
+The decoder can be used for *concurrent* decompression of multiple buffers.
+By default 4 decompressors will be created. 
+
 It will only allow a certain number of concurrent operations to run. 
-To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
+To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders.
 
 ### Dictionaries
 
@@ -357,62 +369,48 @@ In this case no unneeded allocations should be made.
 The buffer decoder does everything on the same goroutine and does nothing concurrently.
 It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
 
-The stream decoder operates on
+The stream decoder will create goroutines that:
 
-* One goroutine reads input and splits the input to several block decoders.
-* A number of decoders will decode blocks.
-* A goroutine coordinates these blocks and sends history from one to the next.
+1) Read input and split it into blocks.
+2) Decompress literals.
+3) Decompress sequences.
+4) Reconstruct the output stream.
 
 So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
 
+The concurrency level will, for streams, determine how many blocks ahead the decompression will start.
+
 Since "blocks" are quite dependent on the output of the previous block stream decoding will only have limited concurrency.
 
-In practice this means that concurrency is often limited to utilizing about 2 cores effectively.
- 
- 
+In practice this means that concurrency is often limited to utilizing about 3 cores effectively.
+  
 ### Benchmarks
 
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
-
 The first two are streaming decodes and the last are smaller inputs. 
- 
+
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+
 ```
-BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
-BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
-
-BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
-BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-
-Concurrent performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
-
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
+BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
+BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
+
+Concurrent blocks, performance:
+
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
 ```
 
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
 
 ## Zstd inside ZIP files
 
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 753d17df634..d7cd15ba29d 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -7,6 +7,7 @@ package zstd
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 	"math/bits"
 )
@@ -132,6 +133,9 @@ func (b *bitReader) remain() uint {
 func (b *bitReader) close() error {
 	// Release reference.
 	b.in = nil
+	if !b.finished() {
+		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index dc587b2c949..b2bca330182 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -5,9 +5,14 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
 	"sync"
 
 	"github.com/klauspost/compress/huff0"
@@ -38,6 +43,9 @@ const (
 	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
 	maxCompressedBlockSize = 128 << 10
 
+	compressedBlockOverAlloc    = 16
+	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+
 	// Maximum possible block size (all Raw+Uncompressed).
 	maxBlockSize = (1 << 21) - 1
 
@@ -76,16 +84,25 @@ type blockDec struct {
 	// Window size of the block.
 	WindowSize uint64
 
-	history chan *history
-	input   chan struct{}
-	result  chan decodeOutput
-	err     error
-	decWG   sync.WaitGroup
+	err error
+
+	// Check against this crc
+	checkCRC []byte
 
 	// Frame to use for singlethreaded decoding.
 	// Should not be used by the decoder itself since parent may be another frame.
 	localFrame *frameDec
 
+	sequence []seqVals
+
+	async struct {
+		newHist  *history
+		literals []byte
+		seqData  []byte
+		seqSize  int // Size of uncompressed sequences
+		fcs      uint64
+	}
+
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
@@ -108,13 +125,8 @@ func (b *blockDec) String() string {
 
 func newBlockDec(lowMem bool) *blockDec {
 	b := blockDec{
-		lowMem:  lowMem,
-		result:  make(chan decodeOutput, 1),
-		input:   make(chan struct{}, 1),
-		history: make(chan *history, 1),
+		lowMem: lowMem,
 	}
-	b.decWG.Add(1)
-	go b.startDecoder()
 	return &b
 }
 
@@ -132,11 +144,17 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
-	maxSize := maxBlockSize
+	maxSize := maxCompressedBlockSizeAlloc
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
 	case blockTypeRLE:
+		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+			if debugDecoder {
+				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrWindowSizeExceeded
+		}
 		b.RLESize = uint32(cSize)
 		if b.lowMem {
 			maxSize = cSize
@@ -147,9 +165,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
-		maxSize = maxCompressedBlockSize
+		maxSize = maxCompressedBlockSizeAlloc
 		if windowSize < maxCompressedBlockSize && b.lowMem {
-			maxSize = int(windowSize)
+			maxSize = int(windowSize) + compressedBlockOverAlloc
 		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debugDecoder {
@@ -157,7 +175,19 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			}
 			return ErrCompressedSizeTooBig
 		}
+		// Empty compressed blocks must at least be 2 bytes
+		// for Literals_Block_Type and one for Sequences_Section_Header.
+		if cSize < 2 {
+			return ErrBlockTooSmall
+		}
 	case blockTypeRaw:
+		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+			if debugDecoder {
+				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrWindowSizeExceeded
+		}
+
 		b.RLESize = 0
 		// We do not need a destination for raw blocks.
 		maxSize = -1
@@ -168,9 +198,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	// Read block data.
 	if cap(b.dataStorage) < cSize {
 		if b.lowMem || cSize > maxCompressedBlockSize {
-			b.dataStorage = make([]byte, 0, cSize)
+			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
 		} else {
-			b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
+			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
 		}
 	}
 	if cap(b.dst) <= maxSize {
@@ -192,85 +222,14 @@ func (b *blockDec) sendErr(err error) {
 	b.Last = true
 	b.Type = blockTypeReserved
 	b.err = err
-	b.input <- struct{}{}
 }
 
 // Close will release resources.
 // Closed blockDec cannot be reset.
 func (b *blockDec) Close() {
-	close(b.input)
-	close(b.history)
-	close(b.result)
-	b.decWG.Wait()
-}
-
-// decodeAsync will prepare decoding the block when it receives input.
-// This will separate output and history.
-func (b *blockDec) startDecoder() {
-	defer b.decWG.Done()
-	for range b.input {
-		//println("blockDec: Got block input")
-		switch b.Type {
-		case blockTypeRLE:
-			if cap(b.dst) < int(b.RLESize) {
-				if b.lowMem {
-					b.dst = make([]byte, b.RLESize)
-				} else {
-					b.dst = make([]byte, maxBlockSize)
-				}
-			}
-			o := decodeOutput{
-				d:   b,
-				b:   b.dst[:b.RLESize],
-				err: nil,
-			}
-			v := b.data[0]
-			for i := range o.b {
-				o.b[i] = v
-			}
-			hist := <-b.history
-			hist.append(o.b)
-			b.result <- o
-		case blockTypeRaw:
-			o := decodeOutput{
-				d:   b,
-				b:   b.data,
-				err: nil,
-			}
-			hist := <-b.history
-			hist.append(o.b)
-			b.result <- o
-		case blockTypeCompressed:
-			b.dst = b.dst[:0]
-			err := b.decodeCompressed(nil)
-			o := decodeOutput{
-				d:   b,
-				b:   b.dst,
-				err: err,
-			}
-			if debugDecoder {
-				println("Decompressed to", len(b.dst), "bytes, error:", err)
-			}
-			b.result <- o
-		case blockTypeReserved:
-			// Used for returning errors.
-			<-b.history
-			b.result <- decodeOutput{
-				d:   b,
-				b:   nil,
-				err: b.err,
-			}
-		default:
-			panic("Invalid block type")
-		}
-		if debugDecoder {
-			println("blockDec: Finished block")
-		}
-	}
 }
 
-// decodeAsync will prepare decoding the block when it receives the history.
-// If history is provided, it will not fetch it from the channel.
+// decodeBuf
 func (b *blockDec) decodeBuf(hist *history) error {
 	switch b.Type {
 	case blockTypeRLE:
@@ -293,14 +252,23 @@ func (b *blockDec) decodeBuf(hist *history) error {
 		return nil
 	case blockTypeCompressed:
 		saved := b.dst
-		b.dst = hist.b
-		hist.b = nil
+		// Append directly to history
+		if hist.ignoreBuffer == 0 {
+			b.dst = hist.b
+			hist.b = nil
+		} else {
+			b.dst = b.dst[:0]
+		}
 		err := b.decodeCompressed(hist)
 		if debugDecoder {
 			println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
 		}
-		hist.b = b.dst
-		b.dst = saved
+		if hist.ignoreBuffer == 0 {
+			hist.b = b.dst
+			b.dst = saved
+		} else {
+			hist.appendKeep(b.dst)
+		}
 		return err
 	case blockTypeReserved:
 		// Used for returning errors.
@@ -310,30 +278,18 @@ func (b *blockDec) decodeBuf(hist *history) error {
 	}
 }
 
-// decodeCompressed will start decompressing a block.
-// If no history is supplied the decoder will decodeAsync as much as possible
-// before fetching from blockDec.history
-func (b *blockDec) decodeCompressed(hist *history) error {
-	in := b.data
-	delayedHistory := hist == nil
-
-	if delayedHistory {
-		// We must always grab history.
-		defer func() {
-			if hist == nil {
-				<-b.history
-			}
-		}()
-	}
+func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) {
 	// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
 	if len(in) < 2 {
-		return ErrBlockTooSmall
+		return in, ErrBlockTooSmall
 	}
+
 	litType := literalsBlockType(in[0] & 3)
 	var litRegenSize int
 	var litCompSize int
 	sizeFormat := (in[0] >> 2) & 3
 	var fourStreams bool
+	var literals []byte
 	switch litType {
 	case literalsBlockRaw, literalsBlockRLE:
 		switch sizeFormat {
@@ -349,7 +305,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			//  Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
 			if len(in) < 3 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
 			in = in[3:]
@@ -360,7 +316,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
 			if len(in) < 3 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
 			litRegenSize = int(n & 1023)
@@ -371,7 +327,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			fourStreams = true
 			if len(in) < 4 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
 			litRegenSize = int(n & 16383)
@@ -381,7 +337,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			fourStreams = true
 			if len(in) < 5 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
 			litRegenSize = int(n & 262143)
@@ -392,13 +348,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	if debugDecoder {
 		println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
 	}
-	var literals []byte
-	var huff *huff0.Scratch
+	if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize {
+		return in, ErrWindowSizeExceeded
+	}
+
 	switch litType {
 	case literalsBlockRaw:
 		if len(in) < litRegenSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		literals = in[:litRegenSize]
 		in = in[litRegenSize:]
@@ -406,7 +364,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	case literalsBlockRLE:
 		if len(in) < 1 {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
@@ -417,7 +375,6 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 					b.literalBuf = make([]byte, litRegenSize)
 				} else {
 					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
-
 				}
 			}
 		}
@@ -433,7 +390,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	case literalsBlockTreeless:
 		if len(in) < litCompSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		// Store compressed literals, so we defer decoding until we get history.
 		literals = in[:litCompSize]
@@ -441,31 +398,65 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		if debugDecoder {
 			printf("Found %d compressed literals\n", litCompSize)
 		}
+		huff := hist.huffTree
+		if huff == nil {
+			return in, errors.New("literal block was treeless, but no history was defined")
+		}
+		// Ensure we have space to store it.
+		if cap(b.literalBuf) < litRegenSize {
+			if b.lowMem {
+				b.literalBuf = make([]byte, 0, litRegenSize)
+			} else {
+				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+			}
+		}
+		var err error
+		// Use our out buffer.
+		huff.MaxDecodedSize = maxCompressedBlockSize
+		if fourStreams {
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
+		} else {
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
+		}
+		// Make sure we don't leak our literals buffer
+		if err != nil {
+			println("decompressing literals:", err)
+			return in, err
+		}
+		if len(literals) != litRegenSize {
+			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+		}
+
 	case literalsBlockCompressed:
 		if len(in) < litCompSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		literals = in[:litCompSize]
 		in = in[litCompSize:]
-		huff = huffDecoderPool.Get().(*huff0.Scratch)
-		var err error
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
 				b.literalBuf = make([]byte, 0, litRegenSize)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
 			}
 		}
-		if huff == nil {
-			huff = &huff0.Scratch{}
+		huff := hist.huffTree
+		if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) {
+			huff = huffDecoderPool.Get().(*huff0.Scratch)
+			if huff == nil {
+				huff = &huff0.Scratch{}
+			}
 		}
+		var err error
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
-			return err
+			return in, err
 		}
+		hist.huffTree = huff
+		huff.MaxDecodedSize = maxCompressedBlockSize
 		// Use our out buffer.
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -474,27 +465,61 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		if err != nil {
 			println("decoding compressed literals:", err)
-			return err
+			return in, err
 		}
 		// Make sure we don't leak our literals buffer
 		if len(literals) != litRegenSize {
-			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
 		if debugDecoder {
 			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
 		}
 	}
+	hist.decoders.literals = literals
+	return in, nil
+}
 
+// decodeCompressed will start decompressing a block.
+func (b *blockDec) decodeCompressed(hist *history) error {
+	in := b.data
+	in, err := b.decodeLiterals(in, hist)
+	if err != nil {
+		return err
+	}
+	err = b.prepareSequences(in, hist)
+	if err != nil {
+		return err
+	}
+	if hist.decoders.nSeqs == 0 {
+		b.dst = append(b.dst, hist.decoders.literals...)
+		return nil
+	}
+	before := len(hist.decoders.out)
+	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
+	if err != nil {
+		return err
+	}
+	if hist.decoders.maxSyncLen > 0 {
+		hist.decoders.maxSyncLen += uint64(before)
+		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
+	}
+	b.dst = hist.decoders.out
+	hist.recentOffsets = hist.decoders.prevOffset
+	return nil
+}
+
+func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
+	if debugDecoder {
+		printf("prepareSequences: %d byte(s) input\n", len(in))
+	}
 	// Decode Sequences
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
 	if len(in) < 1 {
 		return ErrBlockTooSmall
 	}
+	var nSeqs int
 	seqHeader := in[0]
-	nSeqs := 0
 	switch {
-	case seqHeader == 0:
-		in = in[1:]
 	case seqHeader < 128:
 		nSeqs = int(seqHeader)
 		in = in[1:]
@@ -511,8 +536,16 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
 		in = in[3:]
 	}
+	if nSeqs == 0 && len(in) != 0 {
+		// When no sequences, there should not be any more data...
+		if debugDecoder {
+			printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in))
+		}
+		return ErrUnexpectedBlockSize
+	}
 
-	var seqs = &sequenceDecs{}
+	var seqs = &hist.decoders
+	seqs.nSeqs = nSeqs
 	if nSeqs > 0 {
 		if len(in) < 1 {
 			return ErrBlockTooSmall
@@ -541,6 +574,9 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			}
 			switch mode {
 			case compModePredefined:
+				if seq.fse != nil && !seq.fse.preDefined {
+					fseDecoderPool.Put(seq.fse)
+				}
 				seq.fse = &fsePredef[i]
 			case compModeRLE:
 				if br.remain() < 1 {
@@ -548,34 +584,36 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 				}
 				v := br.Uint8()
 				br.advance(1)
-				dec := fseDecoderPool.Get().(*fseDecoder)
+				if seq.fse == nil || seq.fse.preDefined {
+					seq.fse = fseDecoderPool.Get().(*fseDecoder)
+				}
 				symb, err := decSymbolValue(v, symbolTableX[i])
 				if err != nil {
 					printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
 					return err
 				}
-				dec.setRLE(symb)
-				seq.fse = dec
+				seq.fse.setRLE(symb)
 				if debugDecoder {
 					printf("RLE set to %+v, code: %v", symb, v)
 				}
 			case compModeFSE:
 				println("Reading table for", tableIndex(i))
-				dec := fseDecoderPool.Get().(*fseDecoder)
-				err := dec.readNCount(&br, uint16(maxTableSymbol[i]))
+				if seq.fse == nil || seq.fse.preDefined {
+					seq.fse = fseDecoderPool.Get().(*fseDecoder)
+				}
+				err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i]))
 				if err != nil {
 					println("Read table error:", err)
 					return err
 				}
-				err = dec.transform(symbolTableX[i])
+				err = seq.fse.transform(symbolTableX[i])
 				if err != nil {
 					println("Transform table error:", err)
 					return err
 				}
 				if debugDecoder {
-					println("Read table ok", "symbolLen:", dec.symbolLen)
+					println("Read table ok", "symbolLen:", seq.fse.symbolLen)
 				}
-				seq.fse = dec
 			case compModeRepeat:
 				seq.repeat = true
 			}
@@ -585,140 +623,106 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		in = br.unread()
 	}
-
-	// Wait for history.
-	// All time spent after this is critical since it is strictly sequential.
-	if hist == nil {
-		hist = <-b.history
-		if hist.error {
-			return ErrDecoderClosed
-		}
-	}
-
-	// Decode treeless literal block.
-	if litType == literalsBlockTreeless {
-		// TODO: We could send the history early WITHOUT the stream history.
-		//   This would allow decoding treeless literals before the byte history is available.
-		//   Silencia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless.
-		//   So not much obvious gain here.
-
-		if hist.huffTree == nil {
-			return errors.New("literal block was treeless, but no history was defined")
-		}
-		// Ensure we have space to store it.
-		if cap(b.literalBuf) < litRegenSize {
-			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
-			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
-			}
-		}
-		var err error
-		// Use our out buffer.
-		huff = hist.huffTree
-		if fourStreams {
-			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
-		} else {
-			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
-		}
-		// Make sure we don't leak our literals buffer
-		if err != nil {
-			println("decompressing literals:", err)
-			return err
-		}
-		if len(literals) != litRegenSize {
-			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
-		}
-	} else {
-		if hist.huffTree != nil && huff != nil {
-			if hist.dict == nil || hist.dict.litEnc != hist.huffTree {
-				huffDecoderPool.Put(hist.huffTree)
-			}
-			hist.huffTree = nil
-		}
-	}
-	if huff != nil {
-		hist.huffTree = huff
-	}
 	if debugDecoder {
-		println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.")
+		println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.")
 	}
 
 	if nSeqs == 0 {
-		// Decompressed content is defined entirely as Literals Section content.
-		b.dst = append(b.dst, literals...)
-		if delayedHistory {
-			hist.append(literals)
+		if len(b.sequence) > 0 {
+			b.sequence = b.sequence[:0]
 		}
 		return nil
 	}
+	br := seqs.br
+	if br == nil {
+		br = &bitReader{}
+	}
+	if err := br.init(in); err != nil {
+		return err
+	}
 
-	seqs, err := seqs.mergeHistory(&hist.decoders)
-	if err != nil {
+	if err := seqs.initialize(br, hist, b.dst); err != nil {
+		println("initializing sequences:", err)
 		return err
 	}
-	if debugDecoder {
-		println("History merged ok")
+	// Extract blocks...
+	if false && hist.dict == nil {
+		fatalErr := func(err error) {
+			if err != nil {
+				panic(err)
+			}
+		}
+		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
+		var buf bytes.Buffer
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
+		buf.Write(in)
+		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
 	}
-	br := &bitReader{}
-	if err := br.init(in); err != nil {
-		return err
+
+	return nil
+}
+
+func (b *blockDec) decodeSequences(hist *history) error {
+	if cap(b.sequence) < hist.decoders.nSeqs {
+		if b.lowMem {
+			b.sequence = make([]seqVals, 0, hist.decoders.nSeqs)
+		} else {
+			b.sequence = make([]seqVals, 0, 0x7F00+0xffff)
+		}
 	}
+	b.sequence = b.sequence[:hist.decoders.nSeqs]
+	if hist.decoders.nSeqs == 0 {
+		hist.decoders.seqSize = len(hist.decoders.literals)
+		return nil
+	}
+	hist.decoders.windowSize = hist.windowSize
+	hist.decoders.prevOffset = hist.recentOffsets
 
-	// TODO: Investigate if sending history without decoders are faster.
-	//   This would allow the sequences to be decoded async and only have to construct stream history.
-	//   If only recent offsets were not transferred, this would be an obvious win.
-	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
+	err := hist.decoders.decode(b.sequence)
+	hist.recentOffsets = hist.decoders.prevOffset
+	return err
+}
 
+func (b *blockDec) executeSequences(hist *history) error {
 	hbytes := hist.b
 	if len(hbytes) > hist.windowSize {
 		hbytes = hbytes[len(hbytes)-hist.windowSize:]
-		// We do not need history any more.
+		// We do not need history anymore.
 		if hist.dict != nil {
 			hist.dict.content = nil
 		}
 	}
-
-	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
-		println("initializing sequences:", err)
-		return err
-	}
-
-	err = seqs.decode(nSeqs, br, hbytes)
+	hist.decoders.windowSize = hist.windowSize
+	hist.decoders.out = b.dst[:0]
+	err := hist.decoders.execute(b.sequence, hbytes)
 	if err != nil {
 		return err
 	}
-	if !br.finished() {
-		return fmt.Errorf("%d extra bits on block, should be 0", br.remain())
-	}
+	return b.updateHistory(hist)
+}
 
-	err = br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
+func (b *blockDec) updateHistory(hist *history) error {
 	if len(b.data) > maxCompressedBlockSize {
 		return fmt.Errorf("compressed block size too large (%d)", len(b.data))
 	}
 	// Set output and release references.
-	b.dst = seqs.out
-	seqs.out, seqs.literals, seqs.hist = nil, nil, nil
+	b.dst = hist.decoders.out
+	hist.recentOffsets = hist.decoders.prevOffset
 
-	if !delayedHistory {
-		// If we don't have delayed history, no need to update.
-		hist.recentOffsets = seqs.prevOffset
-		return nil
-	}
 	if b.Last {
 		// if last block we don't care about history.
 		println("Last block, no history returned")
 		hist.b = hist.b[:0]
 		return nil
+	} else {
+		hist.append(b.dst)
+		if debugDecoder {
+			println("Finished block with ", len(b.sequence), "sequences. Added", len(b.dst), "to history, now length", len(hist.b))
+		}
 	}
-	hist.append(b.dst)
-	hist.recentOffsets = seqs.prevOffset
-	if debugDecoder {
-		println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.")
-	}
+	hist.decoders.out, hist.decoders.literals = nil, nil
 
 	return nil
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index aab71c6cf85..b80191e4b1e 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -113,6 +113,9 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
 func (r *readerWrapper) readByte() (byte, error) {
 	n2, err := r.r.Read(r.tmp[:1])
 	if err != nil {
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
+		}
 		return 0, err
 	}
 	if n2 != 1 {
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index f430f58b572..36119f385c5 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,9 +5,13 @@
 package zstd
 
 import (
-	"errors"
+	"bytes"
+	"context"
+	"encoding/binary"
 	"io"
 	"sync"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 // Decoder provides decoding of zstandard streams.
@@ -22,12 +26,19 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 
-	// Streams ready to be decoded.
-	stream chan decodeStream
-
 	// Current read position used for Reader functionality.
 	current decoderState
 
+	// sync stream decoding
+	syncStream struct {
+		decodedFrame uint64
+		br           readerWrapper
+		enabled      bool
+		inFrame      bool
+	}
+
+	frame *frameDec
+
 	// Custom dictionaries.
 	// Always uses copies.
 	dicts map[uint32]dict
@@ -46,7 +57,10 @@ type decoderState struct {
 	output chan decodeOutput
 
 	// cancel remaining output.
-	cancel chan struct{}
+	cancel context.CancelFunc
+
+	// crc of current frame
+	crc *xxhash.Digest
 
 	flushed bool
 }
@@ -81,7 +95,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 			return nil, err
 		}
 	}
-	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	d.current.crc = xxhash.New()
 	d.current.flushed = true
 
 	if r == nil {
@@ -130,7 +144,7 @@ func (d *Decoder) Read(p []byte) (int, error) {
 				break
 			}
 			if !d.nextBlock(n == 0) {
-				return n, nil
+				return n, d.current.err
 			}
 		}
 	}
@@ -162,6 +176,7 @@ func (d *Decoder) Reset(r io.Reader) error {
 
 	d.drainOutput()
 
+	d.syncStream.br.r = nil
 	if r == nil {
 		d.current.err = ErrDecoderNilInput
 		if len(d.current.b) > 0 {
@@ -195,33 +210,39 @@ func (d *Decoder) Reset(r io.Reader) error {
 		}
 		return nil
 	}
-
-	if d.stream == nil {
-		d.stream = make(chan decodeStream, 1)
-		d.streamWg.Add(1)
-		go d.startStreamDecoder(d.stream)
-	}
-
 	// Remove current block.
+	d.stashDecoder()
 	d.current.decodeOutput = decodeOutput{}
 	d.current.err = nil
-	d.current.cancel = make(chan struct{})
 	d.current.flushed = false
 	d.current.d = nil
 
-	d.stream <- decodeStream{
-		r:      r,
-		output: d.current.output,
-		cancel: d.current.cancel,
+	// Ensure no-one else is still running...
+	d.streamWg.Wait()
+	if d.frame == nil {
+		d.frame = newFrameDec(d.o)
+	}
+
+	if d.o.concurrent == 1 {
+		return d.startSyncDecoder(r)
 	}
+
+	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	ctx, cancel := context.WithCancel(context.Background())
+	d.current.cancel = cancel
+	d.streamWg.Add(1)
+	go d.startStreamDecoder(ctx, r, d.current.output)
+
 	return nil
 }
 
 // drainOutput will drain the output until errEndOfStream is sent.
 func (d *Decoder) drainOutput() {
 	if d.current.cancel != nil {
-		println("cancelling current")
-		close(d.current.cancel)
+		if debugDecoder {
+			println("cancelling current")
+		}
+		d.current.cancel()
 		d.current.cancel = nil
 	}
 	if d.current.d != nil {
@@ -243,12 +264,9 @@ func (d *Decoder) drainOutput() {
 			}
 			d.decoders <- v.d
 		}
-		if v.err == errEndOfStream {
-			println("current flushed")
-			d.current.flushed = true
-			return
-		}
 	}
+	d.current.output = nil
+	d.current.flushed = true
 }
 
 // WriteTo writes data to w until there's no more data to write or when an error occurs.
@@ -287,7 +305,7 @@ func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
 // DecodeAll can be used concurrently.
 // The Decoder concurrency limits will be respected.
 func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
-	if d.current.err == ErrDecoderClosed {
+	if d.decoders == nil {
 		return dst, ErrDecoderClosed
 	}
 
@@ -300,6 +318,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 		}
 		frame.rawInput = nil
 		frame.bBuf = nil
+		if frame.history.decoders.br != nil {
+			frame.history.decoders.br.in = nil
+		}
 		d.decoders <- block
 	}()
 	frame.bBuf = input
@@ -307,33 +328,39 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	for {
 		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
-		if err == io.EOF {
-			if debugDecoder {
-				println("frame reset return EOF")
+		if err != nil {
+			if err == io.EOF {
+				if debugDecoder {
+					println("frame reset return EOF")
+				}
+				return dst, nil
 			}
-			return dst, nil
+			return dst, err
 		}
 		if frame.DictionaryID != nil {
 			dict, ok := d.dicts[*frame.DictionaryID]
 			if !ok {
 				return nil, ErrUnknownDictionary
 			}
+			if debugDecoder {
+				println("setting dict", frame.DictionaryID)
+			}
 			frame.history.setDict(&dict)
 		}
-		if err != nil {
-			return dst, err
-		}
-		if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
-			return dst, ErrDecoderSizeExceeded
+		if frame.WindowSize > d.o.maxWindowSize {
+			return dst, ErrWindowSizeExceeded
 		}
-		if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
-			// Never preallocate moe than 1 GB up front.
+		if frame.FrameContentSize != fcsUnknown {
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+				return dst, ErrDecoderSizeExceeded
+			}
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
 				copy(dst2, dst)
 				dst = dst2
 			}
 		}
+
 		if cap(dst) == 0 {
 			// Allocate len(input) * 2 by default if nothing is provided
 			// and we didn't get frame content size.
@@ -368,33 +395,176 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 // If non-blocking mode is used the returned boolean will be false
 // if no data was available without blocking.
 func (d *Decoder) nextBlock(blocking bool) (ok bool) {
-	if d.current.d != nil {
-		if debugDecoder {
-			printf("re-adding current decoder %p", d.current.d)
-		}
-		d.decoders <- d.current.d
-		d.current.d = nil
-	}
 	if d.current.err != nil {
 		// Keep error state.
-		return blocking
+		return false
+	}
+	d.current.b = d.current.b[:0]
+
+	// SYNC:
+	if d.syncStream.enabled {
+		if !blocking {
+			return false
+		}
+		ok = d.nextBlockSync()
+		if !ok {
+			d.stashDecoder()
+		}
+		return ok
 	}
 
+	//ASYNC:
+	d.stashDecoder()
 	if blocking {
-		d.current.decodeOutput = <-d.current.output
+		d.current.decodeOutput, ok = <-d.current.output
 	} else {
 		select {
-		case d.current.decodeOutput = <-d.current.output:
+		case d.current.decodeOutput, ok = <-d.current.output:
 		default:
 			return false
 		}
 	}
+	if !ok {
+		// This should not happen, so signal error state...
+		d.current.err = io.ErrUnexpectedEOF
+		return false
+	}
+	next := d.current.decodeOutput
+	if next.d != nil && next.d.async.newHist != nil {
+		d.current.crc.Reset()
+	}
 	if debugDecoder {
-		println("got", len(d.current.b), "bytes, error:", d.current.err)
+		var tmp [4]byte
+		binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b)))
+		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
+	}
+
+	if !d.o.ignoreChecksum && len(next.b) > 0 {
+		n, err := d.current.crc.Write(next.b)
+		if err == nil {
+			if n != len(next.b) {
+				d.current.err = io.ErrShortWrite
+			}
+		}
+	}
+	if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
+		got := d.current.crc.Sum64()
+		var tmp [4]byte
+		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
+		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
+			if debugDecoder {
+				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+			}
+			d.current.err = ErrCRCMismatch
+		} else {
+			if debugDecoder {
+				println("CRC ok", tmp[:])
+			}
+		}
+	}
+
+	return true
+}
+
+func (d *Decoder) nextBlockSync() (ok bool) {
+	if d.current.d == nil {
+		d.current.d = <-d.decoders
+	}
+	for len(d.current.b) == 0 {
+		if !d.syncStream.inFrame {
+			d.frame.history.reset()
+			d.current.err = d.frame.reset(&d.syncStream.br)
+			if d.current.err != nil {
+				return false
+			}
+			if d.frame.DictionaryID != nil {
+				dict, ok := d.dicts[*d.frame.DictionaryID]
+				if !ok {
+					d.current.err = ErrUnknownDictionary
+					return false
+				} else {
+					d.frame.history.setDict(&dict)
+				}
+			}
+			if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
+				d.current.err = ErrDecoderSizeExceeded
+				return false
+			}
+
+			d.syncStream.decodedFrame = 0
+			d.syncStream.inFrame = true
+		}
+		d.current.err = d.frame.next(d.current.d)
+		if d.current.err != nil {
+			return false
+		}
+		d.frame.history.ensureBlock()
+		if debugDecoder {
+			println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame)
+		}
+		histBefore := len(d.frame.history.b)
+		d.current.err = d.current.d.decodeBuf(&d.frame.history)
+
+		if d.current.err != nil {
+			println("error after:", d.current.err)
+			return false
+		}
+		d.current.b = d.frame.history.b[histBefore:]
+		if debugDecoder {
+			println("history after:", len(d.frame.history.b))
+		}
+
+		// Check frame size (before CRC)
+		d.syncStream.decodedFrame += uint64(len(d.current.b))
+		if d.syncStream.decodedFrame > d.frame.FrameContentSize {
+			if debugDecoder {
+				printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+			}
+			d.current.err = ErrFrameSizeExceeded
+			return false
+		}
+
+		// Check FCS
+		if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize {
+			if debugDecoder {
+				printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+			}
+			d.current.err = ErrFrameSizeMismatch
+			return false
+		}
+
+		// Update/Check CRC
+		if d.frame.HasCheckSum {
+			if !d.o.ignoreChecksum {
+				d.frame.crc.Write(d.current.b)
+			}
+			if d.current.d.Last {
+				if !d.o.ignoreChecksum {
+					d.current.err = d.frame.checkCRC()
+				} else {
+					d.current.err = d.frame.consumeCRC()
+				}
+				if d.current.err != nil {
+					println("CRC error:", d.current.err)
+					return false
+				}
+			}
+		}
+		d.syncStream.inFrame = !d.current.d.Last
 	}
 	return true
 }
 
+func (d *Decoder) stashDecoder() {
+	if d.current.d != nil {
+		if debugDecoder {
+			printf("re-adding current decoder %p", d.current.d)
+		}
+		d.decoders <- d.current.d
+		d.current.d = nil
+	}
+}
+
 // Close will release all resources.
 // It is NOT possible to reuse the decoder after this.
 func (d *Decoder) Close() {
@@ -402,10 +572,10 @@ func (d *Decoder) Close() {
 		return
 	}
 	d.drainOutput()
-	if d.stream != nil {
-		close(d.stream)
+	if d.current.cancel != nil {
+		d.current.cancel()
 		d.streamWg.Wait()
-		d.stream = nil
+		d.current.cancel = nil
 	}
 	if d.decoders != nil {
 		close(d.decoders)
@@ -456,100 +626,307 @@ type decodeOutput struct {
 	err error
 }
 
-type decodeStream struct {
-	r io.Reader
-
-	// Blocks ready to be written to output.
-	output chan decodeOutput
-
-	// cancel reading from the input
-	cancel chan struct{}
+func (d *Decoder) startSyncDecoder(r io.Reader) error {
+	d.frame.history.reset()
+	d.syncStream.br = readerWrapper{r: r}
+	d.syncStream.inFrame = false
+	d.syncStream.enabled = true
+	d.syncStream.decodedFrame = 0
+	return nil
 }
 
-// errEndOfStream indicates that everything from the stream was read.
-var errEndOfStream = errors.New("end-of-stream")
-
 // Create Decoder:
-// Spawn n block decoders. These accept tasks to decode a block.
-// Create goroutine that handles stream processing, this will send history to decoders as they are available.
-// Decoders update the history as they decode.
-// When a block is returned:
-// 		a) history is sent to the next decoder,
-// 		b) content written to CRC.
-// 		c) return data to WRITER.
-// 		d) wait for next block to return data.
-// Once WRITTEN, the decoders reused by the writer frame decoder for re-use.
-func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
+// ASYNC:
+// Spawn 4 go routines.
+// 0: Read frames and decode blocks.
+// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
+// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
+// 3: Wait for stream history, execute sequences, send stream history.
+func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
 	defer d.streamWg.Done()
-	frame := newFrameDec(d.o)
-	for stream := range inStream {
-		if debugDecoder {
-			println("got new stream")
+	br := readerWrapper{r: r}
+
+	var seqPrepare = make(chan *blockDec, d.o.concurrent)
+	var seqDecode = make(chan *blockDec, d.o.concurrent)
+	var seqExecute = make(chan *blockDec, d.o.concurrent)
+
+	// Async 1: Prepare blocks...
+	go func() {
+		var hist history
+		var hasErr bool
+		for block := range seqPrepare {
+			if hasErr {
+				if block != nil {
+					seqDecode <- block
+				}
+				continue
+			}
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 1: new history")
+				}
+				hist.reset()
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
+				}
+			}
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqDecode <- block
+				continue
+			}
+
+			remain, err := block.decodeLiterals(block.data, &hist)
+			block.err = err
+			hasErr = block.err != nil
+			if err == nil {
+				block.async.literals = hist.decoders.literals
+				block.async.seqData = remain
+			} else if debugDecoder {
+				println("decodeLiterals error:", err)
+			}
+			seqDecode <- block
 		}
-		br := readerWrapper{r: stream.r}
-	decodeStream:
-		for {
-			frame.history.reset()
-			err := frame.reset(&br)
-			if debugDecoder && err != nil {
-				println("Frame decoder returned", err)
+		close(seqDecode)
+	}()
+
+	// Async 2: Decode sequences...
+	go func() {
+		var hist history
+		var hasErr bool
+
+		for block := range seqDecode {
+			if hasErr {
+				if block != nil {
+					seqExecute <- block
+				}
+				continue
 			}
-			if err == nil && frame.DictionaryID != nil {
-				dict, ok := d.dicts[*frame.DictionaryID]
-				if !ok {
-					err = ErrUnknownDictionary
-				} else {
-					frame.history.setDict(&dict)
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+				}
+				hist.decoders = block.async.newHist.decoders
+				hist.recentOffsets = block.async.newHist.recentOffsets
+				hist.windowSize = block.async.newHist.windowSize
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
 				}
 			}
-			if err != nil {
-				stream.output <- decodeOutput{
-					err: err,
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqExecute <- block
+				continue
+			}
+
+			hist.decoders.literals = block.async.literals
+			block.err = block.prepareSequences(block.async.seqData, &hist)
+			if debugDecoder && block.err != nil {
+				println("prepareSequences returned:", block.err)
+			}
+			hasErr = block.err != nil
+			if block.err == nil {
+				block.err = block.decodeSequences(&hist)
+				if debugDecoder && block.err != nil {
+					println("decodeSequences returned:", block.err)
 				}
-				break
+				hasErr = block.err != nil
+				//				block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs]
+				block.async.seqSize = hist.decoders.seqSize
 			}
-			if debugDecoder {
-				println("starting frame decoder")
-			}
-
-			// This goroutine will forward history between frames.
-			frame.frameDone.Add(1)
-			frame.initAsync()
-
-			go frame.startDecoder(stream.output)
-		decodeFrame:
-			// Go through all blocks of the frame.
-			for {
-				dec := <-d.decoders
-				select {
-				case <-stream.cancel:
-					if !frame.sendErr(dec, io.EOF) {
-						// To not let the decoder dangle, send it back.
-						stream.output <- decodeOutput{d: dec}
+			seqExecute <- block
+		}
+		close(seqExecute)
+	}()
+
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	// Async 3: Execute sequences...
+	frameHistCache := d.frame.history.b
+	go func() {
+		var hist history
+		var decodedFrame uint64
+		var fcs uint64
+		var hasErr bool
+		for block := range seqExecute {
+			out := decodeOutput{err: block.err, d: block}
+			if block.err != nil || hasErr {
+				hasErr = true
+				output <- out
+				continue
+			}
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 3: new history")
+				}
+				hist.windowSize = block.async.newHist.windowSize
+				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
+				}
+
+				if cap(hist.b) < hist.allocFrameBuffer {
+					if cap(frameHistCache) >= hist.allocFrameBuffer {
+						hist.b = frameHistCache
+					} else {
+						hist.b = make([]byte, 0, hist.allocFrameBuffer)
+						println("Alloc history sized", hist.allocFrameBuffer)
+					}
+				}
+				hist.b = hist.b[:0]
+				fcs = block.async.fcs
+				decodedFrame = 0
+			}
+			do := decodeOutput{err: block.err, d: block}
+			switch block.Type {
+			case blockTypeRLE:
+				if debugDecoder {
+					println("add rle block length:", block.RLESize)
+				}
+
+				if cap(block.dst) < int(block.RLESize) {
+					if block.lowMem {
+						block.dst = make([]byte, block.RLESize)
+					} else {
+						block.dst = make([]byte, maxBlockSize)
+					}
+				}
+				block.dst = block.dst[:block.RLESize]
+				v := block.data[0]
+				for i := range block.dst {
+					block.dst[i] = v
+				}
+				hist.append(block.dst)
+				do.b = block.dst
+			case blockTypeRaw:
+				if debugDecoder {
+					println("add raw block length:", len(block.data))
+				}
+				hist.append(block.data)
+				do.b = block.data
+			case blockTypeCompressed:
+				if debugDecoder {
+					println("execute with history length:", len(hist.b), "window:", hist.windowSize)
+				}
+				hist.decoders.seqSize = block.async.seqSize
+				hist.decoders.literals = block.async.literals
+				do.err = block.executeSequences(&hist)
+				hasErr = do.err != nil
+				if debugDecoder && hasErr {
+					println("executeSequences returned:", do.err)
+				}
+				do.b = block.dst
+			}
+			if !hasErr {
+				decodedFrame += uint64(len(do.b))
+				if decodedFrame > fcs {
+					println("fcs exceeded", block.Last, fcs, decodedFrame)
+					do.err = ErrFrameSizeExceeded
+					hasErr = true
+				} else if block.Last && fcs != fcsUnknown && decodedFrame != fcs {
+					do.err = ErrFrameSizeMismatch
+					hasErr = true
+				} else {
+					if debugDecoder {
+						println("fcs ok", block.Last, fcs, decodedFrame)
 					}
-					break decodeStream
-				default:
 				}
-				err := frame.next(dec)
-				switch err {
-				case io.EOF:
-					// End of current frame, no error
-					println("EOF on next block")
-					break decodeFrame
-				case nil:
-					continue
-				default:
-					println("block decoder returned", err)
-					break decodeStream
+			}
+			output <- do
+		}
+		close(output)
+		frameHistCache = hist.b
+		wg.Done()
+		if debugDecoder {
+			println("decoder goroutines finished")
+		}
+	}()
+
+decodeStream:
+	for {
+		frame := d.frame
+		if debugDecoder {
+			println("New frame...")
+		}
+		var historySent bool
+		frame.history.reset()
+		err := frame.reset(&br)
+		if debugDecoder && err != nil {
+			println("Frame decoder returned", err)
+		}
+		if err == nil && frame.DictionaryID != nil {
+			dict, ok := d.dicts[*frame.DictionaryID]
+			if !ok {
+				err = ErrUnknownDictionary
+			} else {
+				frame.history.setDict(&dict)
+			}
+		}
+		if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
+			err = ErrDecoderSizeExceeded
+		}
+		if err != nil {
+			select {
+			case <-ctx.Done():
+			case dec := <-d.decoders:
+				dec.sendErr(err)
+				seqPrepare <- dec
+			}
+			break decodeStream
+		}
+
+		// Go through all blocks of the frame.
+		for {
+			var dec *blockDec
+			select {
+			case <-ctx.Done():
+				break decodeStream
+			case dec = <-d.decoders:
+				// Once we have a decoder, we MUST return it.
+			}
+			err := frame.next(dec)
+			if !historySent {
+				h := frame.history
+				if debugDecoder {
+					println("Alloc History:", h.allocFrameBuffer)
+				}
+				dec.async.newHist = &h
+				dec.async.fcs = frame.FrameContentSize
+				historySent = true
+			} else {
+				dec.async.newHist = nil
+			}
+			if debugDecoder && err != nil {
+				println("next block returned error:", err)
+			}
+			dec.err = err
+			dec.checkCRC = nil
+			if dec.Last && frame.HasCheckSum && err == nil {
+				crc, err := frame.rawInput.readSmall(4)
+				if err != nil {
+					println("CRC missing?", err)
+					dec.err = err
+				}
+				var tmp [4]byte
+				copy(tmp[:], crc)
+				dec.checkCRC = tmp[:]
+				if debugDecoder {
+					println("found crc to check:", dec.checkCRC)
 				}
 			}
-			// All blocks have started decoding, check if there are more frames.
-			println("waiting for done")
-			frame.frameDone.Wait()
-			println("done waiting...")
+			err = dec.err
+			last := dec.Last
+			seqPrepare <- dec
+			if err != nil {
+				break decodeStream
+			}
+			if last {
+				break
+			}
 		}
-		frame.frameDone.Wait()
-		println("Sending EOS")
-		stream.output <- decodeOutput{err: errEndOfStream}
 	}
+	close(seqPrepare)
+	wg.Wait()
+	d.frame.history.b = frameHistCache
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 95cc9b8b81f..c70e6fa0f73 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -19,6 +19,7 @@ type decoderOptions struct {
 	maxDecodedSize uint64
 	maxWindowSize  uint64
 	dicts          []dict
+	ignoreChecksum bool
 }
 
 func (o *decoderOptions) setDefault() {
@@ -28,7 +29,10 @@ func (o *decoderOptions) setDefault() {
 		concurrent:    runtime.GOMAXPROCS(0),
 		maxWindowSize: MaxWindowSize,
 	}
-	o.maxDecodedSize = 1 << 63
+	if o.concurrent > 4 {
+		o.concurrent = 4
+	}
+	o.maxDecodedSize = 64 << 30
 }
 
 // WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -37,16 +41,25 @@ func WithDecoderLowmem(b bool) DOption {
 	return func(o *decoderOptions) error { o.lowMem = b; return nil }
 }
 
-// WithDecoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
-// The value supplied must be at least 1.
-// By default this will be set to GOMAXPROCS.
+// WithDecoderConcurrency sets the number of created decoders.
+// When decoding blocks with DecodeAll, this will limit the number
+// of possible concurrently running decodes.
+// When decoding streams, this will limit the number of
+// inflight blocks.
+// When decoding streams and setting maximum to 1,
+// no async decoding will be done.
+// When a value of 0 is provided GOMAXPROCS will be used.
+// By default this will be set to 4 or GOMAXPROCS, whichever is lower.
 func WithDecoderConcurrency(n int) DOption {
 	return func(o *decoderOptions) error {
-		if n <= 0 {
+		if n < 0 {
 			return errors.New("concurrency must be at least 1")
 		}
-		o.concurrent = n
+		if n == 0 {
+			o.concurrent = runtime.GOMAXPROCS(0)
+		} else {
+			o.concurrent = n
+		}
 		return nil
 	}
 }
@@ -54,7 +67,7 @@ func WithDecoderConcurrency(n int) DOption {
 // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
 // non-streaming operations or maximum window size for streaming operations.
 // This can be used to control memory usage of potentially hostile content.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
 func WithDecoderMaxMemory(n uint64) DOption {
 	return func(o *decoderOptions) error {
 		if n == 0 {
@@ -100,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
 		return nil
 	}
 }
+
+// IgnoreChecksum allows forcibly ignoring checksum checking.
+func IgnoreChecksum(b bool) DOption {
+	return func(o *decoderOptions) error {
+		o.ignoreChecksum = b
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index 5f08a283023..f51ab529a0b 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -85,7 +85,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 7
+	const kSearchStrength = 6
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -334,7 +334,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 8
+	const kSearchStrength = 6
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index e6e315969b0..dcc987a7cb6 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -98,23 +98,25 @@ func (e *Encoder) Reset(w io.Writer) {
 	if cap(s.filling) == 0 {
 		s.filling = make([]byte, 0, e.o.blockSize)
 	}
-	if cap(s.current) == 0 {
-		s.current = make([]byte, 0, e.o.blockSize)
-	}
-	if cap(s.previous) == 0 {
-		s.previous = make([]byte, 0, e.o.blockSize)
+	if e.o.concurrent > 1 {
+		if cap(s.current) == 0 {
+			s.current = make([]byte, 0, e.o.blockSize)
+		}
+		if cap(s.previous) == 0 {
+			s.previous = make([]byte, 0, e.o.blockSize)
+		}
+		s.current = s.current[:0]
+		s.previous = s.previous[:0]
+		if s.writing == nil {
+			s.writing = &blockEnc{lowMem: e.o.lowMem}
+			s.writing.init()
+		}
+		s.writing.initNewEncode()
 	}
 	if s.encoder == nil {
 		s.encoder = e.o.encoder()
 	}
-	if s.writing == nil {
-		s.writing = &blockEnc{lowMem: e.o.lowMem}
-		s.writing.init()
-	}
-	s.writing.initNewEncode()
 	s.filling = s.filling[:0]
-	s.current = s.current[:0]
-	s.previous = s.previous[:0]
 	s.encoder.Reset(e.o.dict, false)
 	s.headerWritten = false
 	s.eofWritten = false
@@ -258,6 +260,46 @@ func (e *Encoder) nextBlock(final bool) error {
 		return s.err
 	}
 
+	// SYNC:
+	if e.o.concurrent == 1 {
+		src := s.filling
+		s.nInput += int64(len(s.filling))
+		if debugEncoder {
+			println("Adding sync block,", len(src), "bytes, final:", final)
+		}
+		enc := s.encoder
+		blk := enc.Block()
+		blk.reset(nil)
+		enc.Encode(blk, src)
+		blk.last = final
+		if final {
+			s.eofWritten = true
+		}
+
+		err := errIncompressible
+		// If we got the exact same number of literals as input,
+		// assume the literals cannot be compressed.
+		if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
+			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+		}
+		switch err {
+		case errIncompressible:
+			if debugEncoder {
+				println("Storing incompressible block as raw")
+			}
+			blk.encodeRaw(src)
+			// In fast mode, we do not transfer offsets, so we don't have to deal with changing them.
+		case nil:
+		default:
+			s.err = err
+			return err
+		}
+		_, s.err = s.w.Write(blk.output)
+		s.nWritten += int64(len(blk.output))
+		s.filling = s.filling[:0]
+		return s.err
+	}
+
 	// Move blocks forward.
 	s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
 	s.nInput += int64(len(s.current))
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 5f2e1d020ee..44d8dbd199a 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -76,6 +76,7 @@ func WithEncoderCRC(b bool) EOption {
 // WithEncoderConcurrency will set the concurrency,
 // meaning the maximum number of encoders to run concurrently.
 // The value supplied must be at least 1.
+// For streams, setting a value of 1 will disable async compression.
 // By default this will be set to GOMAXPROCS.
 func WithEncoderConcurrency(n int) EOption {
 	return func(o *encoderOptions) error {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 989c79f8c31..3ff109cce4b 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -8,23 +8,17 @@ import (
 	"bytes"
 	"encoding/hex"
 	"errors"
-	"hash"
 	"io"
-	"sync"
 
 	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 type frameDec struct {
-	o      decoderOptions
-	crc    hash.Hash64
-	offset int64
+	o   decoderOptions
+	crc *xxhash.Digest
 
 	WindowSize uint64
 
-	// In order queue of blocks being decoded.
-	decoding chan *blockDec
-
 	// Frame history passed between blocks
 	history history
 
@@ -34,15 +28,10 @@ type frameDec struct {
 	bBuf byteBuf
 
 	FrameContentSize uint64
-	frameDone        sync.WaitGroup
 
 	DictionaryID  *uint32
 	HasCheckSum   bool
 	SingleSegment bool
-
-	// asyncRunning indicates whether the async routine processes input on 'decoding'.
-	asyncRunningMu sync.Mutex
-	asyncRunning   bool
 }
 
 const (
@@ -208,7 +197,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 	default:
 		fcsSize = 1 << v
 	}
-	d.FrameContentSize = 0
+	d.FrameContentSize = fcsUnknown
 	if fcsSize > 0 {
 		b, err := br.readSmall(fcsSize)
 		if err != nil {
@@ -229,9 +218,10 @@ func (d *frameDec) reset(br byteBuffer) error {
 			d.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
 		}
 		if debugDecoder {
-			println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize)
+			println("Read FCS:", d.FrameContentSize)
 		}
 	}
+
 	// Move this to shared.
 	d.HasCheckSum = fhd&(1<<2) != 0
 	if d.HasCheckSum {
@@ -264,10 +254,16 @@ func (d *frameDec) reset(br byteBuffer) error {
 	}
 	d.history.windowSize = int(d.WindowSize)
 	if d.o.lowMem && d.history.windowSize < maxBlockSize {
-		d.history.maxSize = d.history.windowSize * 2
+		d.history.allocFrameBuffer = d.history.windowSize * 2
+		// TODO: Maybe use FrameContent size
 	} else {
-		d.history.maxSize = d.history.windowSize + maxBlockSize
+		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
 	}
+
+	if debugDecoder {
+		println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum)
+	}
+
 	// history contains input - maybe we do something
 	d.rawInput = br
 	return nil
@@ -276,62 +272,24 @@ func (d *frameDec) reset(br byteBuffer) error {
 // next will start decoding the next block from stream.
 func (d *frameDec) next(block *blockDec) error {
 	if debugDecoder {
-		printf("decoding new block %p:%p", block, block.data)
+		println("decoding new block")
 	}
 	err := block.reset(d.rawInput, d.WindowSize)
 	if err != nil {
 		println("block error:", err)
 		// Signal the frame decoder we have a problem.
-		d.sendErr(block, err)
+		block.sendErr(err)
 		return err
 	}
-	block.input <- struct{}{}
-	if debugDecoder {
-		println("next block:", block)
-	}
-	d.asyncRunningMu.Lock()
-	defer d.asyncRunningMu.Unlock()
-	if !d.asyncRunning {
-		return nil
-	}
-	if block.Last {
-		// We indicate the frame is done by sending io.EOF
-		d.decoding <- block
-		return io.EOF
-	}
-	d.decoding <- block
 	return nil
 }
 
-// sendEOF will queue an error block on the frame.
-// This will cause the frame decoder to return when it encounters the block.
-// Returns true if the decoder was added.
-func (d *frameDec) sendErr(block *blockDec, err error) bool {
-	d.asyncRunningMu.Lock()
-	defer d.asyncRunningMu.Unlock()
-	if !d.asyncRunning {
-		return false
-	}
-
-	println("sending error", err.Error())
-	block.sendErr(err)
-	d.decoding <- block
-	return true
-}
-
 // checkCRC will check the checksum if the frame has one.
 // Will return ErrCRCMismatch if crc check failed, otherwise nil.
 func (d *frameDec) checkCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
-	var tmp [4]byte
-	got := d.crc.Sum64()
-	// Flip to match file order.
-	tmp[0] = byte(got >> 0)
-	tmp[1] = byte(got >> 8)
-	tmp[2] = byte(got >> 16)
-	tmp[3] = byte(got >> 24)
 
 	// We can overwrite upper tmp now
 	want, err := d.rawInput.readSmall(4)
@@ -340,6 +298,18 @@ func (d *frameDec) checkCRC() error {
 		return err
 	}
 
+	if d.o.ignoreChecksum {
+		return nil
+	}
+
+	var tmp [4]byte
+	got := d.crc.Sum64()
+	// Flip to match file order.
+	tmp[0] = byte(got >> 0)
+	tmp[1] = byte(got >> 8)
+	tmp[2] = byte(got >> 16)
+	tmp[3] = byte(got >> 24)
+
 	if !bytes.Equal(tmp[:], want) {
 		if debugDecoder {
 			println("CRC Check Failed:", tmp[:], "!=", want)
@@ -352,123 +322,17 @@ func (d *frameDec) checkCRC() error {
 	return nil
 }
 
-func (d *frameDec) initAsync() {
-	if !d.o.lowMem && !d.SingleSegment {
-		// set max extra size history to 2MB.
-		d.history.maxSize = d.history.windowSize + maxBlockSize
-	}
-	// re-alloc if more than one extra block size.
-	if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
-		d.history.b = make([]byte, 0, d.history.maxSize)
-	}
-	if cap(d.history.b) < d.history.maxSize {
-		d.history.b = make([]byte, 0, d.history.maxSize)
-	}
-	if cap(d.decoding) < d.o.concurrent {
-		d.decoding = make(chan *blockDec, d.o.concurrent)
-	}
-	if debugDecoder {
-		h := d.history
-		printf("history init. len: %d, cap: %d", len(h.b), cap(h.b))
-	}
-	d.asyncRunningMu.Lock()
-	d.asyncRunning = true
-	d.asyncRunningMu.Unlock()
-}
-
-// startDecoder will start decoding blocks and write them to the writer.
-// The decoder will stop as soon as an error occurs or at end of frame.
-// When the frame has finished decoding the *bufio.Reader
-// containing the remaining input will be sent on frameDec.frameDone.
-func (d *frameDec) startDecoder(output chan decodeOutput) {
-	written := int64(0)
-
-	defer func() {
-		d.asyncRunningMu.Lock()
-		d.asyncRunning = false
-		d.asyncRunningMu.Unlock()
-
-		// Drain the currently decoding.
-		d.history.error = true
-	flushdone:
-		for {
-			select {
-			case b := <-d.decoding:
-				b.history <- &d.history
-				output <- <-b.result
-			default:
-				break flushdone
-			}
-		}
-		println("frame decoder done, signalling done")
-		d.frameDone.Done()
-	}()
-	// Get decoder for first block.
-	block := <-d.decoding
-	block.history <- &d.history
-	for {
-		var next *blockDec
-		// Get result
-		r := <-block.result
-		if r.err != nil {
-			println("Result contained error", r.err)
-			output <- r
-			return
-		}
-		if debugDecoder {
-			println("got result, from ", d.offset, "to", d.offset+int64(len(r.b)))
-			d.offset += int64(len(r.b))
-		}
-		if !block.Last {
-			// Send history to next block
-			select {
-			case next = <-d.decoding:
-				if debugDecoder {
-					println("Sending ", len(d.history.b), "bytes as history")
-				}
-				next.history <- &d.history
-			default:
-				// Wait until we have sent the block, so
-				// other decoders can potentially get the decoder.
-				next = nil
-			}
-		}
-
-		// Add checksum, async to decoding.
-		if d.HasCheckSum {
-			n, err := d.crc.Write(r.b)
-			if err != nil {
-				r.err = err
-				if n != len(r.b) {
-					r.err = io.ErrShortWrite
-				}
-				output <- r
-				return
-			}
-		}
-		written += int64(len(r.b))
-		if d.SingleSegment && uint64(written) > d.FrameContentSize {
-			println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize)
-			r.err = ErrFrameSizeExceeded
-			output <- r
-			return
-		}
-		if block.Last {
-			r.err = d.checkCRC()
-			output <- r
-			return
-		}
-		output <- r
-		if next == nil {
-			// There was no decoder available, we wait for one now that we have sent to the writer.
-			if debugDecoder {
-				println("Sending ", len(d.history.b), " bytes as history")
-			}
-			next = <-d.decoding
-			next.history <- &d.history
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+	if d.HasCheckSum {
+		_, err := d.rawInput.readSmall(4)
+		if err != nil {
+			println("CRC missing?", err)
+			return err
 		}
-		block = next
 	}
+
+	return nil
 }
 
 // runDecoder will create a sync decoder that will decode a block of data.
@@ -477,8 +341,22 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 
 	// We use the history for output to avoid copying it.
 	d.history.b = dst
+	d.history.ignoreBuffer = len(dst)
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
+	d.history.decoders.maxSyncLen = 0
+	if d.FrameContentSize != fcsUnknown {
+		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+			return dst, ErrDecoderSizeExceeded
+		}
+		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+			// Alloc for output
+			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+			copy(dst2, dst)
+			dst = dst2
+		}
+	}
 	var err error
 	for {
 		err = dec.reset(d.rawInput, d.WindowSize)
@@ -489,29 +367,41 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 			println("next block:", dec)
 		}
 		err = dec.decodeBuf(&d.history)
-		if err != nil || dec.Last {
+		if err != nil {
 			break
 		}
 		if uint64(len(d.history.b)) > d.o.maxDecodedSize {
 			err = ErrDecoderSizeExceeded
 			break
 		}
-		if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize {
-			println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
+		if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
+			println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
 			err = ErrFrameSizeExceeded
 			break
 		}
+		if dec.Last {
+			break
+		}
+		if debugDecoder {
+			println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
+		}
 	}
 	dst = d.history.b
 	if err == nil {
-		if d.HasCheckSum {
-			var n int
-			n, err = d.crc.Write(dst[crcStart:])
-			if err == nil {
-				if n != len(dst)-crcStart {
-					err = io.ErrShortWrite
-				} else {
-					err = d.checkCRC()
+		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
+			err = ErrFrameSizeMismatch
+		} else if d.HasCheckSum {
+			if d.o.ignoreChecksum {
+				err = d.consumeCRC()
+			} else {
+				var n int
+				n, err = d.crc.Write(dst[crcStart:])
+				if err == nil {
+					if n != len(dst)-crcStart {
+						err = io.ErrShortWrite
+					} else {
+						err = d.checkCRC()
+					}
 				}
 			}
 		}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index bb3d4fd6c31..fde4e6b6011 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -5,8 +5,10 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 )
 
 const (
@@ -182,6 +184,29 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	return s.buildDtable()
 }
 
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+	fatalErr := func(err error) {
+		if err != nil {
+			panic(err)
+		}
+	}
+	// 	dt             [maxTablesize]decSymbol // Decompression table.
+	//	symbolLen      uint16                  // Length of active part of the symbol table.
+	//	actualTableLog uint8                   // Selected tablelog.
+	//	maxBits        uint8                   // Maximum number of additional bits
+	//	// used for table creation to avoid allocations.
+	//	stateTable [256]uint16
+	//	norm       [maxSymbolValue + 1]int16
+	//	preDefined bool
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
 // decSymbol contains information about a state entry,
 // Including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index f783e32d251..28b40153cc2 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -10,20 +10,31 @@ import (
 
 // history contains the information transferred between blocks.
 type history struct {
-	b             []byte
-	huffTree      *huff0.Scratch
-	recentOffsets [3]int
+	// Literal decompression
+	huffTree *huff0.Scratch
+
+	// Sequence decompression
 	decoders      sequenceDecs
-	windowSize    int
-	maxSize       int
-	error         bool
-	dict          *dict
+	recentOffsets [3]int
+
+	// History buffer...
+	b []byte
+
+	// ignoreBuffer is meant to ignore a number of bytes
+	// when checking for matches in history
+	ignoreBuffer int
+
+	windowSize       int
+	allocFrameBuffer int // needed?
+	error            bool
+	dict             *dict
 }
 
 // reset will reset the history to initial state of a frame.
 // The history must already have been initialized to the desired size.
 func (h *history) reset() {
 	h.b = h.b[:0]
+	h.ignoreBuffer = 0
 	h.error = false
 	h.recentOffsets = [3]int{1, 4, 8}
 	if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
@@ -35,7 +46,7 @@ func (h *history) reset() {
 	if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
 		fseDecoderPool.Put(f)
 	}
-	h.decoders = sequenceDecs{}
+	h.decoders = sequenceDecs{br: h.decoders.br}
 	if h.huffTree != nil {
 		if h.dict == nil || h.dict.litEnc != h.huffTree {
 			huffDecoderPool.Put(h.huffTree)
@@ -54,6 +65,7 @@ func (h *history) setDict(dict *dict) {
 	h.decoders.litLengths = dict.llDec
 	h.decoders.offsets = dict.ofDec
 	h.decoders.matchLengths = dict.mlDec
+	h.decoders.dict = dict.content
 	h.recentOffsets = dict.offsets
 	h.huffTree = dict.litEnc
 }
@@ -83,6 +95,24 @@ func (h *history) append(b []byte) {
 	copy(h.b[h.windowSize-len(b):], b)
 }
 
+// ensureBlock will ensure there is space for at least one block...
+func (h *history) ensureBlock() {
+	if cap(h.b) < h.allocFrameBuffer {
+		h.b = make([]byte, 0, h.allocFrameBuffer)
+		return
+	}
+
+	avail := cap(h.b) - len(h.b)
+	if avail >= h.windowSize || avail > maxCompressedBlockSize {
+		return
+	}
+	// Move data down so we only have window size left.
+	// We know there is less than window size of free space in b at this point.
+	discard := len(h.b) - h.windowSize
+	copy(h.b, h.b[discard:])
+	h.b = h.b[:h.windowSize]
+}
+
 // append bytes to history without ever discarding anything.
 func (h *history) appendKeep(b []byte) {
 	h.b = append(h.b, b...)
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index bc731e4cb69..e80139dd9c6 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -20,6 +20,10 @@ type seq struct {
 	llCode, mlCode, ofCode uint8
 }
 
+type seqVals struct {
+	ll, ml, mo int
+}
+
 func (s seq) String() string {
 	if s.offset <= 3 {
 		if s.offset == 0 {
@@ -61,16 +65,19 @@ type sequenceDecs struct {
 	offsets      sequenceDec
 	matchLengths sequenceDec
 	prevOffset   [3]int
-	hist         []byte
 	dict         []byte
 	literals     []byte
 	out          []byte
+	nSeqs        int
+	br           *bitReader
+	seqSize      int
 	windowSize   int
 	maxBits      uint8
+	maxSyncLen   uint64
 }
 
 // initialize all 3 decoders from the stream input.
-func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error {
+func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error {
 	if err := s.litLengths.init(br); err != nil {
 		return errors.New("litLengths:" + err.Error())
 	}
@@ -80,8 +87,7 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	if err := s.matchLengths.init(br); err != nil {
 		return errors.New("matchLengths:" + err.Error())
 	}
-	s.literals = literals
-	s.hist = hist.b
+	s.br = br
 	s.prevOffset = hist.recentOffsets
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
 	s.windowSize = hist.windowSize
@@ -93,12 +99,127 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	return nil
 }
 
+// execute will execute the decoded sequence with the provided history.
+// The sequence must be evaluated before being sent.
+func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+	if len(s.dict) == 0 {
+		return s.executeSimple(seqs, hist)
+	}
+
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Copy from dictionary...
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			if len(s.dict) == 0 {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+			}
+
+			// we may be in dictionary.
+			dictO := len(s.dict) - (seq.mo - (t + len(hist)))
+			if dictO < 0 || dictO >= len(s.dict) {
+				return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict))
+			}
+			end := dictO + seq.ml
+			if end > len(s.dict) {
+				n := len(s.dict) - dictO
+				copy(out[t:], s.dict[dictO:])
+				t += n
+				seq.ml -= n
+			} else {
+				copy(out[t:], s.dict[dictO:end])
+				t += end - dictO
+				continue
+			}
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history from end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+		// We must be in current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+				continue
+			} else {
+				// Overlapping copy
+				// Extend destination slice and copy one byte at a time.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// Destination is the space we just added.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
+
 // decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
+	if true {
+		supported, err := s.decodeSyncSimple(hist)
+		if supported {
+			return err
+		}
+	}
+	br := s.br
+	seqs := s.nSeqs
 	startSize := len(s.out)
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	out := s.out
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
 
 	for i := seqs - 1; i >= 0; i-- {
 		if br.overread() {
@@ -151,7 +272,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 
 					if temp == 0 {
 						// 0 is not valid; input is corrupted; force offset to 1
-						println("temp was 0")
+						println("WARNING: temp was 0")
 						temp = 1
 					}
 
@@ -176,51 +297,49 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		if ll > len(s.literals) {
 			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
 		}
-		size := ll + ml + len(s.out)
+		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size", size)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
 		}
-		if size > cap(s.out) {
+		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
 			// but could be if destination slice is too small for sync operations.
 			// over-allocating here can create a large amount of GC pressure so we try to keep
 			// it as contained as possible
-			used := len(s.out) - startSize
+			used := len(out) - startSize
 			addBytes := 256 + ll + ml + used>>2
 			// Clamp to max block size.
 			if used+addBytes > maxBlockSize {
 				addBytes = maxBlockSize - used
 			}
-			s.out = append(s.out, make([]byte, addBytes)...)
-			s.out = s.out[:len(s.out)-addBytes]
+			out = append(out, make([]byte, addBytes)...)
+			out = out[:len(out)-addBytes]
 		}
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 
 		// Add literals
-		s.out = append(s.out, s.literals[:ll]...)
+		out = append(out, s.literals[:ll]...)
 		s.literals = s.literals[ll:]
-		out := s.out
 
 		if mo == 0 && ml > 0 {
 			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 		}
 
-		if mo > len(s.out)+len(hist) || mo > s.windowSize {
+		if mo > len(out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 
 			// we may be in dictionary.
-			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
 				out = append(out, s.dict[dictO:]...)
-				mo -= len(s.dict) - dictO
 				ml -= len(s.dict) - dictO
 			} else {
 				out = append(out, s.dict[dictO:end]...)
@@ -231,26 +350,25 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
-		if v := mo - len(s.out); v > 0 {
+		if v := mo - len(out); v > 0 {
 			// v is the start position in history from end.
-			start := len(s.hist) - v
+			start := len(hist) - v
 			if ml > v {
 				// Some goes into current block.
 				// Copy remainder of history
-				out = append(out, s.hist[start:]...)
-				mo -= v
+				out = append(out, hist[start:]...)
 				ml -= v
 			} else {
-				out = append(out, s.hist[start:start+ml]...)
+				out = append(out, hist[start:start+ml]...)
 				ml = 0
 			}
 		}
 		// We must be in current buffer now
 		if ml > 0 {
-			start := len(s.out) - mo
-			if ml <= len(s.out)-start {
+			start := len(out) - mo
+			if ml <= len(out)-start {
 				// No overlap
-				out = append(out, s.out[start:start+ml]...)
+				out = append(out, out[start:start+ml]...)
 			} else {
 				// Overlapping copy
 				// Extend destination slice and copy one byte at the time.
@@ -264,7 +382,6 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 				}
 			}
 		}
-		s.out = out
 		if i == 0 {
 			// This is the last sequence, so we shouldn't update state.
 			break
@@ -291,9 +408,14 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		}
 	}
 
+	// Check if space for literals
+	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+	}
+
 	// Add final literals
-	s.out = append(s.out, s.literals...)
-	return nil
+	s.out = append(out, s.literals...)
+	return br.close()
 }
 
 // update states, at least 27 bits must be available.
@@ -457,36 +579,3 @@ func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int {
 	s.prevOffset[0] = temp
 	return temp
 }
-
-// mergeHistory will merge history.
-func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) {
-	for i := uint(0); i < 3; i++ {
-		var sNew, sHist *sequenceDec
-		switch i {
-		default:
-			// same as "case 0":
-			sNew = &s.litLengths
-			sHist = &hist.litLengths
-		case 1:
-			sNew = &s.offsets
-			sHist = &hist.offsets
-		case 2:
-			sNew = &s.matchLengths
-			sHist = &hist.matchLengths
-		}
-		if sNew.repeat {
-			if sHist.fse == nil {
-				return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i)
-			}
-			continue
-		}
-		if sNew.fse == nil {
-			return nil, fmt.Errorf("sequence stream %d, no fse found", i)
-		}
-		if sHist.fse != nil && !sHist.fse.preDefined {
-			fseDecoderPool.Put(sHist.fse)
-		}
-		sHist.fse = sNew.fse
-	}
-	return hist, nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
new file mode 100644
index 00000000000..4676b09cc18
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -0,0 +1,350 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+	"fmt"
+
+	"github.com/klauspost/compress/internal/cpuinfo"
+)
+
+type decodeSyncAsmContext struct {
+	llTable     []decSymbol
+	mlTable     []decSymbol
+	ofTable     []decSymbol
+	llState     uint64
+	mlState     uint64
+	ofState     uint64
+	iteration   int
+	litRemain   int
+	out         []byte
+	outPosition int
+	literals    []byte
+	litPosition int
+	history     []byte
+	windowSize  int
+	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
+	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
+	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	if len(s.dict) > 0 {
+		return false, nil
+	}
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
+		return false, nil
+	}
+	useSafe := false
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+		useSafe = true
+	}
+	if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+		useSafe = true
+	}
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeSyncAsmContext{
+		llTable:     s.litLengths.fse.dt[:maxTablesize],
+		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:     s.offsets.fse.dt[:maxTablesize],
+		llState:     uint64(s.litLengths.state.state),
+		mlState:     uint64(s.matchLengths.state.state),
+		ofState:     uint64(s.offsets.state.state),
+		iteration:   s.nSeqs - 1,
+		litRemain:   len(s.literals),
+		out:         s.out,
+		outPosition: len(s.out),
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+		history:     hist,
+	}
+
+	s.seqSize = 0
+	startSize := len(s.out)
+
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+		}
+	} else {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+		}
+	}
+	switch errCode {
+	case noError:
+		break
+
+	case errorMatchLenOfsMismatch:
+		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+	case errorMatchLenTooBig:
+		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+	case errorMatchOffTooBig:
+		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+	case errorNotEnoughLiterals:
+		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
+			ctx.ll, ctx.litRemain+ctx.ll)
+
+	case errorNotEnoughSpace:
+		size := ctx.outPosition + ctx.ll + ctx.ml
+		if debugDecoder {
+			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
+		}
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+
+	default:
+		return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+		return true, err
+	}
+
+	s.literals = s.literals[ctx.litPosition:]
+	t := ctx.outPosition
+	s.out = s.out[:t]
+
+	// Add final literals
+	s.out = append(s.out, s.literals...)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(s.out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+		}
+	}
+
+	return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
+type decodeAsmContext struct {
+	llTable   []decSymbol
+	mlTable   []decSymbol
+	ofTable   []decSymbol
+	llState   uint64
+	mlState   uint64
+	ofState   uint64
+	iteration int
+	seqs      []seqVals
+	litRemain int
+}
+
+const noError = 0
+
+// error reported when mo == 0 && ml > 0
+const errorMatchLenOfsMismatch = 1
+
+// error reported when ml > maxMatchLen
+const errorMatchLenTooBig = 2
+
+// error reported when mo > available history or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exceeds the literal buffer size
+const errorNotEnoughLiterals = 4
+
+// error reported when capacity of `out` is too small
+const errorNotEnoughSpace = 5
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// decode sequences from the stream into seqs; no history is needed at this stage.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeAsmContext{
+		llTable:   s.litLengths.fse.dt[:maxTablesize],
+		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:   s.offsets.fse.dt[:maxTablesize],
+		llState:   uint64(s.litLengths.state.state),
+		mlState:   uint64(s.matchLengths.state.state),
+		ofState:   uint64(s.offsets.state.state),
+		seqs:      seqs,
+		iteration: len(seqs) - 1,
+		litRemain: len(s.literals),
+	}
+
+	s.seqSize = 0
+	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
+		}
+	} else {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_amd64(s, br, &ctx)
+		}
+	}
+	if errCode != 0 {
+		i := len(seqs) - ctx.iteration - 1
+		switch errCode {
+		case errorMatchLenOfsMismatch:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+
+		case errorMatchLenTooBig:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+		case errorNotEnoughLiterals:
+			ll := ctx.seqs[i].ll
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+		}
+
+		return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
+	}
+
+	if ctx.litRemain < 0 {
+		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
+			len(s.literals), len(s.literals)-ctx.litRemain)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// --------------------------------------------------------------------------------
+
+type executeAsmContext struct {
+	seqs        []seqVals
+	seqIndex    int
+	out         []byte
+	history     []byte
+	literals    []byte
+	outPosition int
+	litPosition int
+	windowSize  int
+}
+
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
+//
+// Returns false if a match offset is too big.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+
+// executeSimple handles cases when dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
+		addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	ctx := executeAsmContext{
+		seqs:        seqs,
+		seqIndex:    0,
+		out:         out,
+		history:     hist,
+		outPosition: t,
+		litPosition: 0,
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+	}
+
+	ok := sequenceDecs_executeSimple_amd64(&ctx)
+	if !ok {
+		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
+	}
+	s.literals = s.literals[ctx.litPosition:]
+	t = ctx.outPosition
+
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
new file mode 100644
index 00000000000..2585b2e988d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -0,0 +1,3517 @@
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_end
+
+sequenceDecs_decode_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 16(R10)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 8(R10)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_2_end
+
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_2_end:
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, DI
+
+sequenceDecs_decode_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R8
+
+sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R9
+
+sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_amd64_skip_update:
+	// Adjust offset
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_amd64_adjust_end
+
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_amd64_adjust_end
+
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_amd64_adjust_three
+	JMP  sequenceDecs_decode_amd64_adjust_two
+
+sequenceDecs_decode_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_amd64_adjust_end:
+	MOVQ CX, 16(R10)
+
+	// Check values
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV. Decodes a run of sequences (offset, match length, literal length) from the bit stream into 24-byte records, refilling the bit buffer once per sequence; returns 0 on success or a non-zero error code.
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 // the 8-byte frame slot (SP) carries the input read position across iterations
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX // DX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(AX), BX // BX = number of bits of DX already consumed (byte at br+40)
+	MOVQ    24(AX), SI // SI = count of input bytes not yet loaded (br+24)
+	MOVQ    (AX), AX // AX = base pointer of the input
+	ADDQ    SI, AX // AX = base + unread count; the refill code below walks this pointer downwards
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI // DI = literal-length state entry
+	MOVQ    80(AX), R8 // R8 = match-length state entry
+	MOVQ    88(AX), R9 // R9 = offset state entry
+	MOVQ    104(AX), R10 // R10 = cursor into the output records (24 bytes each)
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11 // R11, R12, R13 = history of the three most recent offsets, newest first
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_56_amd64_main_loop:
+	MOVQ (SP), R14 // R14 = input read position
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte // fewer than 8 unread bytes: refill one byte at a time
+	MOVQ BX, AX
+	SHRQ $0x03, AX // AX = whole bytes consumed so far
+	SUBQ AX, R14 // move the read position back by that many bytes
+	MOVQ (R14), DX // reload a full 8-byte window
+	SUBQ AX, SI
+	ANDQ $0x07, BX // only the sub-byte bit position stays consumed
+	JMP  sequenceDecs_decode_56_amd64_fill_end
+
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_56_amd64_fill_end // no input left
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_56_amd64_fill_end // less than one whole byte consumed: nothing to refill
+	SHLQ    $0x08, DX // make room for one new byte at the low end
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_56_amd64_fill_end:
+	// Update offset: value = baseline (bits 32-63 of the state entry) + extra bits taken from the stream
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15 // drop the bits that were already consumed
+	MOVB    AH, CL // CL = number of extra bits (byte 1 of the state entry)
+	ADDQ    CX, BX // count them as consumed
+	NEGL    CX
+	SHRQ    CL, R15 // the shift count is taken modulo 64, so this keeps the top CL bits
+	SHRQ    $0x20, AX // AX = baseline
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15 // zero extra bits: the shift by 0 left R15 unchanged, so force the extra value to 0
+	ADDQ    R15, AX
+	MOVQ    AX, 16(R10) // record+16 = raw offset (adjusted further below)
+
+	// Update match length: same extraction, using the match-length state entry
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 8(R10) // record+8 = match length
+
+	// Update literal length: same extraction, using the literal-length state entry (no refill in between in this variant)
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, (R10) // record+0 = literal length
+
+	// Save the read position and remember the extra-bit count of the offset code before the states change
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX // AX = extra-bit count of the offset code, consulted by the offset adjustment below
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00 // the counter at ctx+96 is zero on the final sequence
+	JZ      sequenceDecs_decode_56_amd64_skip_update // final sequence: the states are not advanced
+
+	// Update Literal Length State: next state = base (bits 16-31 of the entry) + as many stream bits as the low byte says
+	MOVBQZX DI, R14 // R14 = number of bits to read
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI // DI = base of the next state
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_llState_updateState_skip_zero // nothing to read: the base is the next state
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15 // keep the top R14 bits
+	ADDQ    R15, DI
+
+sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI // fetch the 8-byte entry of the new state
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R8
+
+sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R9
+
+sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_56_amd64_skip_update:
+	// Adjust offset: turn the raw value into the real match offset and maintain the offset history
+	MOVQ 16(R10), CX // CX = raw offset decoded above
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 // codes with at most one extra bit select from the history
+	MOVQ R12, R13 // ordinary offset: push it onto the front of the history
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_56_amd64_adjust_end
+
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000 // is the literal length zero?
+	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
+	INCQ CX // with no literals the history index is shifted by one
+	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX // index 0: reuse the newest offset and leave the history untouched
+	JMP   sequenceDecs_decode_56_amd64_adjust_end
+
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01 // dispatch on the index: 0, 1, 2, or anything above 2
+	JB   sequenceDecs_decode_56_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_56_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_amd64_adjust_three
+	JMP  sequenceDecs_decode_56_amd64_adjust_two
+
+sequenceDecs_decode_56_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_three:
+	LEAQ -1(R11), AX // candidate = newest offset minus one
+
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX // a zero candidate is replaced by 1
+
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13 // unless index 1 was picked, the second entry moves down to third
+	MOVQ    R11, R12
+	MOVQ    AX, R11 // the chosen offset becomes the newest
+	MOVQ    AX, CX
+
+sequenceDecs_decode_56_amd64_adjust_end:
+	MOVQ CX, 16(R10) // record+16 = final match offset
+
+	// Check values
+	MOVQ  8(R10), AX // AX = match length
+	MOVQ  (R10), R14 // R14 = literal length
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP // NOTE(review): BP is used as scratch here; presumably safe because the non-zero frame makes the assembler save and restore it - confirm
+	ADDQ  R15, 256(BP) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15) // deduct the literals from the remaining count at ctx+128
+	JS    error_not_enough_literals // the count went negative
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big // match length above 0x20002 (131074)
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // zero offset together with a non-zero match length
+
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10 // advance to the next 24-byte record
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_56_amd64_main_loop // keep going until the counter at ctx+96 drops below zero
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX) // write the offset history back to s
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX) // write the bit-reader state back to br
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (the error returns below do not write the registers back to br or to the offset history in s)
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (carries no label, so nothing in this function branches here)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (carries no label, so nothing in this function branches here)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV. Decodes a run of sequences (offset, match length, literal length) from the bit stream into 24-byte records using BEXTR/BZHI bit extraction, refilling the bit buffer twice per sequence; returns 0 on success or a non-zero error code.
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 // the 8-byte frame slot (SP) carries the input read position across iterations
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX // AX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(CX), DX // DX = number of bits of AX already consumed (byte at br+40)
+	MOVQ    24(CX), BX // BX = count of input bytes not yet loaded (br+24)
+	MOVQ    (CX), CX // CX = base pointer of the input
+	ADDQ    BX, CX // CX = base + unread count; the refill code below walks this pointer downwards
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI // SI = literal-length state entry
+	MOVQ    80(CX), DI // DI = match-length state entry
+	MOVQ    88(CX), R8 // R8 = offset state entry
+	MOVQ    104(CX), R9 // R9 = cursor into the output records (24 bytes each)
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10 // R10, R11, R12 = history of the three most recent offsets, newest first
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_bmi2_main_loop:
+	MOVQ (SP), R13 // R13 = input read position
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte // fewer than 8 unread bytes: refill one byte at a time
+	MOVQ DX, CX
+	SHRQ $0x03, CX // CX = whole bytes consumed so far
+	SUBQ CX, R13 // move the read position back by that many bytes
+	MOVQ (R13), AX // reload a full 8-byte window
+	SUBQ CX, BX
+	ANDQ $0x07, DX // only the sub-byte bit position stays consumed
+	JMP  sequenceDecs_decode_bmi2_fill_end
+
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_end // no input left
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_end // less than one whole byte consumed: nothing to refill
+	SHLQ    $0x08, AX // make room for one new byte at the low end
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_end:
+	// Update offset: value = baseline (bits 32-63 of the state entry) + extra bits taken from the stream
+	MOVQ   $0x00000808, CX // BEXTR control word: start at bit 8, length 8
+	BEXTRQ CX, R8, R14 // R14 = number of extra bits (byte 1 of the state entry)
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX // CX = consumed-bit count after this read
+	ROLQ   CL, R15 // rotate so the wanted bits land in the low R14 bits
+	BZHIQ  R14, R15, R15 // clear everything from bit R14 upwards; gives 0 when R14 is 0
+	MOVQ   CX, DX // commit the new consumed-bit count
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX // CX = baseline
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9) // record+16 = raw offset (adjusted further below)
+
+	// Update match length: same extraction, using the match-length state entry
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9) // record+8 = match length
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_2_end
+
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_2_end:
+	// Update literal length: same extraction, using the literal-length state entry
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9) // record+0 = literal length
+
+	// Save the read position and remember the extra-bit count of the offset code before the states change
+	MOVQ   R13, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13 // R13 = extra-bit count of the offset code, consulted by the offset adjustment below
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00 // the counter at ctx+96 is zero on the final sequence
+	JZ     sequenceDecs_decode_bmi2_skip_update // final sequence: the states are not advanced
+
+	// Update Literal Length State: next state = base (bits 16-31 of the entry) + as many stream bits as the low byte says
+	MOVBQZX SI, R14 // R14 = number of bits to read
+	MOVQ    $0x00001010, CX // BEXTR control word: start at bit 16, length 16
+	BEXTRQ  CX, SI, SI // SI = base of the next state
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15 // R15 = the R14 bits just read (0 when R14 is 0)
+	ADDQ    R15, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI // fetch the 8-byte entry of the new state
+
+	// Update Match Length State
+	MOVBQZX DI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decode_bmi2_skip_update:
+	// Adjust offset: turn the raw value into the real match offset and maintain the offset history
+	MOVQ 16(R9), CX // CX = raw offset decoded above
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 // codes with at most one extra bit select from the history
+	MOVQ R11, R12 // ordinary offset: push it onto the front of the history
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_bmi2_adjust_end
+
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000 // is the literal length zero?
+	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
+	INCQ CX // with no literals the history index is shifted by one
+	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX // index 0: reuse the newest offset and leave the history untouched
+	JMP   sequenceDecs_decode_bmi2_adjust_end
+
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01 // dispatch on the index: 0, 1, 2, or anything above 2
+	JB   sequenceDecs_decode_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_bmi2_adjust_three
+	JMP  sequenceDecs_decode_bmi2_adjust_two
+
+sequenceDecs_decode_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_three:
+	LEAQ -1(R10), R13 // candidate = newest offset minus one
+
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13 // a zero candidate is replaced by 1
+
+sequenceDecs_decode_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12 // unless index 1 was picked, the second entry moves down to third
+	MOVQ    R10, R11
+	MOVQ    R13, R10 // the chosen offset becomes the newest
+	MOVQ    R13, CX
+
+sequenceDecs_decode_bmi2_adjust_end:
+	MOVQ CX, 16(R9) // record+16 = final match offset
+
+	// Check values
+	MOVQ  8(R9), R13 // R13 = match length
+	MOVQ  (R9), R14 // R14 = literal length
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP // NOTE(review): BP is used as scratch here; presumably safe because the non-zero frame makes the assembler save and restore it - confirm
+	ADDQ  R15, 256(BP) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15) // deduct the literals from the remaining count at ctx+128
+	JS    error_not_enough_literals // the count went negative
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_bmi2_error_match_len_too_big // match length above 0x20002 (131074)
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // zero offset together with a non-zero match length
+
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9 // advance to the next 24-byte record
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_bmi2_main_loop // keep going until the counter at ctx+96 drops below zero
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX) // write the offset history back to s
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX) // write the bit-reader state back to br
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (the error returns below do not write the registers back to br or to the offset history in s)
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (carries no label, so nothing in this function branches here)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (carries no label, so nothing in this function branches here)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV. Decodes a run of sequences (offset, match length, literal length) from the bit stream into 24-byte records using BEXTR/BZHI bit extraction, refilling the bit buffer once per sequence; returns 0 on success or a non-zero error code.
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 // the 8-byte frame slot (SP) carries the input read position across iterations
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX // AX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(CX), DX // DX = number of bits of AX already consumed (byte at br+40)
+	MOVQ    24(CX), BX // BX = count of input bytes not yet loaded (br+24)
+	MOVQ    (CX), CX // CX = base pointer of the input
+	ADDQ    BX, CX // CX = base + unread count; the refill code below walks this pointer downwards
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI // SI = literal-length state entry
+	MOVQ    80(CX), DI // DI = match-length state entry
+	MOVQ    88(CX), R8 // R8 = offset state entry
+	MOVQ    104(CX), R9 // R9 = cursor into the output records (24 bytes each)
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10 // R10, R11, R12 = history of the three most recent offsets, newest first
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_56_bmi2_main_loop:
+	MOVQ (SP), R13 // R13 = input read position
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte // fewer than 8 unread bytes: refill one byte at a time
+	MOVQ DX, CX
+	SHRQ $0x03, CX // CX = whole bytes consumed so far
+	SUBQ CX, R13 // move the read position back by that many bytes
+	MOVQ (R13), AX // reload a full 8-byte window
+	SUBQ CX, BX
+	ANDQ $0x07, DX // only the sub-byte bit position stays consumed
+	JMP  sequenceDecs_decode_56_bmi2_fill_end
+
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_56_bmi2_fill_end // no input left
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_56_bmi2_fill_end // less than one whole byte consumed: nothing to refill
+	SHLQ    $0x08, AX // make room for one new byte at the low end
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_56_bmi2_fill_end:
+	// Update offset: value = baseline (bits 32-63 of the state entry) + extra bits taken from the stream
+	MOVQ   $0x00000808, CX // BEXTR control word: start at bit 8, length 8
+	BEXTRQ CX, R8, R14 // R14 = number of extra bits (byte 1 of the state entry)
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX // CX = consumed-bit count after this read
+	ROLQ   CL, R15 // rotate so the wanted bits land in the low R14 bits
+	BZHIQ  R14, R15, R15 // clear everything from bit R14 upwards; gives 0 when R14 is 0
+	MOVQ   CX, DX // commit the new consumed-bit count
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX // CX = baseline
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9) // record+16 = raw offset (adjusted further below)
+
+	// Update match length: same extraction, using the match-length state entry
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9) // record+8 = match length
+
+	// Update literal length: same extraction, using the literal-length state entry (no refill in between in this variant)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9) // record+0 = literal length
+
+	// Save the read position and remember the extra-bit count of the offset code before the states change
+	MOVQ   R13, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13 // R13 = extra-bit count of the offset code, consulted by the offset adjustment below
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00 // the counter at ctx+96 is zero on the final sequence
+	JZ     sequenceDecs_decode_56_bmi2_skip_update // final sequence: the states are not advanced
+
+	// Update Literal Length State: next state = base (bits 16-31 of the entry) + as many stream bits as the low byte says
+	MOVBQZX SI, R14 // R14 = number of bits to read
+	MOVQ    $0x00001010, CX // BEXTR control word: start at bit 16, length 16
+	BEXTRQ  CX, SI, SI // SI = base of the next state
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15 // R15 = the R14 bits just read (0 when R14 is 0)
+	ADDQ    R15, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI // fetch the 8-byte entry of the new state
+
+	// Update Match Length State
+	MOVBQZX DI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decode_56_bmi2_skip_update:
+	// Adjust offset: turn the raw value into the real match offset and maintain the offset history
+	MOVQ 16(R9), CX // CX = raw offset decoded above
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // codes with at most one extra bit select from the history
+	MOVQ R11, R12 // ordinary offset: push it onto the front of the history
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_56_bmi2_adjust_end
+
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000 // is the literal length zero?
+	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
+	INCQ CX // with no literals the history index is shifted by one
+	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX // index 0: reuse the newest offset and leave the history untouched
+	JMP   sequenceDecs_decode_56_bmi2_adjust_end
+
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01 // dispatch on the index: 0, 1, 2, or anything above 2
+	JB   sequenceDecs_decode_56_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_bmi2_adjust_three
+	JMP  sequenceDecs_decode_56_bmi2_adjust_two
+
+sequenceDecs_decode_56_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_three:
+	LEAQ -1(R10), R13 // candidate = newest offset minus one
+
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13 // a zero candidate is replaced by 1
+
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12 // unless index 1 was picked, the second entry moves down to third
+	MOVQ    R10, R11
+	MOVQ    R13, R10 // the chosen offset becomes the newest
+	MOVQ    R13, CX
+
+sequenceDecs_decode_56_bmi2_adjust_end:
+	MOVQ CX, 16(R9) // record+16 = final match offset
+
+	// Check values
+	MOVQ  8(R9), R13 // R13 = match length
+	MOVQ  (R9), R14 // R14 = literal length
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP // NOTE(review): BP is used as scratch here; presumably safe because the non-zero frame makes the assembler save and restore it - confirm
+	ADDQ  R15, 256(BP) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15) // deduct the literals from the remaining count at ctx+128
+	JS    error_not_enough_literals // the count went negative
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big // match length above 0x20002 (131074)
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // zero offset together with a non-zero match length
+
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9 // advance to the next 24-byte record
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_56_bmi2_main_loop // keep going until the counter at ctx+96 drops below zero
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX) // write the offset history back to s
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX) // write the bit-reader state back to br
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (the error returns below do not write the registers back to br or to the offset history in s)
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (carries no label, so nothing in this function branches here)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (carries no label, so nothing in this function branches here)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Requires: SSE. Executes already-decoded sequences: for each 24-byte record it copies the literals and then the match (from history and/or the current output) into the output buffer; returns false when a match offset is out of range, true otherwise.
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
+	MOVQ  ctx+0(FP), R10
+	MOVQ  8(R10), CX // CX = number of sequences
+	TESTQ CX, CX
+	JZ    empty_seqs // nothing to do
+	MOVQ  (R10), AX // AX = base of the sequence records
+	MOVQ  24(R10), DX // DX = index of the next sequence to execute
+	MOVQ  32(R10), BX // BX = output base pointer
+	MOVQ  80(R10), SI // SI = literals pointer
+	MOVQ  104(R10), DI // DI = output position
+	MOVQ  120(R10), R8 // R8 = window size (upper bound for match offsets)
+	MOVQ  56(R10), R9 // R9 = history base pointer
+	MOVQ  64(R10), R10 // R10 = history length (the ctx pointer is no longer needed in a register)
+	ADDQ  R10, R9 // R9 = pointer just past the end of the history
+
+	// seqsBase += 24 * seqIndex
+	LEAQ (DX)(DX*2), R11
+	SHLQ $0x03, R11
+	ADDQ R11, AX
+
+	// outBase += outPosition
+	ADDQ DI, BX
+
+main_loop:
+	MOVQ (AX), R11 // R11 = literal length
+	MOVQ 16(AX), R12 // R12 = match offset
+	MOVQ 8(AX), R13 // R13 = match length
+
+	// Copy literals: the low four bits of the length select 1, 2, 4 and 8 byte moves, the rest goes in 16-byte chunks, so exactly R11 bytes are copied
+	TESTQ R11, R11
+	JZ    check_offset
+	XORQ  R14, R14 // R14 = bytes copied so far
+	TESTQ $0x00000001, R11
+	JZ    copy_1_word
+	MOVB  (SI)(R14*1), R15
+	MOVB  R15, (BX)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, R11
+	JZ    copy_1_dword
+	MOVW  (SI)(R14*1), R15
+	MOVW  R15, (BX)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, R11
+	JZ    copy_1_qword
+	MOVL  (SI)(R14*1), R15
+	MOVL  R15, (BX)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, R11
+	JZ    copy_1_test
+	MOVQ  (SI)(R14*1), R15
+	MOVQ  R15, (BX)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (SI)(R14*1), X0
+	MOVUPS X0, (BX)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, R11
+	JB   copy_1
+	ADDQ R11, SI // advance literals pointer, output pointer and output position
+	ADDQ R11, BX
+	ADDQ R11, DI
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	LEAQ (DI)(R10*1), R11 // R11 = output position + history length
+	CMPQ R12, R11
+	JG   error_match_off_too_big
+	CMPQ R12, R8
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  R12, R11
+	SUBQ  DI, R11 // R11 = how far the offset reaches back beyond the start of the current output
+	JLS   copy_match // offset <= output position: the match lies entirely in the current output
+	MOVQ  R9, R14
+	SUBQ  R11, R14 // R14 = source pointer inside the history
+	CMPQ  R13, R11
+	JGE   copy_all_from_history // match is at least as long as the history tail: drain it, then continue in the current output
+	XORQ  R11, R11 // whole match comes from history; R11 now counts bytes copied
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(R11*1), R12
+	MOVB  R12, (BX)(R11*1)
+	ADDQ  $0x01, R11
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(R11*1), R12
+	MOVW  R12, (BX)(R11*1)
+	ADDQ  $0x02, R11
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(R11*1), R12
+	MOVL  R12, (BX)(R11*1)
+	ADDQ  $0x04, R11
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(R11*1), R12
+	MOVQ  R12, (BX)(R11*1)
+	ADDQ  $0x08, R11
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(R11*1), X0
+	MOVUPS X0, (BX)(R11*1)
+	ADDQ   $0x10, R11
+
+copy_4_test:
+	CMPQ R11, R13
+	JB   copy_4
+	ADDQ R13, DI
+	ADDQ R13, BX
+	ADDQ $0x18, AX // inline copy of the loop step: next record, next index
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15 // R15 = bytes copied so far; R11 bytes are taken from the history tail
+	TESTQ $0x00000001, R11
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP // NOTE(review): BP is used as the scratch register in this copy; presumably safe because the non-zero frame makes the assembler save and restore it - confirm
+	MOVB  BP, (BX)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, R11
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (BX)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, R11
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (BX)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, R11
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (BX)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (BX)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, R11
+	JB   copy_5
+	ADDQ R11, BX
+	ADDQ R11, DI
+	SUBQ R11, R13 // R13 = match bytes still to be copied from the current output
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  BX, R11
+	SUBQ  R12, R11 // R11 = source pointer: output pointer minus match offset
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match in 16-byte units; up to 15 bytes beyond the match length may be read and written. NOTE(review): presumably the caller guarantees that slack - confirm
+	ADDQ R13, DI
+	MOVQ BX, R12 // R12 = destination cursor; BX already moves on to the end of the match
+	ADDQ R13, BX
+
+copy_2:
+	MOVUPS (R11), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2 // continue while more than zero bytes remain
+	JMP    handle_loop
+
+	// Copy overlapping match byte by byte so that bytes written earlier in this copy can be read again
+copy_overlapping_match:
+	ADDQ R13, DI
+
+copy_slow_3:
+	MOVB (R11), R12
+	MOVB R12, (BX)
+	INCQ R11
+	INCQ BX
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	ADDQ $0x18, AX // next 24-byte record
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+
+loop_finished:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX) // sequence index reached
+	MOVQ DI, 104(AX) // output position reached
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX) // number of literal bytes consumed (current literals pointer minus its start)
+	RET
+
+error_match_off_too_big:
+	// Return value
+	MOVB $0x00, ret+8(FP)
+
+	// Update the context, so the caller can see which sequence failed and how far the output got
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+empty_seqs:
+	// Return value (the context is left untouched when there are no sequences)
+	MOVB $0x01, ret+8(FP)
+	RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE. Decodes each sequence from the bit stream and executes it immediately (copying its literals and its match into the output); returns 0 on success or an error code 1-5, storing the offending values in ctx on error.
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 // frame slots: 0 read position, 8 offset, 16 match length, 24 literal length, 32 end-of-output pointer, 40 history length, 48 end-of-history pointer, 56 window size
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX // DX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(AX), BX // BX = number of bits of DX already consumed (byte at br+40)
+	MOVQ    24(AX), SI // SI = count of input bytes not yet loaded (br+24)
+	MOVQ    (AX), AX // AX = base pointer of the input
+	ADDQ    SI, AX // AX = base + unread count; the refill code below walks this pointer downwards
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI // DI = literal-length state entry
+	MOVQ    80(AX), R8 // R8 = match-length state entry
+	MOVQ    88(AX), R9 // R9 = offset state entry
+	MOVQ    112(AX), R10 // R10 = output base pointer
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP) // output capacity for now; turned into a pointer below
+	MOVQ    144(AX), R11 // R11 = literals pointer
+	MOVQ    136(AX), R12 // R12 = output position
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP) // window size
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP) // history base pointer for now
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP) // history length
+	MOVQ    40(SP), AX // reloads the value just stored
+	ADDQ    AX, 48(SP) // 48(SP) = pointer just past the end of the history
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+	MOVQ (SP), R13 // R13 = input read position
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte // fewer than 8 unread bytes: refill one byte at a time
+	MOVQ BX, AX
+	SHRQ $0x03, AX // AX = whole bytes consumed so far
+	SUBQ AX, R13 // move the read position back by that many bytes
+	MOVQ (R13), DX // reload a full 8-byte window
+	SUBQ AX, SI
+	ANDQ $0x07, BX // only the sub-byte bit position stays consumed
+	JMP  sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_end // no input left
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_end // less than one whole byte consumed: nothing to refill
+	SHLQ    $0x08, DX // make room for one new byte at the low end
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+	// Update offset: value = baseline (bits 32-63 of the state entry) + extra bits taken from the stream
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14 // drop the bits that were already consumed
+	MOVB    AH, CL // CL = number of extra bits (byte 1 of the state entry)
+	ADDQ    CX, BX // count them as consumed
+	NEGL    CX
+	SHRQ    CL, R14 // the shift count is taken modulo 64, so this keeps the top CL bits
+	SHRQ    $0x20, AX // AX = baseline
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14 // zero extra bits: the shift by 0 left R14 unchanged, so force the extra value to 0
+	ADDQ    R14, AX
+	MOVQ    AX, 8(SP) // raw offset (adjusted further below)
+
+	// Update match length: same extraction, using the match-length state entry
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 16(SP) // match length
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+	// Update literal length: same extraction, using the literal-length state entry
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 24(SP) // literal length
+
+	// Save the read position and remember the extra-bit count of the offset code before the states change
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX // AX = extra-bit count of the offset code, consulted by the offset adjustment below
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00 // the counter at ctx+96 is zero on the final sequence
+	JZ      sequenceDecs_decodeSync_amd64_skip_update // final sequence: the states are not advanced
+
+	// Update Literal Length State: next state = base (bits 16-31 of the entry) + as many stream bits as the low byte says
+	MOVBQZX DI, R13 // R13 = number of bits to read
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI // DI = base of the next state
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero // nothing to read: the base is the next state
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14 // keep the top R13 bits
+	ADDQ    R14, DI
+
+sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI // fetch the 8-byte entry of the new state
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R8
+
+sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R9
+
+sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+	// Adjust offset: turn the raw value into the real match offset; here the three-entry offset history lives in memory at s+144, s+152, s+160
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13 // R13 = raw offset decoded above
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 // codes with at most one extra bit select from the history
+	MOVUPS 144(CX), X0 // ordinary offset: load the two newest entries as one 16-byte value
+	MOVQ   R13, 144(CX) // the new offset becomes the newest entry
+	MOVUPS X0, 152(CX) // and the former two newest shift down one slot
+	JMP    sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000 // is the literal length zero?
+	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+	INCQ R13 // with no literals the history index is shifted by one
+	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13 // index 0: reuse the newest offset and leave the history untouched
+	JMP   sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX // AX = history index to read
+	XORQ    R14, R14 // R14 = adjustment added to the entry (0 by default)
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX // index 3: read entry 0 instead
+	CMOVQEQ R15, R14 // index 3: and subtract one from it
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(AX*8), R14 // NOTE(review): the index addresses the history unchecked; presumably it is always 1..3 here - confirm
+	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14 // a zero candidate is replaced by 1
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_amd64_adjust_skip // index 1: the third entry keeps its value
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX) // the chosen offset becomes the newest
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_adjust_end:
+	MOVQ R13, 8(SP) // final match offset
+
+	// Check values
+	MOVQ  16(SP), AX // AX = match length
+	MOVQ  24(SP), CX // CX = literal length
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14) // deduct the literals from the remaining count at ctx+104
+	JS    error_not_enough_literals // the count went negative
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big // match length above 0x20002 (131074)
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch // zero offset together with a non-zero match length
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX // AX = literal length
+	MOVQ 8(SP), CX // CX = match offset
+	MOVQ 16(SP), R13 // R13 = match length
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14 // R14 = output pointer after this sequence
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals: the low four bits of the length select 1, 2, 4 and 8 byte moves, the rest goes in 16-byte chunks, so exactly AX bytes are copied
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14 // R14 = bytes copied so far
+	TESTQ $0x00000001, AX
+	JZ    copy_1_word
+	MOVB  (R11)(R14*1), R15
+	MOVB  R15, (R10)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_1_dword
+	MOVW  (R11)(R14*1), R15
+	MOVW  R15, (R10)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_1_qword
+	MOVL  (R11)(R14*1), R15
+	MOVL  R15, (R10)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_1_test
+	MOVQ  (R11)(R14*1), R15
+	MOVQ  R15, (R10)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB   copy_1
+	ADDQ AX, R11 // advance literals pointer, output pointer and output position
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX // AX = output position + history length
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  CX, AX
+	SUBQ  R12, AX // AX = how far the offset reaches back beyond the start of the current output
+	JLS   copy_match // offset <= output position: the match lies entirely in the current output
+	MOVQ  48(SP), R14
+	SUBQ  AX, R14 // R14 = source pointer inside the history
+	CMPQ  R13, AX
+	JGE   copy_all_from_history // match is at least as long as the history tail: drain it, then continue in the current output
+	XORQ  AX, AX // whole match comes from history; AX now counts bytes copied
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(AX*1), CL
+	MOVB  CL, (R10)(AX*1)
+	ADDQ  $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(AX*1), CX
+	MOVW  CX, (R10)(AX*1)
+	ADDQ  $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(AX*1), CX
+	MOVL  CX, (R10)(AX*1)
+	ADDQ  $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(AX*1), CX
+	MOVQ  CX, (R10)(AX*1)
+	ADDQ  $0x08, AX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ   $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB   copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP  handle_loop
+	JMP loop_finished // unreachable: it follows an unconditional JMP and carries no label
+
+copy_all_from_history:
+	XORQ  R15, R15 // R15 = bytes copied so far; AX bytes are taken from the history tail
+	TESTQ $0x00000001, AX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP // NOTE(review): BP is used as the scratch register in this copy; presumably safe because the non-zero frame makes the assembler save and restore it - confirm
+	MOVB  BP, (R10)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R10)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R10)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R10)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB   copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13 // R13 = match bytes still to be copied from the current output
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R10, AX
+	SUBQ  CX, AX // AX = source pointer: output pointer minus match offset
+
+	// ml <= mo
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match in 16-byte units; up to 15 bytes beyond the match length may be read and written. NOTE(review): the space check above covers only literal length + match length, so presumably the caller provides the slack - confirm
+	ADDQ R13, R12
+	MOVQ R10, CX // CX = destination cursor; R10 already moves on to the end of the match
+	ADDQ R13, R10
+
+copy_2:
+	MOVUPS (AX), X0
+	MOVUPS X0, (CX)
+	ADDQ   $0x10, AX
+	ADDQ   $0x10, CX
+	SUBQ   $0x10, R13
+	JHI    copy_2 // continue while more than zero bytes remain
+	JMP    handle_loop
+
+	// Copy overlapping match byte by byte so that bytes written earlier in this copy can be read again
+copy_overlapping_match:
+	ADDQ R13, R12
+
+copy_slow_3:
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_amd64_main_loop // keep going until the counter at ctx+96 drops below zero
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX) // write the bit-reader state back to br
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX) // output position reached
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX) // number of literal bytes consumed (current literals pointer minus its start)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (ctx+216 receives the match length; none of the error returns writes the bit-reader registers back to br)
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error (ctx+216 receives the match length)
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (ctx+224 receives the offset, ctx+136 the output position)
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error (ctx+208 receives the literal length)
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (ctx+208 and ctx+216 receive the literal and match lengths, ctx+136 the output position)
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the literal length and the state updates
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Save the read pointer back to (SP), then extract bits 8-15 of the offset state into R12 (compared against 1 under "Adjust offset")
+	MOVQ   R12, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R12
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decodeSync_bmi2_skip_update // counter at 96(ctx) is zero: last iteration (handle_loop decrements it and exits once it goes negative)
+
+	// Update Literal Length State
+	MOVBQZX SI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, CX
+	JZ    copy_1_word
+	MOVB  (R10)(R14*1), R15
+	MOVB  R15, (R9)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_1_dword
+	MOVW  (R10)(R14*1), R15
+	MOVW  R15, (R9)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_1_qword
+	MOVL  (R10)(R14*1), R15
+	MOVL  R15, (R9)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_1_test
+	MOVQ  (R10)(R14*1), R15
+	MOVQ  R15, (R9)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB   copy_1
+	ADDQ CX, R10
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history (only when the offset exceeds the current output position)
+	MOVQ  R12, CX
+	SUBQ  R11, CX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  CX, R14
+	CMPQ  R13, CX
+	JGE   copy_all_from_history
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(CX*1), R12
+	MOVB  R12, (R9)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(CX*1), R12
+	MOVW  R12, (R9)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(CX*1), R12
+	MOVL  R12, (R9)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(CX*1), R12
+	MOVQ  R12, (R9)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB   copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP  handle_loop
+	JMP loop_finished // unreachable: follows the unconditional JMP above
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, CX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R9)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R9)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R9)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R9)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB   copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R9, CX
+	SUBQ  R12, CX
+
+	// ml <= mo means source and destination do not overlap; ml > mo takes the byte-wise path
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match in 16-byte chunks (the last chunk may run past the match length)
+	ADDQ R13, R11
+	MOVQ R9, R12
+	ADDQ R13, R9
+
+copy_2:
+	MOVUPS (CX), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, CX
+	ADDQ   $0x10, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match one byte at a time
+copy_overlapping_match:
+	ADDQ R13, R11
+
+copy_slow_3:
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    112(AX), R10
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP)
+	MOVQ    144(AX), R11
+	MOVQ    136(AX), R12
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP)
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP)
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_safe_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 8(SP)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 16(SP)
+
+	// Fill bitreader to have enough for the literal length and the state updates
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 24(SP)
+
+	// Save the read pointer back to (SP), then extract bits 8-15 of the offset state into AX (compared against 1 under "Adjust offset")
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update // counter at 96(ctx) is zero: last iteration (handle_loop decrements it and exits once it goes negative)
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, DI
+
+sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R8
+
+sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R9
+
+sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_safe_amd64_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_amd64_adjust_end
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_amd64_adjust_end
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(AX*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_amd64_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), AX
+	MOVQ  24(SP), CX
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, AX
+	JZ    copy_1_word
+	MOVB  (R11)(R14*1), R15
+	MOVB  R15, (R10)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_1_dword
+	MOVW  (R11)(R14*1), R15
+	MOVW  R15, (R10)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_1_qword
+	MOVL  (R11)(R14*1), R15
+	MOVL  R15, (R10)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_1_test
+	MOVQ  (R11)(R14*1), R15
+	MOVQ  R15, (R10)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB   copy_1
+	ADDQ AX, R11
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history (only when the offset exceeds the current output position)
+	MOVQ  CX, AX
+	SUBQ  R12, AX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  AX, R14
+	CMPQ  R13, AX
+	JGE   copy_all_from_history
+	XORQ  AX, AX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(AX*1), CL
+	MOVB  CL, (R10)(AX*1)
+	ADDQ  $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(AX*1), CX
+	MOVW  CX, (R10)(AX*1)
+	ADDQ  $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(AX*1), CX
+	MOVL  CX, (R10)(AX*1)
+	ADDQ  $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(AX*1), CX
+	MOVQ  CX, (R10)(AX*1)
+	ADDQ  $0x08, AX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ   $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB   copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP  handle_loop
+	JMP loop_finished // unreachable: follows the unconditional JMP above
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, AX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R10)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R10)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R10)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R10)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB   copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R10, AX
+	SUBQ  CX, AX
+
+	// ml <= mo means source and destination do not overlap; ml > mo takes the byte-wise path
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match: 1/2/4/8-byte pieces for the low four bits of the length, then 16-byte chunks
+	ADDQ  R13, R12
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_2_word
+	MOVB  (AX)(CX*1), R14
+	MOVB  R14, (R10)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_2_dword
+	MOVW  (AX)(CX*1), R14
+	MOVW  R14, (R10)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_2_qword
+	MOVL  (AX)(CX*1), R14
+	MOVL  R14, (R10)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_2_test
+	MOVQ  (AX)(CX*1), R14
+	MOVQ  R14, (R10)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_2_test
+
+copy_2:
+	MOVUPS (AX)(CX*1), X0
+	MOVUPS X0, (R10)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_2_test:
+	CMPQ CX, R13
+	JB   copy_2
+	ADDQ R13, R10
+	JMP  handle_loop
+
+	// Copy overlapping match one byte at a time
+copy_overlapping_match:
+	ADDQ R13, R12
+
+copy_slow_3:
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the literal length and the state updates
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Save the read pointer back to (SP), then extract bits 8-15 of the offset state into R12 (compared against 1 under "Adjust offset")
+	MOVQ   R12, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R12
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decodeSync_safe_bmi2_skip_update // counter at 96(ctx) is zero: last iteration (handle_loop decrements it and exits once it goes negative)
+
+	// Update Literal Length State
+	MOVBQZX SI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_bmi2_adjust_end
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_bmi2_adjust_end
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, CX
+	JZ    copy_1_word
+	MOVB  (R10)(R14*1), R15
+	MOVB  R15, (R9)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_1_dword
+	MOVW  (R10)(R14*1), R15
+	MOVW  R15, (R9)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_1_qword
+	MOVL  (R10)(R14*1), R15
+	MOVL  R15, (R9)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_1_test
+	MOVQ  (R10)(R14*1), R15
+	MOVQ  R15, (R9)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB   copy_1
+	ADDQ CX, R10
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history (only when the offset exceeds the current output position)
+	MOVQ  R12, CX
+	SUBQ  R11, CX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  CX, R14
+	CMPQ  R13, CX
+	JGE   copy_all_from_history
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(CX*1), R12
+	MOVB  R12, (R9)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(CX*1), R12
+	MOVW  R12, (R9)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(CX*1), R12
+	MOVL  R12, (R9)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(CX*1), R12
+	MOVQ  R12, (R9)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB   copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP  handle_loop
+	JMP loop_finished // unreachable: follows the unconditional JMP above
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, CX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R9)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R9)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R9)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R9)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB   copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R9, CX
+	SUBQ  R12, CX
+
+	// ml <= mo means source and destination do not overlap; ml > mo takes the byte-wise path
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match: 1/2/4/8-byte pieces for the low four bits of the length, then 16-byte chunks
+	ADDQ  R13, R11
+	XORQ  R12, R12
+	TESTQ $0x00000001, R13
+	JZ    copy_2_word
+	MOVB  (CX)(R12*1), R14
+	MOVB  R14, (R9)(R12*1)
+	ADDQ  $0x01, R12
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_2_dword
+	MOVW  (CX)(R12*1), R14
+	MOVW  R14, (R9)(R12*1)
+	ADDQ  $0x02, R12
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_2_qword
+	MOVL  (CX)(R12*1), R14
+	MOVL  R14, (R9)(R12*1)
+	ADDQ  $0x04, R12
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_2_test
+	MOVQ  (CX)(R12*1), R14
+	MOVQ  R14, (R9)(R12*1)
+	ADDQ  $0x08, R12
+	JMP   copy_2_test
+
+copy_2:
+	MOVUPS (CX)(R12*1), X0
+	MOVUPS X0, (R9)(R12*1)
+	ADDQ   $0x10, R12
+
+copy_2_test:
+	CMPQ R12, R13
+	JB   copy_2
+	ADDQ R13, R9
+	JMP  handle_loop
+
+	// Copy overlapping match one byte at a time
+copy_overlapping_match:
+	ADDQ R13, R11
+
+copy_slow_3:
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
new file mode 100644
index 00000000000..c3452bc3a9e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -0,0 +1,237 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+	"fmt"
+	"io"
+)
+
+// decodeSyncSimple is a stub in this non-assembly build (see the build tags above): it ignores hist, decodes nothing and always returns (false, nil).
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	return false, nil // NOTE(review): false presumably tells the caller to run its own decode loop — confirm
+}
+
+// decode reads len(seqs) sequences (literal length, match offset, match length) from s.br into seqs, validates them and accumulates the output size in s.seqSize; it copies no data.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	// Grab full-size tables, to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	s.seqSize = 0
+	litRemain := len(s.literals)
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+	for i := range seqs {
+		var ll, mo, ml int
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("WARNING: temp was 0")
+						temp = 1
+					}
+
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
+			br.fillFast()
+		} else {
+			if br.overread() {
+				if debugDecoder {
+					printf("reading sequence %d, exceeded available data\n", i)
+				}
+				return io.ErrUnexpectedEOF
+			}
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
+			br.fill()
+		}
+
+		if debugSequences {
+			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+		}
+		// Evaluate.
+		// We might be doing this async, so do it early.
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+		}
+		s.seqSize += ll + ml
+		if s.seqSize > maxBlockSize {
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		}
+		litRemain -= ll
+		if litRemain < 0 {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+		}
+		seqs[i] = seqVals{
+			ll: ll,
+			ml: ml,
+			mo: mo,
+		}
+		if i == len(seqs)-1 {
+			// This is the last sequence, so we shouldn't update state.
+			break
+		}
+
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
+		} else {
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+	}
+	s.seqSize += litRemain // literals not consumed by any sequence also count towards the output size
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// executeSimple appends the output of seqs to s.out, taking literals from s.literals (which it consumes) and matches from hist or from the output written so far; no dictionary is consulted.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Malformed input
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history, counted from its end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into the current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+
+		// We must be in the current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+			} else {
+				// Overlapping copy
+				// src runs into dst, so copy one byte at a time: later source bytes are ones this loop has just written.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// dst still begins at the previous value of t.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index 967f29b3120..b53f606a18a 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -20,34 +20,49 @@ const ZipMethodPKWare = 20
 
 var zipReaderPool sync.Pool
 
-// newZipReader cannot be used since we would leak goroutines...
-func newZipReader(r io.Reader) io.ReadCloser {
-	dec, ok := zipReaderPool.Get().(*Decoder)
-	if ok {
-		dec.Reset(r)
-	} else {
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
-		if err != nil {
-			panic(err)
+// newZipReader creates a pooled zip decompressor; low-memory mode, a 128MB default window cap and concurrency 1 apply with or without caller options.
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	pool := &zipReaderPool
+	if len(opts) > 0 {
+		// Caller-supplied options: keep these decoders in a private pool, apart from the shared default one.
+		pool = &sync.Pool{}
+	}
+	opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
+	// Force concurrency 1; appended last so it comes after any caller-supplied value.
+	opts = append(opts, WithDecoderConcurrency(1))
+	return func(r io.Reader) io.ReadCloser {
+		dec, ok := pool.Get().(*Decoder)
+		if ok {
+			dec.Reset(r)
+		} else {
+			d, err := NewReader(r, opts...)
+			if err != nil {
+				panic(err)
+			}
+			dec = d
 		}
-		dec = d
+		return &pooledZipReader{dec: dec, pool: pool}
 	}
-	return &pooledZipReader{dec: dec}
 }
 
 type pooledZipReader struct {
-	mu  sync.Mutex // guards Close and Read
-	dec *Decoder
+	mu   sync.Mutex // guards Close and Read
+	pool *sync.Pool // pool that dec is put back into on EOF or Close
+	dec  *Decoder   // nil once it has been put back into pool
 }
 
 func (r *pooledZipReader) Read(p []byte) (n int, err error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	if r.dec == nil {
-		return 0, errors.New("Read after Close")
+		return 0, errors.New("read after close or EOF")
 	}
 	dec, err := r.dec.Read(p)
-
+	if err == io.EOF { // stream finished: give the decoder back now instead of waiting for Close
+		r.dec.Reset(nil) // the error from Reset is discarded here, unlike in Close
+		r.pool.Put(r.dec)
+		r.dec = nil // any later Read takes the r.dec == nil branch above
+	}
 	return dec, err
 }
 
@@ -57,7 +72,7 @@ func (r *pooledZipReader) Close() error {
 	var err error
 	if r.dec != nil {
 		err = r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return err
@@ -111,12 +126,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
 
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
 // See ZipCompressor for example.
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
-	return func(r io.Reader) io.ReadCloser {
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
-		if err != nil {
-			panic(err)
-		}
-		return d.IOReadCloser()
-	}
+// Options are forwarded to newZipReader, which forces WithDecoderConcurrency(1) and defaults to a
+// 128MB maximum decompression window; the window size can be overridden if required.
+// NOTE(review): confirm that these defaults are also applied when no options are passed.
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	return newZipReader(opts...)
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index ef1d49a009c..c1c90b4a072 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -39,6 +39,9 @@ const zstdMinMatch = 3
 // Reset the buffer offset when reaching this.
 const bufferReset = math.MaxInt32 - MaxWindowSize
 
+// fcsUnknown is the sentinel value (math.MaxUint64) used for an unknown frame content size.
+const fcsUnknown = math.MaxUint64
+
 var (
 	// ErrReservedBlockType is returned when a reserved block type is found.
 	// Typically this indicates wrong or corrupted input.
@@ -52,6 +55,10 @@ var (
 	// Typically returned on invalid input.
 	ErrBlockTooSmall = errors.New("block too small")
 
+	// ErrUnexpectedBlockSize is returned when a block has unexpected size.
+	// Typically returned on invalid input.
+	ErrUnexpectedBlockSize = errors.New("unexpected block size")
+
 	// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
 	// Typically this indicates wrong or corrupted input.
 	ErrMagicMismatch = errors.New("invalid input: magic number mismatch")
@@ -75,6 +82,10 @@ var (
 	// This is only returned if SingleSegment is specified on the frame.
 	ErrFrameSizeExceeded = errors.New("frame size exceeded")
 
+	// ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size.
+	// This is only returned if SingleSegment is specified on the frame.
+	ErrFrameSizeMismatch = errors.New("frame size does not match size on stream")
+
 	// ErrCRCMismatch is returned if CRC mismatches.
 	ErrCRCMismatch = errors.New("CRC check failed")
 
diff --git a/vendor/github.com/moby/sys/mountinfo/mounted_unix.go b/vendor/github.com/moby/sys/mountinfo/mounted_unix.go
index 45ddad236f3..242f82cc72a 100644
--- a/vendor/github.com/moby/sys/mountinfo/mounted_unix.go
+++ b/vendor/github.com/moby/sys/mountinfo/mounted_unix.go
@@ -4,7 +4,6 @@
 package mountinfo
 
 import (
-	"fmt"
 	"os"
 	"path/filepath"
 
@@ -33,13 +32,13 @@ func mountedByStat(path string) (bool, error) {
 
 func normalizePath(path string) (realPath string, err error) {
 	if realPath, err = filepath.Abs(path); err != nil {
-		return "", fmt.Errorf("unable to get absolute path for %q: %w", path, err)
+		return "", err // NOTE(review): errors are now returned unwrapped, presumably so callers can test the original error directly — confirm
 	}
 	if realPath, err = filepath.EvalSymlinks(realPath); err != nil {
-		return "", fmt.Errorf("failed to canonicalise path for %q: %w", path, err)
+		return "", err
 	}
 	if _, err := os.Stat(realPath); err != nil {
-		return "", fmt.Errorf("failed to stat target of %q: %w", path, err)
+		return "", err
 	}
 	return realPath, nil
 }
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 67f3f6101bd..aafc78e0b03 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -213,7 +213,7 @@ github.com/containerd/fifo
 # github.com/containerd/go-runc v1.0.0
 ## explicit; go 1.13
 github.com/containerd/go-runc
-# github.com/containerd/stargz-snapshotter/estargz v0.10.1
+# github.com/containerd/stargz-snapshotter/estargz v0.11.4
 ## explicit; go 1.16
 github.com/containerd/stargz-snapshotter/estargz
 github.com/containerd/stargz-snapshotter/estargz/errorutil
@@ -421,8 +421,8 @@ github.com/containers/psgo/internal/dev
 github.com/containers/psgo/internal/host
 github.com/containers/psgo/internal/proc
 github.com/containers/psgo/internal/process
-# github.com/containers/storage v1.37.0
-## explicit; go 1.14
+# github.com/containers/storage v1.41.0
+## explicit; go 1.16
 github.com/containers/storage
 github.com/containers/storage/drivers
 github.com/containers/storage/drivers/aufs
@@ -822,12 +822,13 @@ github.com/json-iterator/go
 # github.com/kevinburke/ssh_config v1.1.0
 ## explicit
 github.com/kevinburke/ssh_config
-# github.com/klauspost/compress v1.14.2
-## explicit; go 1.15
+# github.com/klauspost/compress v1.15.4
+## explicit; go 1.16
 github.com/klauspost/compress
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/fse
 github.com/klauspost/compress/huff0
+github.com/klauspost/compress/internal/cpuinfo
 github.com/klauspost/compress/internal/snapref
 github.com/klauspost/compress/zstd
 github.com/klauspost/compress/zstd/internal/xxhash
@@ -886,7 +887,7 @@ github.com/moby/spdystream/spdy
 # github.com/moby/sys/mount v0.2.0
 ## explicit; go 1.14
 github.com/moby/sys/mount
-# github.com/moby/sys/mountinfo v0.6.0
+# github.com/moby/sys/mountinfo v0.6.1
 ## explicit; go 1.16
 github.com/moby/sys/mountinfo
 # github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6