From 0a5dc67e9ddeb1618fecdaefd847c085f01de2d9 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Sun, 21 Sep 2025 11:13:53 +0200
Subject: [PATCH 1/5] [klauspost/deflate-improve-comp] compress/flate: improve compression speed

Fixes #75532

This improves the compression speed of the flate package. It is a
cleaned-up version of github.com/klauspost/compress/flate.

Overall changes:

* Compression levels 2-6 are custom implementations.
* Compression levels 7-9 are tweaked to match levels 2-6, with minor
  improvements.
* Tokens are encoded and indexed when added.
* Huffman encoding attempts to continue blocks instead of always
  starting a new one.
* Loads/stores are in separate functions and can be made to use unsafe.

Overall, this attempts to better balance the compression levels, which
previously had very little spread at the top. The intention is to place
"default" at the point where performance drops off considerably without
a proportional improvement in compression ratio. In my package I set 5
as the default, but this change keeps the default at level 6.

Results from the standard library's built-in benchmarks are below. I do
not think they are a particularly good representation of different data
types, so I have also run benchmarks on various other data types and
compiled the results at https://stdeflate.klauspost.com/

The main focus has been on level 1 (fastest), levels 5 and 6 (default),
and level 9 (smallest). Levels outside of these are rarely used, but
they should still fill their roles reasonably. Level 9 attempts more
aggressive compression and will typically also be slightly slower than
before. I hope the linked graphs show that focusing on a few data types
doesn't always give the full picture.

My own observations: Levels 1 and 2 often trade places depending on the
data type. Since level 1 usually compresses the least of the two - and
is mostly slightly faster, with lower memory usage - it is placed as the
lowest level. The switchover between levels 6 and 7 is not always
smooth, since the search method changes significantly.

Random data is now ~100x faster on levels 2-6 and ~3x faster on levels
7-9. Pre-compressed data can be fed in with no significant speed
penalty.

"Unsafe" operations have been removed for now; they can trivially be
added back. Leaving them out costs approximately 10% in speed.
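To illustrate the "tokens are encoded and indexed when added" point, the
sketch below shows the general idea, not the CL's actual token type:
histograms are maintained as tokens are appended, so estimating block
cost or generating Huffman tables needs no second pass over the tokens.
The type, field, and method names here are placeholders.

    // Sketch only: keep literal statistics up to date while tokens are added.
    package main

    import "fmt"

    type tokenAccumulator struct {
            tokens  []uint32    // queued tokens (encoding is illustrative)
            litHist [256]uint16 // literal histogram, updated on every AddLiteral
            n       int         // number of tokens in the current block
    }

    func (t *tokenAccumulator) AddLiteral(b byte) {
            t.tokens = append(t.tokens, uint32(b))
            t.litHist[b]++
            t.n++
    }

    // Reset clears the queued tokens and the histogram between blocks.
    func (t *tokenAccumulator) Reset() {
            t.tokens = t.tokens[:0]
            t.litHist = [256]uint16{}
            t.n = 0
    }

    func main() {
            var t tokenAccumulator
            for _, b := range []byte("hello") {
                    t.AddLiteral(b)
            }
            fmt.Println(t.n, t.litHist['l']) // 5 2
    }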
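The numbers below were produced with the package's built-in benchmarks.
A rough external reproduction, for anyone wanting to compare levels on
their own data, could look like the following sketch (placed in a
_test.go file); the helper name and input path are placeholders and not
part of this CL.

    package flate_test

    import (
            "compress/flate"
            "io"
            "os"
            "testing"
    )

    // benchLevel measures compression throughput for one level on data.
    func benchLevel(b *testing.B, level int, data []byte) {
            w, err := flate.NewWriter(io.Discard, level)
            if err != nil {
                    b.Fatal(err)
            }
            b.SetBytes(int64(len(data)))
            b.ResetTimer()
            for i := 0; i < b.N; i++ {
                    w.Reset(io.Discard)
                    if _, err := w.Write(data); err != nil {
                            b.Fatal(err)
                    }
                    if err := w.Close(); err != nil {
                            b.Fatal(err)
                    }
            }
    }

    func BenchmarkFlateDefault(b *testing.B) {
            data, err := os.ReadFile("../testdata/e.txt") // any representative input
            if err != nil {
                    b.Skip(err)
            }
            benchLevel(b, flate.DefaultCompression, data)
    }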
benchmark                                      old ns/op     new ns/op     delta
BenchmarkEncode/Digits/Huffman/1e4-32          11431         8001          -30.01%
BenchmarkEncode/Digits/Huffman/1e5-32          123175        74780         -39.29%
BenchmarkEncode/Digits/Huffman/1e6-32          1260402       750022        -40.49%
BenchmarkEncode/Digits/Speed/1e4-32            35100         23758         -32.31%
BenchmarkEncode/Digits/Speed/1e5-32            675355        385954        -42.85%
BenchmarkEncode/Digits/Speed/1e6-32            6878375       4873784       -29.14%
BenchmarkEncode/Digits/Default/1e4-32          63411         40974         -35.38%
BenchmarkEncode/Digits/Default/1e5-32          1815762       801563        -55.86%
BenchmarkEncode/Digits/Default/1e6-32          18875894      8101836       -57.08%
BenchmarkEncode/Digits/Compression/1e4-32      63859         85275         +33.54%
BenchmarkEncode/Digits/Compression/1e5-32      1803745       2752174       +52.58%
BenchmarkEncode/Digits/Compression/1e6-32      18931995      30727403      +62.30%
BenchmarkEncode/Newton/Huffman/1e4-32          15770         11108         -29.56%
BenchmarkEncode/Newton/Huffman/1e5-32          134567        85103         -36.76%
BenchmarkEncode/Newton/Huffman/1e6-32          1663889       1030186       -38.09%
BenchmarkEncode/Newton/Speed/1e4-32            32749         22934         -29.97%
BenchmarkEncode/Newton/Speed/1e5-32            565609        336750        -40.46%
BenchmarkEncode/Newton/Speed/1e6-32            5996011       3815437       -36.37%
BenchmarkEncode/Newton/Default/1e4-32          70505         34148         -51.57%
BenchmarkEncode/Newton/Default/1e5-32          2374066       570673        -75.96%
BenchmarkEncode/Newton/Default/1e6-32          24562355      5975917       -75.67%
BenchmarkEncode/Newton/Compression/1e4-32      71505         77670         +8.62%
BenchmarkEncode/Newton/Compression/1e5-32      3345768       3730804       +11.51%
BenchmarkEncode/Newton/Compression/1e6-32      35770364      39768939      +11.18%

benchmark                                      old MB/s      new MB/s      speedup
BenchmarkEncode/Digits/Huffman/1e4-32          874.80        1249.91       1.43x
BenchmarkEncode/Digits/Huffman/1e5-32          811.86        1337.25       1.65x
BenchmarkEncode/Digits/Huffman/1e6-32          793.40        1333.29       1.68x
BenchmarkEncode/Digits/Speed/1e4-32            284.90        420.91        1.48x
BenchmarkEncode/Digits/Speed/1e5-32            148.07        259.10        1.75x
BenchmarkEncode/Digits/Speed/1e6-32            145.38        205.18        1.41x
BenchmarkEncode/Digits/Default/1e4-32          157.70        244.06        1.55x
BenchmarkEncode/Digits/Default/1e5-32          55.07         124.76        2.27x
BenchmarkEncode/Digits/Default/1e6-32          52.98         123.43        2.33x
BenchmarkEncode/Digits/Compression/1e4-32      156.59        117.27        0.75x
BenchmarkEncode/Digits/Compression/1e5-32      55.44         36.33         0.66x
BenchmarkEncode/Digits/Compression/1e6-32      52.82         32.54         0.62x
BenchmarkEncode/Newton/Huffman/1e4-32          634.13        900.25        1.42x
BenchmarkEncode/Newton/Huffman/1e5-32          743.12        1175.04       1.58x
BenchmarkEncode/Newton/Huffman/1e6-32          601.00        970.70        1.62x
BenchmarkEncode/Newton/Speed/1e4-32            305.35        436.03        1.43x
BenchmarkEncode/Newton/Speed/1e5-32            176.80        296.96        1.68x
BenchmarkEncode/Newton/Speed/1e6-32            166.78        262.09        1.57x
BenchmarkEncode/Newton/Default/1e4-32          141.83        292.84        2.06x
BenchmarkEncode/Newton/Default/1e5-32          42.12         175.23        4.16x
BenchmarkEncode/Newton/Default/1e6-32          40.71         167.34        4.11x
BenchmarkEncode/Newton/Compression/1e4-32      139.85        128.75        0.92x
BenchmarkEncode/Newton/Compression/1e5-32      29.89         26.80         0.90x
BenchmarkEncode/Newton/Compression/1e6-32      27.96         25.15         0.90x

Static Memory Usage:

Before:
Level -2: Memory Used: 704KB, 8 allocs
Level -1: Memory Used: 776KB, 7 allocs
Level 0: Memory Used: 704KB, 7 allocs
Level 1: Memory Used: 1160KB, 13 allocs
Level 2: Memory Used: 776KB, 8 allocs
Level 3: Memory Used: 776KB, 8 allocs
Level 4: Memory Used: 776KB, 8 allocs
Level 5: Memory Used: 776KB, 8 allocs
Level 6: Memory Used: 776KB, 8 allocs
Level 7: Memory Used: 776KB, 8 allocs
Level 8: Memory Used: 776KB, 9 allocs
Level 9: Memory Used: 776KB, 8 allocs

After:
Level -2: Memory Used: 272KB, 12 allocs
Level -1: Memory Used: 1016KB, 7 allocs
Level 0: Memory Used: 304KB, 6 allocs
Level 1: Memory Used: 760KB, 13 allocs
Level 2: Memory Used: 1144KB, 8 allocs
Level 3: Memory Used: 1144KB, 8 allocs
Level 4: Memory Used: 888KB, 14 allocs
Level 5: Memory Used: 1016KB, 8 allocs
Level 6: Memory Used: 1016KB, 8 allocs
Level 7: Memory Used: 952KB, 7 allocs
Level 8: Memory Used: 952KB, 7 allocs
Level 9: Memory Used: 1080KB, 9 allocs

This package has been fuzz tested for about 24 hours. Currently, there
is about 1h between new "interesting" finds.

Change-Id: Icb4c9839dc8f1bb96fd6d548038679a7554a559b
---
 src/compress/flate/deflate.go                 | 858 +++++++++++-------
 src/compress/flate/deflate_test.go            | 703 +++-----------
 src/compress/flate/deflatefast.go             | 392 +++-----
 src/compress/flate/dict_decoder.go            |  11 +-
 src/compress/flate/example_test.go            |   3 +-
 src/compress/flate/fuzz_test.go               | 111 +++
 src/compress/flate/huffman_bit_writer.go      | 854 ++++++++++++-----
 src/compress/flate/huffman_bit_writer_test.go |  62 +-
 src/compress/flate/huffman_code.go            | 233 +++--
 src/compress/flate/huffman_sortByFreq.go      | 159 ++++
 src/compress/flate/huffman_sortByLiteral.go   | 201 ++++
 src/compress/flate/level1.go                  | 197 ++++
 src/compress/flate/level2.go                  | 187 ++++
 src/compress/flate/level3.go                  | 226 +++++
 src/compress/flate/level4.go                  | 204 +++++
 src/compress/flate/level5.go                  | 291 ++++++
 src/compress/flate/level6.go                  | 301 ++++++
 src/compress/flate/regmask_amd64.go           |  14 +
 src/compress/flate/regmask_other.go           |  18 +
 .../testdata/huffman-null-max.sync.expect     | Bin 0 -> 78 bytes
 .../huffman-null-max.sync.expect-noinput      | Bin 0 -> 78 bytes
 .../flate/testdata/huffman-pi.sync.expect     | Bin 0 -> 1696 bytes
 .../testdata/huffman-pi.sync.expect-noinput   | Bin 0 -> 1696 bytes
 .../huffman-rand-1k.dyn.expect-noinput        | Bin 1054 -> 1054 bytes
 .../testdata/huffman-rand-1k.sync.expect      | Bin 0 -> 1005 bytes
 .../huffman-rand-1k.sync.expect-noinput       | Bin 0 -> 1054 bytes
 .../testdata/huffman-rand-limit.dyn.expect    | Bin 229 -> 186 bytes
 .../huffman-rand-limit.dyn.expect-noinput     | Bin 229 -> 186 bytes
 .../flate/testdata/huffman-rand-limit.golden  | Bin 252 -> 246 bytes
 .../testdata/huffman-rand-limit.sync.expect   | Bin 0 -> 186 bytes
 .../huffman-rand-limit.sync.expect-noinput    | Bin 0 -> 186 bytes
 .../flate/testdata/huffman-shifts.sync.expect | Bin 0 -> 32 bytes
 .../huffman-shifts.sync.expect-noinput        | Bin 0 -> 32 bytes
 .../testdata/huffman-text-shift.sync.expect   | Bin 0 -> 231 bytes
 .../huffman-text-shift.sync.expect-noinput    | Bin 0 -> 231 bytes
 .../flate/testdata/huffman-text.sync.expect   |   1 +
 .../testdata/huffman-text.sync.expect-noinput |   1 +
 .../flate/testdata/huffman-zero.dyn.expect    | Bin 17 -> 6 bytes
 .../testdata/huffman-zero.dyn.expect-noinput  | Bin 17 -> 6 bytes
 .../flate/testdata/huffman-zero.sync.expect   | Bin 0 -> 6 bytes
 .../testdata/huffman-zero.sync.expect-noinput | Bin 0 -> 6 bytes
 .../null-long-match.sync.expect-noinput       | Bin 0 -> 206 bytes
 src/compress/flate/token.go                   | 253 +++++-
 src/compress/flate/unsafe_disabled.go         |  33 +
 src/compress/flate/writer_test.go             | 118 ++-
 45 files changed, 3894 insertions(+), 1537 deletions(-)
 create mode 100644 src/compress/flate/fuzz_test.go
 create mode 100644 src/compress/flate/huffman_sortByFreq.go
 create mode 100644 src/compress/flate/huffman_sortByLiteral.go
 create mode 100644 src/compress/flate/level1.go
 create mode 100644 src/compress/flate/level2.go
 create mode 100644 src/compress/flate/level3.go
 create mode 100644 src/compress/flate/level4.go
 create mode 100644 src/compress/flate/level5.go
 create mode 100644 src/compress/flate/level6.go
 create mode 100644 src/compress/flate/regmask_amd64.go
create mode 100644 src/compress/flate/regmask_other.go create mode 100644 src/compress/flate/testdata/huffman-null-max.sync.expect create mode 100644 src/compress/flate/testdata/huffman-null-max.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-pi.sync.expect create mode 100644 src/compress/flate/testdata/huffman-pi.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-rand-1k.sync.expect create mode 100644 src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-rand-limit.sync.expect create mode 100644 src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-shifts.sync.expect create mode 100644 src/compress/flate/testdata/huffman-shifts.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-text-shift.sync.expect create mode 100644 src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-text.sync.expect create mode 100644 src/compress/flate/testdata/huffman-text.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-zero.sync.expect create mode 100644 src/compress/flate/testdata/huffman-zero.sync.expect-noinput create mode 100644 src/compress/flate/testdata/null-long-match.sync.expect-noinput create mode 100644 src/compress/flate/unsafe_disabled.go diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go index 6697f3a7913cd5..3819f2e1eae81d 100644 --- a/src/compress/flate/deflate.go +++ b/src/compress/flate/deflate.go @@ -27,132 +27,121 @@ const ( // RFC 1951 compliant. That is, any valid DEFLATE decompressor will // continue to be able to decompress this output. HuffmanOnly = -2 -) -const ( - logWindowSize = 15 - windowSize = 1 << logWindowSize - windowMask = windowSize - 1 - - // The LZ77 step produces a sequence of literal tokens and - // pair tokens. The offset is also known as distance. The underlying wire - // format limits the range of lengths and offsets. For example, there are - // 256 legitimate lengths: those in the range [3, 258]. This package's - // compressor uses a higher minimum match length, enabling optimizations - // such as finding matches via 32-bit loads and compares. - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - minMatchLength = 4 // The smallest match length that the compressor actually emits - maxMatchLength = 258 // The largest match length - baseMatchOffset = 1 // The smallest match offset - maxMatchOffset = 1 << 15 // The largest match offset - - // The maximum number of tokens we put into a single flate block, just to - // stop things from getting too large. - maxFlateBlockTokens = 1 << 14 + logWindowSize = 15 + windowSize = 1 << logWindowSize + windowMask = windowSize - 1 + minMatchLength = 4 // The smallest match that the compressor looks for + maxMatchLength = 258 // The longest match for the compressor + minOffsetSize = 1 // The shortest offset that makes any sense + + // The maximum number of tokens we will encode at the time. + // Smaller sizes usually creates less optimal blocks. + // Bigger can make context switching slow. + // We use this for levels 7-9, so we make it big. 
+ maxFlateBlockTokens = 1 << 15 maxStoreBlockSize = 65535 hashBits = 17 // After 17 performance degrades hashSize = 1 << hashBits hashMask = (1 << hashBits) - 1 - maxHashOffset = 1 << 24 + maxHashOffset = 1 << 28 skipNever = math.MaxInt32 ) type compressionLevel struct { - level, good, lazy, nice, chain, fastSkipHashing int + good, lazy, nice, chain, level int } var levels = []compressionLevel{ - {0, 0, 0, 0, 0, 0}, // NoCompression. - {1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go. - // For levels 2-3 we don't bother trying with lazy matches. - {2, 4, 0, 16, 8, 5}, - {3, 4, 0, 32, 32, 6}, - // Levels 4-9 use increasingly more lazy matching + {}, // 0 + // Level 1-6 uses specialized algorithm - values not used + {0, 0, 0, 0, 1}, + {0, 0, 0, 0, 2}, + {0, 0, 0, 0, 3}, + {0, 0, 0, 0, 4}, + {0, 0, 0, 0, 5}, + {0, 0, 0, 0, 6}, + // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {4, 4, 4, 16, 16, skipNever}, - {5, 8, 16, 32, 32, skipNever}, - {6, 8, 16, 128, 128, skipNever}, - {7, 8, 32, 128, 256, skipNever}, - {8, 32, 128, 258, 1024, skipNever}, - {9, 32, 258, 258, 4096, skipNever}, + {8, 12, 16, 24, 7}, + {16, 30, 40, 64, 8}, + {32, 258, 258, 1024, 9}, } -type compressor struct { - compressionLevel +// advancedState contains state for the advanced levels, with bigger hash tables, etc. +type advancedState struct { + // deflate state + length int + offset int + maxInsertIndex int + chainHead int + hashOffset int - w *huffmanBitWriter - bulkHasher func([]byte, []uint32) + ii uint16 // position of last match, intended to overflow to reset. - // compression algorithm - fill func(*compressor, []byte) int // copy data to window - step func(*compressor) // process window - bestSpeed *deflateFast // Encoder for BestSpeed + // input window: unprocessed data is window[index:windowEnd] + index int + hashMatch [maxMatchLength + minMatchLength]uint32 // Input hash chains // hashHead[hashValue] contains the largest inputIndex with the specified hash value // If hashHead[hashValue] is within the current window, then // hashPrev[hashHead[hashValue] & windowMask] contains the previous index // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 +} - // input window: unprocessed data is window[index:windowEnd] - index int - window []byte - windowEnd int - blockStart int // window index where current tokens start - byteAvailable bool // if true, still need to process window[index-1]. +type compressor struct { + compressionLevel - sync bool // requesting flush + h *huffmanEncoder + w *huffmanBitWriter - // queued output tokens - tokens []token + // compression algorithm + fill func(*compressor, []byte) int // copy data to window + step func(*compressor) // process window - // deflate state - length int - offset int - maxInsertIndex int - err error + window []byte + windowEnd int + blockStart int // window index where current tokens start + err error + + // queued output tokens + tokens tokens + fast fastEnc + state *advancedState - // hashMatch must be able to contain hashes for the maximum match length. - hashMatch [maxMatchLength - 1]uint32 + sync bool // requesting flush + byteAvailable bool // if true, still need to process window[index-1]. 
} func (d *compressor) fillDeflate(b []byte) int { - if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) { + s := d.state + if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window, d.window[windowSize:2*windowSize]) - d.index -= windowSize + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) + s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { d.blockStart -= windowSize } else { d.blockStart = math.MaxInt32 } - d.hashOffset += windowSize - if d.hashOffset > maxHashOffset { - delta := d.hashOffset - 1 - d.hashOffset -= delta - d.chainHead -= delta - + s.hashOffset += windowSize + if s.hashOffset > maxHashOffset { + delta := s.hashOffset - 1 + s.hashOffset -= delta + s.chainHead -= delta // Iterate over slices instead of arrays to avoid copying // the entire table onto the stack (Issue #18625). - for i, v := range d.hashPrev[:] { - if int(v) > delta { - d.hashPrev[i] = uint32(int(v) - delta) - } else { - d.hashPrev[i] = 0 - } + for i, v := range s.hashPrev[:] { + s.hashPrev[i] = uint32(max(int(v)-delta, 0)) } - for i, v := range d.hashHead[:] { - if int(v) > delta { - d.hashHead[i] = uint32(int(v) - delta) - } else { - d.hashHead[i] = 0 - } + for i, v := range s.hashHead[:] { + s.hashHead[i] = uint32(max(int(v)-delta, 0)) } } } @@ -161,14 +150,38 @@ func (d *compressor) fillDeflate(b []byte) int { return n } -func (d *compressor) writeBlock(tokens []token, index int) error { - if index > 0 { +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { + if index > 0 || eof { var window []byte if d.blockStart <= index { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tokens, false, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) + return d.w.err + } + return nil +} + +// writeBlockSkip writes the current block and uses the number of tokens +// to determine if the block should be stored on no matches, or +// only huffman encoded. +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { + if index > 0 || eof { + if d.blockStart <= index { + window := d.window[d.blockStart:index] + // If we removed less than a 64th of all literals + // we huffman compress the block. + if int(tok.n) > len(window)-int(tok.n>>6) { + d.w.writeBlockHuff(eof, window, d.sync) + } else { + // Write a dynamic huffman block. + d.w.writeBlockDynamic(tok, eof, window, d.sync) + } + } else { + d.w.writeBlock(tok, eof, nil) + } + d.blockStart = index return d.w.err } return nil @@ -177,103 +190,139 @@ func (d *compressor) writeBlock(tokens []token, index int) error { // fillWindow will fill the current window with the supplied // dictionary and calculate all hashes. // This is much faster than doing a full encode. -// Should only be used after a reset. +// Should only be used after a start/reset. func (d *compressor) fillWindow(b []byte) { - // Do not fill window if we are in store-only mode. - if d.compressionLevel.level < 2 { + // Do not fill window if we are in store-only or huffman mode. + if d.level <= 0 { return } - if d.index != 0 || d.windowEnd != 0 { - panic("internal error: fillWindow called with stale data") + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() + return } - + s := d.state // If we are given too much, cut it. 
if len(b) > windowSize { b = b[len(b)-windowSize:] } // Add all to window. - n := copy(d.window, b) + n := copy(d.window[d.windowEnd:], b) // Calculate 256 hashes at the time (more L1 cache hits) loops := (n + 256 - minMatchLength) / 256 - for j := 0; j < loops; j++ { - index := j * 256 - end := index + 256 + minMatchLength - 1 - if end > n { - end = n - } - toCheck := d.window[index:end] - dstSize := len(toCheck) - minMatchLength + 1 + for j := range loops { + startindex := j * 256 + end := min(startindex+256+minMatchLength-1, n) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 if dstSize <= 0 { continue } - dst := d.hashMatch[:dstSize] - d.bulkHasher(toCheck, dst) + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 for i, val := range dst { - di := i + index - hh := &d.hashHead[val&hashMask] + di := i + startindex + newH = val & hashMask // Get previous value with the same hash. // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = *hh + s.hashPrev[di&windowMask] = s.hashHead[newH] // Set the head of the hash chain to us. - *hh = uint32(di + d.hashOffset) + s.hashHead[newH] = uint32(di + s.hashOffset) } } // Update window information. - d.windowEnd = n - d.index = n + d.windowEnd += n + s.index = n } // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. -func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } +func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) { + minMatchLook := min(lookahead, maxMatchLength) win := d.window[0 : pos+minMatchLook] // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } + nice := min(d.nice, len(win)-pos) // If we've got a match that's good enough, only look in 1/4 the chain. tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } + length = minMatchLength - 1 wEnd := win[pos+length] wPos := win[pos:] - minIndex := pos - windowSize + minIndex := max(pos-windowSize, 0) + offset = 0 + + if d.chain < 100 { + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + length = n + offset = pos - i + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return + } + + // Minimum gain to accept a match. + cGain := 4 + + // Some like it higher (CSV), some like it lower (JSON) + const baseCost = 3 + // Base is 4 bytes at with an additional cost. + // Matches must be better than this. for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + // Calculate gain. Estimates the gains of the new match compared to emitting as literals. 
+ newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) + + if newGain > cGain { + length = n + offset = pos - i + cGain = newGain + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] } - wEnd = win[pos+n] } } - if i == minIndex { + if i <= minIndex { // hashPrev[i & windowMask] has already been overwritten, so stop now. break } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { break } } @@ -288,235 +337,272 @@ func (d *compressor) writeStoredBlock(buf []byte) error { return d.w.err } -const hashmul = 0x1e35a7bd - // hash4 returns a hash representation of the first 4 bytes // of the supplied slice. // The caller must ensure that len(b) >= 4. func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) + return hash4u(loadLE32(b, 0), hashBits) +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) } // bulkHash4 will compute hashes using the same -// algorithm as hash4. +// algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { + if len(b) < 4 { return } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 - for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) - } -} - -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } - } - return max -} - -// encSpeed will compress and store the currently added data, -// if enough has been accumulated or we at the end of the stream. -// Any error that occurred will be in d.err -func (d *compressor) encSpeed() { - // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { - if !d.sync { - return - } - - // Handle small sizes. - if d.windowEnd < 128 { - switch { - case d.windowEnd == 0: - return - case d.windowEnd <= 16: - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - default: - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - } - d.windowEnd = 0 - d.bestSpeed.reset() - return - } - - } - // Encode the block. - d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd]) + hb := loadLE32(b, 0) - // If we removed less than 1/16th, Huffman compress the block. 
- if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - } else { - d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd]) + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 + for i := 1; i < end; i++ { + hb = (hb >> 8) | uint32(b[i+3])<<24 + dst[i] = hash4u(hb, hashBits) } - d.err = d.w.err - d.windowEnd = 0 } func (d *compressor) initDeflate() { d.window = make([]byte, 2*windowSize) - d.hashOffset = 1 - d.tokens = make([]token, 0, maxFlateBlockTokens+1) - d.length = minMatchLength - 1 - d.offset = 0 d.byteAvailable = false - d.index = 0 - d.chainHead = -1 - d.bulkHasher = bulkHash4 + d.err = nil + if d.state == nil { + return + } + s := d.state + s.index = 0 + s.hashOffset = 1 + s.length = minMatchLength - 1 + s.offset = 0 + s.chainHead = -1 } -func (d *compressor) deflate() { - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { +// deflateLazy does encoding with lazy matching. +func (d *compressor) deflateLazy() { + s := d.state + + if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + // This is used to estimate the cost of emitting a literal. + if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) + } - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) + s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) -Loop: for { - if d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index + lookahead := d.windowEnd - s.index if lookahead < minMatchLength+maxMatchLength { if !d.sync { - break Loop - } - if d.index > d.windowEnd { - panic("index > windowEnd") + return } if lookahead == 0 { // Flush current output block if any. 
if d.byteAvailable { // There is still one pending token that needs to be flushed - d.tokens = append(d.tokens, literalToken(uint32(d.window[d.index-1]))) + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false } - if len(d.tokens) > 0 { - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.tokens.n > 0 { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - break Loop + return } } - if d.index < d.maxInsertIndex { + if s.index < s.maxInsertIndex { // Update the hash - hash := hash4(d.window[d.index : d.index+minMatchLength]) - hh := &d.hashHead[hash&hashMask] - d.chainHead = int(*hh) - d.hashPrev[d.index&windowMask] = uint32(d.chainHead) - *hh = uint32(d.index + d.hashOffset) + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[hash] = uint32(s.index + s.hashOffset) } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 + prevLength := s.length + prevOffset := s.offset + s.length = minMatchLength - 1 + s.offset = 0 + minIndex := max(s.index-windowSize, 0) + + if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { + s.length = newLength + s.offset = newOffset + } } - if d.chainHead-d.hashOffset >= minIndex && - (d.fastSkipHashing != skipNever && lookahead > minMatchLength-1 || - d.fastSkipHashing == skipNever && lookahead > prevLength && prevLength < d.lazy) { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset + if prevLength >= minMatchLength && s.length <= prevLength { + // No better match, but check for better match at end... + // + // Skip forward a number of bytes. + // Offset of 2 seems to yield the best results. 3 is sometimes better. + const checkOff = 2 + + // Check all, except full length + if prevLength < maxMatchLength-checkOff { + prevIndex := s.index - 1 + if prevIndex+prevLength < s.maxInsertIndex { + end := min(lookahead, maxMatchLength+checkOff) + end += prevIndex + + // Hash at match end. + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + + // Extend back... 
+ for i := checkOff - 1; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } + } + } } - } - if d.fastSkipHashing != skipNever && d.length >= minMatchLength || - d.fastSkipHashing == skipNever && prevLength >= minMatchLength && d.length <= prevLength { // There was a match at the previous step, and the current match is // not better. Output the previous match. - if d.fastSkipHashing != skipNever { - d.tokens = append(d.tokens, matchToken(uint32(d.length-baseMatchLength), uint32(d.offset-baseMatchOffset))) - } else { - d.tokens = append(d.tokens, matchToken(uint32(prevLength-baseMatchLength), uint32(prevOffset-baseMatchOffset))) - } + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) + // Insert in the hash table all strings up to the end of the match. // index and index-1 are already inserted. If there is not enough // lookahead, the last two strings are not inserted into the hash // table. - if d.length <= d.fastSkipHashing { - var newIndex int - if d.fastSkipHashing != skipNever { - newIndex = d.index + d.length - } else { - newIndex = d.index + prevLength - 1 - } - index := d.index - for index++; index < newIndex; index++ { - if index < d.maxInsertIndex { - hash := hash4(d.window[index : index+minMatchLength]) - // Get previous value with the same hash. - // Our chain should point to the previous value. - hh := &d.hashHead[hash&hashMask] - d.hashPrev[index&windowMask] = *hh - // Set the head of the hash chain to us. - *hh = uint32(index + d.hashOffset) - } + newIndex := s.index + prevLength - 1 + // Calculate missing hashes + end := min(newIndex, s.maxInsertIndex) + end += minMatchLength - 1 + startindex := min(s.index+1, s.maxInsertIndex) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 + if dstSize > 0 { + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 + for i, val := range dst { + di := i + startindex + newH = val & hashMask + // Get previous value with the same hash. + // Our chain should point to the previous value. + s.hashPrev[di&windowMask] = s.hashHead[newH] + // Set the head of the hash chain to us. + s.hashHead[newH] = uint32(di + s.hashOffset) } - d.index = index - - if d.fastSkipHashing == skipNever { - d.byteAvailable = false - d.length = minMatchLength - 1 - } - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. 
- d.index += d.length } - if len(d.tokens) == maxFlateBlockTokens { + + s.index = newIndex + d.byteAvailable = false + s.length = minMatchLength - 1 + if d.tokens.n == maxFlateBlockTokens { // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } + s.ii = 0 } else { - if d.fastSkipHashing != skipNever || d.byteAvailable { - i := d.index - 1 - if d.fastSkipHashing != skipNever { - i = d.index - } - d.tokens = append(d.tokens, literalToken(uint32(d.window[i]))) - if len(d.tokens) == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, i+1); d.err != nil { + // Reset, if we got a match this run. + if s.length >= minMatchLength { + s.ii = 0 + } + // We have a byte waiting. Emit it. + if d.byteAvailable { + s.ii++ + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - } - d.index++ - if d.fastSkipHashing == skipNever { + s.index++ + + // If we have a long run of no matches, skip additional bytes + // Resets when s.ii overflows after 64KB. + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) + for j := 0; j < n; j++ { + if s.index >= d.windowEnd-1 { + break + } + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + // Index... + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + s.index++ + } + // Flush last byte + d.tokens.AddLiteral(d.window[s.index-1]) + d.byteAvailable = false + // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + } + } else { + s.index++ d.byteAvailable = true } } } } -func (d *compressor) fillStore(b []byte) int { - n := copy(d.window[d.windowEnd:], b) - d.windowEnd += n - return n -} - func (d *compressor) store() { if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) @@ -524,38 +610,93 @@ func (d *compressor) store() { } } -// storeHuff compresses and stores the currently added data -// when the d.window is full or we are at the end of the stream. +// fillWindow will fill the buffer with data for huffman-only compression. +// The number of bytes copied is returned. +func (d *compressor) fillBlock(b []byte) int { + n := copy(d.window[d.windowEnd:], b) + d.windowEnd += n + return n +} + +// storeHuff will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. // Any error that occurred will be in d.err func (d *compressor) storeHuff() { if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { return } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err d.windowEnd = 0 } +// storeFast will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. 
+// Any error that occurred will be in d.err +func (d *compressor) storeFast() { + // We only compress if we have maxStoreBlockSize. + if d.windowEnd < len(d.window) { + if !d.sync { + return + } + // Handle extremely small sizes. + if d.windowEnd < 128 { + if d.windowEnd == 0 { + return + } + if d.windowEnd <= 32 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + } else { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 + d.fast.Reset() + return + } + } + + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) + // If we made zero matches, store the block as is. + if d.tokens.n == 0 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + // If we removed less than 1/16th, huffman compress the block. + } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } else { + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 +} + +// write will add input byte to the stream. +// Unless an error occurs all bytes will be consumed. func (d *compressor) write(b []byte) (n int, err error) { if d.err != nil { return 0, d.err } n = len(b) for len(b) > 0 { - d.step(d) + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } b = b[d.fill(d, b):] if d.err != nil { return 0, d.err } } - return n, nil + return n, d.err } func (d *compressor) syncFlush() error { + d.sync = true if d.err != nil { return d.err } - d.sync = true d.step(d) if d.err == nil { d.w.writeStoredHeader(0, false) @@ -572,30 +713,33 @@ func (d *compressor) init(w io.Writer, level int) (err error) { switch { case level == NoCompression: d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == HuffmanOnly: - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.w.logNewTablePenalty = 10 + d.window = make([]byte, 32<<10) + d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff - case level == BestSpeed: - d.compressionLevel = levels[level] - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore - d.step = (*compressor).encSpeed - d.bestSpeed = newDeflateFast() - d.tokens = make([]token, maxStoreBlockSize) case level == DefaultCompression: level = 6 fallthrough - case 2 <= level && level <= 9: + case level >= 1 && level <= 6: + d.w.logNewTablePenalty = 7 + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logNewTablePenalty = 8 + d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate - d.step = (*compressor).deflate + d.step = (*compressor).deflateLazy default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } + d.level = level return nil } @@ -603,27 +747,39 @@ func (d *compressor) reset(w io.Writer) { d.w.reset(w) d.sync = false d.err = nil - switch d.compressionLevel.level { - case NoCompression: + // We only need to reset a few things for Snappy. + if d.fast != nil { + d.fast.Reset() d.windowEnd = 0 - case BestSpeed: + d.tokens.Reset() + return + } + switch d.compressionLevel.chain { + case 0: + // level was NoCompression or ConstantCompression. 
d.windowEnd = 0 - d.tokens = d.tokens[:0] - d.bestSpeed.reset() default: - d.chainHead = -1 - clear(d.hashHead[:]) - clear(d.hashPrev[:]) - d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 + s := d.state + s.chainHead = -1 + for i := range s.hashHead { + s.hashHead[i] = 0 + } + for i := range s.hashPrev { + s.hashPrev[i] = 0 + } + s.hashOffset = 1 + s.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false - d.tokens = d.tokens[:0] - d.length = minMatchLength - 1 - d.offset = 0 - d.maxInsertIndex = 0 + d.tokens.Reset() + s.length = minMatchLength - 1 + s.offset = 0 + s.ii = 0 + s.maxInsertIndex = 0 } } +var errWriterClosed = errors.New("flate: closed writer") + func (d *compressor) close() error { if d.err == errWriterClosed { return nil @@ -644,6 +800,7 @@ func (d *compressor) close() error { return d.w.err } d.err = errWriterClosed + d.w.reset(nil) return nil } @@ -674,26 +831,15 @@ func NewWriter(w io.Writer, level int) (*Writer, error) { // can only be decompressed by a reader initialized with the // same dictionary (see [NewReaderDict]). func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { - dw := &dictWriter{w} - zw, err := NewWriter(dw, level) + zw, err := NewWriter(w, level) if err != nil { return nil, err } zw.d.fillWindow(dict) zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method. - return zw, nil -} - -type dictWriter struct { - w io.Writer + return zw, err } -func (w *dictWriter) Write(b []byte) (n int, err error) { - return w.w.Write(b) -} - -var errWriterClosed = errors.New("flate: closed writer") - // A Writer takes data written to it and writes the compressed // form of that data to an underlying writer (see [NewWriter]). type Writer struct { @@ -728,16 +874,26 @@ func (w *Writer) Close() error { } // Reset discards the writer's state and makes it equivalent to -// the result of [NewWriter] or [NewWriterDict] called with dst +// the result of NewWriter or NewWriterDict called with dst // and w's level and dictionary. func (w *Writer) Reset(dst io.Writer) { - if dw, ok := w.d.w.writer.(*dictWriter); ok { + if len(w.dict) > 0 { // w was created with NewWriterDict - dw.w = dst - w.d.reset(dw) - w.d.fillWindow(w.dict) + w.d.reset(dst) + if dst != nil { + w.d.fillWindow(w.dict) + } } else { // w was created with NewWriter w.d.reset(dst) } } + +// ResetDict discards the writer's state and makes it equivalent to +// the result of NewWriter or NewWriterDict called with dst +// and w's level, but sets a specific dictionary. 
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) { + w.dict = dict + w.d.reset(dst) + w.d.fillWindow(w.dict) +} diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go index 3610c7bf8763df..4bb89c61dcad0c 100644 --- a/src/compress/flate/deflate_test.go +++ b/src/compress/flate/deflate_test.go @@ -6,14 +6,11 @@ package flate import ( "bytes" - "errors" "fmt" - "internal/testenv" "io" - "math/rand" "os" "reflect" - "runtime/debug" + "strings" "sync" "testing" ) @@ -35,24 +32,24 @@ type reverseBitsTest struct { } var deflateTests = []*deflateTest{ - {[]byte{}, 0, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, -1, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, DefaultCompression, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, 4, []byte{18, 4, 4, 0, 0, 255, 255}}, - - {[]byte{0x11}, 0, []byte{0, 1, 0, 254, 255, 17, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 0, []byte{0, 2, 0, 253, 255, 17, 18, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, - []byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255}, + 0: {[]byte{}, 0, []byte{0x3, 0x0}}, + 1: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 2: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 3: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + + 4: {[]byte{0x11}, 0, []byte{0x0, 0x1, 0x0, 0xfe, 0xff, 0x11, 0x3, 0x0}}, + 5: {[]byte{0x11, 0x12}, 0, []byte{0x0, 0x2, 0x0, 0xfd, 0xff, 0x11, 0x12, 0x3, 0x0}}, + 6: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, + []byte{0x0, 0x8, 0x0, 0xf7, 0xff, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x3, 0x0}, }, - {[]byte{}, 2, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, - {[]byte{}, 9, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, + 7: {[]byte{}, 1, []byte{0x3, 0x0}}, + 8: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 9: {[]byte{0x11, 0x12}, BestCompression, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 10: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, BestCompression, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, + 11: {[]byte{}, 9, []byte{0x3, 0x0}}, + 12: {[]byte{0x11}, 9, []byte{0x12, 0x4, 0xc, 0x0}}, + 13: {[]byte{0x11, 0x12}, 9, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 14: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, } var deflateInflateTests = []*deflateInflateTest{ @@ -86,23 +83,24 @@ func largeDataChunk() []byte { func TestBulkHash4(t *testing.T) { for _, x := range deflateTests { y := x.out - if len(y) < minMatchLength { - continue - } - y = append(y, y...) - for j := 4; j < len(y); j++ { - y := y[:j] - dst := make([]uint32, len(y)-minMatchLength+1) - for i := range dst { - dst[i] = uint32(i + 100) - } - bulkHash4(y, dst) - for i, got := range dst { - want := hash4(y[i:]) - if got != want && got == uint32(i)+100 { - t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want) - } else if got != want { - t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want) + if len(y) >= minMatchLength { + y = append(y, y...) 
+ for j := 4; j < len(y); j++ { + y := y[:j] + dst := make([]uint32, len(y)-minMatchLength+1) + for i := range dst { + dst[i] = uint32(i + 100) + } + bulkHash4(y, dst) + for i, got := range dst { + want := hash4(y[i:]) + if got != want && got == uint32(i)+100 { + t.Errorf("Len:%d Index:%d, expected 0x%08x but not modified", len(y), i, want) + } else if got != want { + t.Errorf("Len:%d Index:%d, got 0x%08x expected:0x%08x", len(y), i, got, want) + } else { + //t.Logf("Len:%d Index:%d OK (0x%08x)", len(y), i, got) + } } } } @@ -110,7 +108,7 @@ func TestBulkHash4(t *testing.T) { } func TestDeflate(t *testing.T) { - for _, h := range deflateTests { + for i, h := range deflateTests { var buf bytes.Buffer w, err := NewWriter(&buf, h.level) if err != nil { @@ -120,45 +118,11 @@ func TestDeflate(t *testing.T) { w.Write(h.in) w.Close() if !bytes.Equal(buf.Bytes(), h.out) { - t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out) + t.Errorf("%d: Deflate(%d, %x) got \n%#v, want \n%#v", i, h.level, h.in, buf.Bytes(), h.out) } } } -func TestWriterClose(t *testing.T) { - b := new(bytes.Buffer) - zw, err := NewWriter(b, 6) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - if c, err := zw.Write([]byte("Test")); err != nil || c != 4 { - t.Fatalf("Write to not closed writer: %s, %d", err, c) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - afterClose := b.Len() - - if c, err := zw.Write([]byte("Test")); err == nil || c != 0 { - t.Fatalf("Write to closed writer: %v, %d", err, c) - } - - if err := zw.Flush(); err == nil { - t.Fatalf("Flush to closed writer: %s", err) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - if afterClose != b.Len() { - t.Fatalf("Writer wrote data after close. After close: %d. After writes on closed stream: %d", afterClose, b.Len()) - } -} - // A sparseReader returns a stream consisting of 0s followed by 1<<16 1s. // This tests missing hash references in a very large input. type sparseReader struct { @@ -191,7 +155,8 @@ func TestVeryLongSparseChunk(t *testing.T) { if testing.Short() { t.Skip("skipping sparse chunk during short test") } - w, err := NewWriter(io.Discard, 1) + var buf bytes.Buffer + w, err := NewWriter(&buf, 1) if err != nil { t.Errorf("NewWriter: %v", err) return @@ -200,6 +165,7 @@ func TestVeryLongSparseChunk(t *testing.T) { t.Errorf("Compress failed: %v", err) return } + t.Log("Length:", buf.Len()) } type syncBuffer struct { @@ -270,7 +236,7 @@ func testSync(t *testing.T, level int, input []byte, name string) { r := NewReader(buf) // Write half the input and read back. 
- for i := 0; i < 2; i++ { + for i := range 2 { var lo, hi int if i == 0 { lo, hi = 0, (len(input)+1)/2 @@ -348,13 +314,13 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } w.Write(input) w.Close() + if limit > 0 { + t.Logf("level: %d - Size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) + } if limit > 0 && buffer.Len() > limit { t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit) - return - } - if limit > 0 { - t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) } + r := NewReader(&buffer) out, err := io.ReadAll(r) if err != nil { @@ -363,6 +329,8 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } r.Close() if !bytes.Equal(input, out) { + os.WriteFile("testdata/fails/"+t.Name()+".got", out, os.ModePerm) + os.WriteFile("testdata/fails/"+t.Name()+".want", input, os.ModePerm) t.Errorf("decompress(compress(data)) != data: level=%d input=%s", level, name) return } @@ -370,19 +338,14 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } func testToFromWithLimit(t *testing.T, input []byte, name string, limit [11]int) { - for i := 0; i < 10; i++ { + for i := range 10 { testToFromWithLevelAndLimit(t, i, input, name, limit[i]) } - // Test HuffmanCompression testToFromWithLevelAndLimit(t, -2, input, name, limit[10]) } func TestDeflateInflate(t *testing.T) { - t.Parallel() for i, h := range deflateInflateTests { - if testing.Short() && len(h.in) > 10000 { - continue - } testToFromWithLimit(t, h.in, fmt.Sprintf("#%d", i), [11]int{}) } } @@ -399,33 +362,38 @@ func TestReverseBits(t *testing.T) { type deflateInflateStringTest struct { filename string label string - limit [11]int + limit [11]int // Number 11 is ConstantCompression } var deflateInflateStringTests = []deflateInflateStringTest{ { "../testdata/e.txt", "2.718281828...", - [...]int{100018, 50650, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683}, + [...]int{100018, 67900, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683 + 100}, }, { "../../testdata/Isaac.Newton-Opticks.txt", "Isaac.Newton-Opticks", - [...]int{567248, 218338, 198211, 193152, 181100, 175427, 175427, 173597, 173422, 173422, 325240}, + [...]int{567248, 218338, 201354, 199101, 190627, 182587, 179765, 174982, 173422, 173422, 325240}, }, } func TestDeflateInflateString(t *testing.T) { - t.Parallel() - if testing.Short() && testenv.Builder() == "" { - t.Skip("skipping in short mode") - } for _, test := range deflateInflateStringTests { gold, err := os.ReadFile(test.filename) if err != nil { t.Error(err) } - testToFromWithLimit(t, gold, test.label, test.limit) + // Remove returns that may be present on Windows + neutral := strings.Map(func(r rune) rune { + if r != '\r' { + return r + } + return -1 + }, string(gold)) + + testToFromWithLimit(t, []byte(neutral), test.label, test.limit) + if testing.Short() { break } @@ -460,31 +428,36 @@ func TestReaderDict(t *testing.T) { func TestWriterDict(t *testing.T) { const ( - dict = "hello world" - text = "hello again world" + dict = "hello world Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." 
+ text = "hello world Lorem ipsum dolor sit amet" ) - var b bytes.Buffer - w, err := NewWriter(&b, 5) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - w.Write([]byte(dict)) - w.Flush() - b.Reset() - w.Write([]byte(text)) - w.Close() + // This test is sensitive to algorithm changes that skip + // data in favour of speed. Higher levels are less prone to this + // so we test level 4-9. + for l := 4; l < 9; l++ { + var b bytes.Buffer + w, err := NewWriter(&b, l) + if err != nil { + t.Fatalf("level %d, NewWriter: %v", l, err) + } + w.Write([]byte(dict)) + w.Flush() + b.Reset() + w.Write([]byte(text)) + w.Close() - var b1 bytes.Buffer - w, _ = NewWriterDict(&b1, 5, []byte(dict)) - w.Write([]byte(text)) - w.Close() + var b1 bytes.Buffer + w, _ = NewWriterDict(&b1, l, []byte(dict)) + w.Write([]byte(text)) + w.Close() - if !bytes.Equal(b1.Bytes(), b.Bytes()) { - t.Fatalf("writer wrote %q want %q", b1.Bytes(), b.Bytes()) + if !bytes.Equal(b1.Bytes(), b.Bytes()) { + t.Errorf("level %d, writer wrote\n%v\n want\n%v", l, b1.Bytes(), b.Bytes()) + } } } -// See https://golang.org/issue/2508 +// See http://code.google.com/p/go/issues/detail?id=2508 func TestRegression2508(t *testing.T) { if testing.Short() { t.Logf("test disabled with -short") @@ -495,7 +468,7 @@ func TestRegression2508(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := make([]byte, 1024) - for i := 0; i < 131072; i++ { + for range 131072 { if _, err := w.Write(buf); err != nil { t.Fatalf("writer failed: %v", err) } @@ -504,8 +477,10 @@ func TestRegression2508(t *testing.T) { } func TestWriterReset(t *testing.T) { - t.Parallel() - for level := 0; level <= 9; level++ { + for level := -2; level <= 9; level++ { + if level == -1 { + level++ + } if testing.Short() && level > 1 { break } @@ -514,11 +489,7 @@ func TestWriterReset(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := []byte("hello world") - n := 1024 - if testing.Short() { - n = 10 - } - for i := 0; i < n; i++ { + for range 1024 { w.Write(buf) } w.Reset(io.Discard) @@ -531,12 +502,12 @@ func TestWriterReset(t *testing.T) { // DeepEqual doesn't compare functions. w.d.fill, wref.d.fill = nil, nil w.d.step, wref.d.step = nil, nil - w.d.bulkHasher, wref.d.bulkHasher = nil, nil - w.d.bestSpeed, wref.d.bestSpeed = nil, nil + w.d.state, wref.d.state = nil, nil + w.d.fast, wref.d.fast = nil, nil + // hashMatch is always overwritten when used. - copy(w.d.hashMatch[:], wref.d.hashMatch[:]) - if len(w.d.tokens) != 0 { - t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens)) + if w.d.tokens.n != 0 { + t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, w.d.tokens.n) } // As long as the length is 0, we don't care about the content. 
w.d.tokens = wref.d.tokens @@ -548,76 +519,64 @@ func TestWriterReset(t *testing.T) { } } - levels := []int{0, 1, 2, 5, 9} - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("level-", i), func(w io.Writer) (*Writer, error) { return NewWriter(w, i) }) + } + dict := []byte(strings.Repeat("we are the world - how are you?", 3)) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-level-", i), func(w io.Writer) (*Writer, error) { return NewWriterDict(w, i, dict) }) + } + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-reset-level-", i), func(w io.Writer) (*Writer, error) { + w2, err := NewWriter(nil, i) + if err != nil { + return w2, err + } + w2.ResetDict(w, dict) + return w2, nil }) } - - t.Run("dict", func(t *testing.T) { - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) - }) - } - }) } -func testResetOutput(t *testing.T, level int, dict []byte) { - writeData := func(w *Writer) { - msg := []byte("now is the time for all good gophers") - w.Write(msg) - w.Flush() - - hello := []byte("hello world") - for i := 0; i < 1024; i++ { - w.Write(hello) +func testResetOutput(t *testing.T, name string, newWriter func(w io.Writer) (*Writer, error)) { + t.Run(name, func(t *testing.T) { + buf := new(bytes.Buffer) + w, err := newWriter(buf) + if err != nil { + t.Fatalf("NewWriter: %v", err) } + b := []byte("hello world - how are you doing?") + for range 1024 { + w.Write(b) + } + w.Close() + out1 := buf.Bytes() - fill := bytes.Repeat([]byte("x"), 65000) - w.Write(fill) - } - - buf := new(bytes.Buffer) - var w *Writer - var err error - if dict == nil { - w, err = NewWriter(buf, level) - } else { - w, err = NewWriterDict(buf, level, dict) - } - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - writeData(w) - w.Close() - out1 := buf.Bytes() - - buf2 := new(bytes.Buffer) - w.Reset(buf2) - writeData(w) - w.Close() - out2 := buf2.Bytes() + buf2 := new(bytes.Buffer) + w.Reset(buf2) + for range 1024 { + w.Write(b) + } + w.Close() + out2 := buf2.Bytes() - if len(out1) != len(out2) { - t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) - return - } - if !bytes.Equal(out1, out2) { - mm := 0 - for i, b := range out1[:len(out2)] { - if b != out2[i] { - t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b) - } - mm++ - if mm == 10 { - t.Fatal("Stopping") + if len(out1) != len(out2) { + t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) + } + if !bytes.Equal(out1, out2) { + mm := 0 + for i, b := range out1[:len(out2)] { + if b != out2[i] { + t.Errorf("mismatch index %d: %02x, expected %02x", i, out2[i], b) + } + mm++ + if mm == 10 { + t.Fatal("Stopping") + } } } - } - t.Logf("got %d bytes", len(out1)) + t.Logf("got %d bytes", len(out1)) + }) } // TestBestSpeed tests that round-tripping through deflate and then inflate @@ -625,7 +584,6 @@ func testResetOutput(t *testing.T, level int, dict []byte) { // compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize // (65535). 
func TestBestSpeed(t *testing.T) { - t.Parallel() abc := make([]byte, 128) for i := range abc { abc[i] = byte(i) @@ -653,8 +611,8 @@ func TestBestSpeed(t *testing.T) { } for i, tc := range testCases { - if i >= 3 && testing.Short() { - break + if testing.Short() && i > 5 { + t.Skip() } for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} { tc[0] = firstN @@ -703,368 +661,3 @@ func TestBestSpeed(t *testing.T) { } } } - -var errIO = errors.New("IO error") - -// failWriter fails with errIO exactly at the nth call to Write. -type failWriter struct{ n int } - -func (w *failWriter) Write(b []byte) (int, error) { - w.n-- - if w.n == -1 { - return 0, errIO - } - return len(b), nil -} - -func TestWriterPersistentWriteError(t *testing.T) { - t.Parallel() - d, err := os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") - if err != nil { - t.Fatalf("ReadFile: %v", err) - } - d = d[:10000] // Keep this test short - - zw, err := NewWriter(nil, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - // Sweep over the threshold at which an error is returned. - // The variable i makes it such that the ith call to failWriter.Write will - // return errIO. Since failWriter errors are not persistent, we must ensure - // that flate.Writer errors are persistent. - for i := 0; i < 1000; i++ { - fw := &failWriter{i} - zw.Reset(fw) - - _, werr := zw.Write(d) - cerr := zw.Close() - ferr := zw.Flush() - if werr != errIO && werr != nil { - t.Errorf("test %d, mismatching Write error: got %v, want %v", i, werr, errIO) - } - if cerr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Close error: got %v, want %v", i, cerr, errIO) - } - if ferr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Flush error: got %v, want %v", i, ferr, errIO) - } - if fw.n >= 0 { - // At this point, the failure threshold was sufficiently high enough - // that we wrote the whole stream without any errors. - return - } - } -} -func TestWriterPersistentFlushError(t *testing.T) { - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - flushErr := zw.Flush() - closeErr := zw.Close() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) -} - -func TestWriterPersistentCloseError(t *testing.T) { - // If underlying writer return error on closing stream we should persistent this error across all writer calls. - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - closeErr := zw.Close() - flushErr := zw.Flush() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) - - // After closing writer we should persistent "write after close" error across Flush and Write calls, but return nil - // on next Close calls. 
- var b bytes.Buffer - zw.Reset(&b) - err = zw.Close() - if err != nil { - t.Fatalf("First call to close returned error: %s", err) - } - err = zw.Close() - if err != nil { - t.Fatalf("Second call to close returned error: %s", err) - } - - flushErr = zw.Flush() - _, writeErr = zw.Write([]byte("Test")) - checkErrors([]error{flushErr, writeErr}, errWriterClosed, t) -} - -func checkErrors(got []error, want error, t *testing.T) { - t.Helper() - for _, err := range got { - if err != want { - t.Errorf("Error doesn't match\nWant: %s\nGot: %s", want, got) - } - } -} - -func TestBestSpeedMatch(t *testing.T) { - t.Parallel() - cases := []struct { - previous, current []byte - t, s, want int32 - }{{ - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 6, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 3, - }, { - previous: []byte{0, 0, 0, 1, 1}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 2, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 4, - }, { - previous: []byte{0, 0, 0, 1, 2, 3, 4, 5, 2, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -7, - s: 4, - want: 5, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 0, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -5, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 3, - }, { - previous: []byte{3, 4, 5}, - current: []byte{3, 4, 5}, - t: -3, - s: 0, - want: 3, - }, { - previous: make([]byte, 1000), - current: make([]byte, 1000), - t: -1000, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: 0, - s: 1, - want: maxMatchLength - 4, - }, { - previous: make([]byte, maxMatchLength-4), - current: make([]byte, 500), - t: -(maxMatchLength - 4), - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 400, - want: 100, - }, { - previous: make([]byte, 10), - current: make([]byte, 500), - t: 200, - s: 400, - want: 100, - }} - for i, c := range cases { - e := deflateFast{prev: c.previous} - got := e.matchLen(c.s, c.t, c.current) - if got != c.want { - t.Errorf("Test %d: match length, want %d, got %d", i, c.want, got) - } - } -} - -func TestBestSpeedMaxMatchOffset(t *testing.T) { - t.Parallel() - const abc, xyz = "abcdefgh", "stuvwxyz" - for _, matchBefore := range []bool{false, true} { - for _, extra := range []int{0, inputMargin - 1, inputMargin, inputMargin + 1, 2 * inputMargin} { - for offsetAdj := -5; offsetAdj <= +5; offsetAdj++ { - report := func(desc string, err error) { - t.Errorf("matchBefore=%t, extra=%d, offsetAdj=%d: %s%v", - matchBefore, extra, offsetAdj, desc, err) - } - - offset := maxMatchOffset + offsetAdj - - // Make src to be a []byte of the form - // "%s%s%s%s%s" % (abc, zeros0, xyzMaybe, abc, zeros1) - // where: - // zeros0 is approximately maxMatchOffset zeros. - // xyzMaybe is either xyz or the empty string. 
- // zeros1 is between 0 and 30 zeros. - // The difference between the two abc's will be offset, which - // is maxMatchOffset plus or minus a small adjustment. - src := make([]byte, offset+len(abc)+extra) - copy(src, abc) - if !matchBefore { - copy(src[offset-len(xyz):], xyz) - } - copy(src[offset:], abc) - - buf := new(bytes.Buffer) - w, err := NewWriter(buf, BestSpeed) - if err != nil { - report("NewWriter: ", err) - continue - } - if _, err := w.Write(src); err != nil { - report("Write: ", err) - continue - } - if err := w.Close(); err != nil { - report("Writer.Close: ", err) - continue - } - - r := NewReader(buf) - dst, err := io.ReadAll(r) - r.Close() - if err != nil { - report("ReadAll: ", err) - continue - } - - if !bytes.Equal(dst, src) { - report("", fmt.Errorf("bytes differ after round-tripping")) - continue - } - } - } - } -} - -func TestBestSpeedShiftOffsets(t *testing.T) { - // Test if shiftoffsets properly preserves matches and resets out-of-range matches - // seen in https://github.com/golang/go/issues/4142 - enc := newDeflateFast() - - // testData may not generate internal matches. - testData := make([]byte, 32) - rng := rand.New(rand.NewSource(0)) - for i := range testData { - testData[i] = byte(rng.Uint32()) - } - - // Encode the testdata with clean state. - // Second part should pick up matches from the first block. - wantFirstTokens := len(enc.encode(nil, testData)) - wantSecondTokens := len(enc.encode(nil, testData)) - - if wantFirstTokens <= wantSecondTokens { - t.Fatalf("test needs matches between inputs to be generated") - } - // Forward the current indicator to before wraparound. - enc.cur = bufferReset - int32(len(testData)) - - // Part 1 before wrap, should match clean state. - got := len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } - - // Verify we are about to wrap. - if enc.cur != bufferReset { - t.Errorf("got %d, want e.cur to be at bufferReset (%d)", enc.cur, bufferReset) - } - - // Part 2 should match clean state as well even if wrapped. - got = len(enc.encode(nil, testData)) - if wantSecondTokens != got { - t.Errorf("got %d, want %d token", got, wantSecondTokens) - } - - // Verify that we wrapped. - if enc.cur >= bufferReset { - t.Errorf("want e.cur to be < bufferReset (%d), got %d", bufferReset, enc.cur) - } - - // Forward the current buffer, leaving the matches at the bottom. - enc.cur = bufferReset - enc.shiftOffsets() - - // Ensure that no matches were picked up. - got = len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } -} - -func TestMaxStackSize(t *testing.T) { - // This test must not run in parallel with other tests as debug.SetMaxStack - // affects all goroutines. - n := debug.SetMaxStack(1 << 16) - defer debug.SetMaxStack(n) - - var wg sync.WaitGroup - defer wg.Wait() - - b := make([]byte, 1<<20) - for level := HuffmanOnly; level <= BestCompression; level++ { - // Run in separate goroutine to increase probability of stack regrowth. 
- wg.Add(1) - go func(level int) { - defer wg.Done() - zw, err := NewWriter(io.Discard, level) - if err != nil { - t.Errorf("level %d, NewWriter() = %v, want nil", level, err) - } - if n, err := zw.Write(b); n != len(b) || err != nil { - t.Errorf("level %d, Write() = (%d, %v), want (%d, nil)", level, n, err, len(b)) - } - if err := zw.Close(); err != nil { - t.Errorf("level %d, Close() = %v, want nil", level, err) - } - zw.Reset(io.Discard) - }(level) - } -} diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go index e5554d6fb40842..e132c55951b5ef 100644 --- a/src/compress/flate/deflatefast.go +++ b/src/compress/flate/deflatefast.go @@ -4,304 +4,170 @@ package flate -import "math" - -// This encoding algorithm, which prioritizes speed over output size, is -// based on Snappy's LZ77-style encoder: github.com/golang/snappy - -const ( - tableBits = 14 // Bits used in the table. - tableSize = 1 << tableBits // Size of the table. - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - - // Reset the buffer offset when reaching this. - // Offsets are stored between blocks as int32 values. - // Since the offset we are checking against is at the beginning - // of the buffer, we need to subtract the current and input - // buffer to not risk overflowing the int32. - bufferReset = math.MaxInt32 - maxStoreBlockSize*2 +import ( + "math/bits" ) -func load32(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() } -func load64(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } } -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} +const ( + tableBits = 15 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 17 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxStoreBlockSize * 5 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. +) -// These constants are defined by the Snappy implementation so that its -// assembly implementation can fast-path some 16-bytes-at-a-time copies. 
They -// aren't necessary in the pure Go implementation, as we don't use those same -// optimizations, but using the same thresholds doesn't really hurt. const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 ) type tableEntry struct { - val uint32 // Value at destination offset int32 } -// deflateFast maintains the table for matches, -// and the previous byte block for cross block matching. -type deflateFast struct { - table [tableSize]tableEntry - prev []byte // Previous block, zero length if unknown. - cur int32 // Current match offset. -} - -func newDeflateFast() *deflateFast { - return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)} +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastGen struct { + hist []byte + cur int32 } -// encode encodes a block given in src and appends tokens -// to dst and returns the result. -func (e *deflateFast) encode(dst []token, src []byte) []token { - // Ensure that e.cur doesn't wrap. - if e.cur >= bufferReset { - e.shiftOffsets() - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return emitLiteral(dst, src) - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load32(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load32(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. 
- dst = emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchLen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))) - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] } } + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s +} -emitRemainder: - if int(nextEmit) < len(src) { - dst = emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) - return dst +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry } -func emitLiteral(dst []token, lit []byte) []token { - for _, v := range lit { - dst = append(dst, literalToken(uint32(v))) - } - return dst +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) } -// matchLen returns the match length between src[s:] and src[t:]. -// t can be negative to indicate the match is starting in e.prev. -// We assume that src[s-4:s] and src[t-4:t] already match. -func (e *deflateFast) matchLen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. 
Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably, length and mls should be a constant for inlining. +func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) } +} - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } +// matchLenLimited will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchLenLimited(s, t int, src []byte) int32 { + a := src[s:min(s+maxMatchLength-4, len(src))] + b := src[t:] + return int32(matchLen(a, b)) +} - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int, src []byte) int32 { + return int32(matchLen(src[s:], src[t:])) +} - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < allocHistory { + e.hist = make([]byte, 0, allocHistory) } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } + // We offset current position so everything will be out of reach. + // If we are above the buffer reset it will be cleared anyway since len(hist) == 0. + if e.cur <= bufferReset { + e.cur += maxMatchOffset + int32(len(e.hist)) } + e.hist = e.hist[:0] +} - // If we reached our limit, we matched everything we are - // allowed to in the previous block and we return. - n := int32(len(b)) - if int(s+n) == s1 { - return n +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. +func matchLen(a, b []byte) (n int) { + left := len(a) + for left >= 8 { + diff := loadLE64(a, n) ^ loadLE64(b, n) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + left -= 8 } - // Continue looking for more matches in the current block. - a = src[s+n : s1] - b = src[:len(a)] + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset resets the encoding history. -// This ensures that no matches are made to the previous block. -func (e *deflateFast) reset() { - e.prev = e.prev[:0] - // Bump the offset, so all matches will fail distance check. - // Nothing should be >= e.cur in the table. - e.cur += maxMatchOffset - - // Protect against e.cur wraparound. - if e.cur >= bufferReset { - e.shiftOffsets() - } -} - -// shiftOffsets will shift down all match offset. -// This is only called in rare situations to prevent integer overflow. -// -// See https://golang.org/issue/18636 and https://github.com/golang/go/issues/34121. 
-func (e *deflateFast) shiftOffsets() { - if len(e.prev) == 0 { - // We have no history; just clear the table. - clear(e.table[:]) - e.cur = maxMatchOffset + 1 - return - } - - // Shift down everything in the table that isn't already too far away. - for i := range e.table[:] { - v := e.table[i].offset - e.cur + maxMatchOffset + 1 - if v < 0 { - // We want to reset e.cur to maxMatchOffset + 1, so we need to shift - // all table entries down by (e.cur - (maxMatchOffset + 1)). - // Because we ignore matches > maxMatchOffset, we can cap - // any negative offsets at 0. - v = 0 + break } - e.table[i].offset = v + n++ } - e.cur = maxMatchOffset + 1 + return n } diff --git a/src/compress/flate/dict_decoder.go b/src/compress/flate/dict_decoder.go index d2c19040f54f53..cb855abc4ba1d7 100644 --- a/src/compress/flate/dict_decoder.go +++ b/src/compress/flate/dict_decoder.go @@ -104,10 +104,7 @@ func (dd *dictDecoder) writeCopy(dist, length int) int { dstBase := dd.wrPos dstPos := dstBase srcPos := dstPos - dist - endPos := dstPos + length - if endPos > len(dd.hist) { - endPos = len(dd.hist) - } + endPos := min(dstPos+length, len(dd.hist)) // Copy non-overlapping section after destination position. // @@ -160,8 +157,10 @@ func (dd *dictDecoder) tryWriteCopy(dist, length int) int { srcPos := dstPos - dist // Copy possibly overlapping section before destination position. - for dstPos < endPos { - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) +loop: + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) + if dstPos < endPos { + goto loop // Avoid for-loop so that this function can be inlined } dd.wrPos = dstPos diff --git a/src/compress/flate/example_test.go b/src/compress/flate/example_test.go index 578009248f5704..3af5c1d95de1d1 100644 --- a/src/compress/flate/example_test.go +++ b/src/compress/flate/example_test.go @@ -93,7 +93,7 @@ func Example_dictionary() { var b bytes.Buffer // Compress the data using the specially crafted dictionary. - zw, err := flate.NewWriterDict(&b, flate.DefaultCompression, []byte(dict)) + zw, err := flate.NewWriterDict(&b, flate.BestCompression, []byte(dict)) if err != nil { log.Fatal(err) } @@ -168,6 +168,7 @@ func Example_synchronization() { wg.Add(1) go func() { defer wg.Done() + defer wp.Close() zw, err := flate.NewWriter(wp, flate.BestSpeed) if err != nil { diff --git a/src/compress/flate/fuzz_test.go b/src/compress/flate/fuzz_test.go new file mode 100644 index 00000000000000..1ea8cc49e54672 --- /dev/null +++ b/src/compress/flate/fuzz_test.go @@ -0,0 +1,111 @@ +package flate + +import ( + "bytes" + "flag" + "io" + "os" + "strconv" + "testing" +) + +// Fuzzing tweaks: +var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level") +var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)") +var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size") + +func TestMain(m *testing.M) { + flag.Parse() + os.Exit(m.Run()) +} + +// FuzzEncoding tests the fuzzer by doing roundtrips. +// Every input is run through the fuzzer at every level. +// Note: When running the fuzzer, it may hit the 10-second timeout on slower CPUs. 
+func FuzzEncoding(f *testing.F) { + startFuzz := *fuzzStartF + endFuzz := *fuzzEndF + maxSize := *fuzzMaxF + + decoder := NewReader(nil) + buf, buf2 := new(bytes.Buffer), new(bytes.Buffer) + encs := make([]*Writer, endFuzz-startFuzz+1) + for i := range encs { + var err error + encs[i], err = NewWriter(nil, i+startFuzz) + if err != nil { + f.Fatal(err.Error()) + } + } + + f.Fuzz(func(t *testing.T, data []byte) { + if len(data) > maxSize { + return + } + for level := startFuzz; level <= endFuzz; level++ { + if level == DefaultCompression { + continue // Already covered. + } + msg := "level " + strconv.Itoa(level) + ":" + buf.Reset() + fw := encs[level-startFuzz] + fw.Reset(buf) + n, err := fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed := buf.Bytes() + err = decoder.(Resetter).Reset(buf, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err := io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + + // Do it again... + msg = "level " + strconv.Itoa(level) + " (reset):" + buf2.Reset() + fw.Reset(buf2) + n, err = fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed2 := buf2.Bytes() + err = decoder.(Resetter).Reset(buf2, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err = io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + // Determinism checks will usually not be reproducible, + // since it often relies on the internal state of the compressor. + if !bytes.Equal(compressed, compressed2) { + t.Fatal(msg + "non-deterministic output") + } + } + }) +} diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index d68c77fb32e32a..f5e50925db8802 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -6,6 +6,7 @@ package flate import ( "io" + "math" ) const ( @@ -22,20 +23,22 @@ const ( codegenCodeCount = 19 badCode = 255 + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + // bufferFlushSize indicates the buffer size // after which bytes are flushed to the writer. // Should preferably be a multiple of 6, since // we accumulate 6 bytes between writes to the buffer. - bufferFlushSize = 240 - - // bufferSize is the actual output byte buffer size. - // It must have additional headroom for a flush - // which can contain up to 8 bytes. - bufferSize = bufferFlushSize + 8 + bufferFlushSize = 246 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = []int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -43,26 +46,47 @@ var lengthExtraBits = []int8{ } // The length indicated by length code X - LENGTH_CODES_START. 
-var lengthBase = []uint32{ +var lengthBase = [32]uint8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. -var offsetExtraBits = []int8{ +var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, + /* extended window */ + 14, 14, } -var offsetBase = []uint32{ - 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, - 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, - 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, - 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, - 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, - 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, +var offsetCombined = [32]uint32{} + +func init() { + var offsetBase = [32]uint32{ + /* normal deflate */ + 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, + 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, + 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, + 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, + 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, + 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, + + /* extended window */ + 0x008000, 0x00c000, + } + + for i := range offsetCombined[:] { + // Don't use extended window values... + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { + continue + } + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) + } } // The odd order in which the codegen code sizes are written. @@ -75,29 +99,49 @@ type huffmanBitWriter struct { writer io.Writer // Data waiting to be written is bytes[0:nbytes] - // and then the low nbits of bits. Data is always written - // sequentially into the bytes array. - bits uint64 - nbits uint - bytes [bufferSize]byte - codegenFreq [codegenCodeCount]int32 - nbytes int - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 - literalEncoding *huffmanEncoder - offsetEncoding *huffmanEncoder - codegenEncoding *huffmanEncoder - err error + // and then the low nbits of bits. + bits uint64 + nbits uint8 + nbytes uint8 + lastHuffMan bool + literalEncoding *huffmanEncoder + tmpLitEncoding *huffmanEncoder + offsetEncoding *huffmanEncoder + codegenEncoding *huffmanEncoder + err error + lastHeader int + logNewTablePenalty uint // Bigger values will reduce the penalty of a new table. + bytes [256 + 8]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 } +// The huffmanBitWriter supports reusing huffman tables and will combine +// blocks, if compression is less than creating a new table. +// +// This is controlled by several variables: +// +// If 'lastHeader' is non-zero the Huffman table can be reused. +// It also indicates that an EOB has not yet been emitted, so if a new table +// is generated, an EOB with the previous table must be written. +// +// If 'lastHuffMan' is set, a table for outputting literals +// has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a +// 'fresh' by calculating the optimal size and adding a penalty. +// A Huffman table is not optimal, which is why we add a penalty, +// and generating a new table is slower for both compression and decompression. 
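// Illustrative sketch, not part of the patch: the new-table-versus-reuse
// decision described in the comment above, with plain parameters standing in
// for w.lastHeader, tokens.EstimatedBits(), the owed EOB code length, and
// dynamicReuseSize()+extraBitSize(). The shift by logNewTablePenalty is the
// penalty the comment refers to; all names here are assumptions.
func preferNewTable(lastHeaderBits, estimatedBits, eobBits, reuseBits int, logNewTablePenalty uint) bool {
	// Optimistic cost of a fresh block: estimated optimal token cost plus the
	// previous header size as a stand-in for the new header.
	newSize := lastHeaderBits + estimatedBits
	// Add the EOB we owe for the open block and a penalty, since a real
	// Huffman table is rarely optimal and a new header costs time to emit
	// and to decode.
	newSize += eobBits + newSize>>logNewTablePenalty
	// Reuse wins unless the fresh block is estimated to be strictly smaller.
	return newSize < reuseBits
}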
+ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, - literalFreq: make([]int32, maxNumLit), - offsetFreq: make([]int32, offsetCodeCount), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), + literalEncoding: newHuffmanEncoder(literalCount), + tmpLitEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -106,6 +150,37 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { func (w *huffmanBitWriter) reset(writer io.Writer) { w.writer = writer w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { + a := t.offHist[:offsetCodeCount] + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalEncoding.codes[256:literalCount] + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + return true } func (w *huffmanBitWriter) flush() { @@ -113,6 +188,11 @@ func (w *huffmanBitWriter) flush() { w.nbits = 0 return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } n := w.nbytes for w.nbits != 0 { w.bytes[n] = byte(w.bits) @@ -125,7 +205,9 @@ func (w *huffmanBitWriter) flush() { n++ } w.bits = 0 - w.write(w.bytes[:n]) + if n > 0 { + w.write(w.bytes[:n]) + } w.nbytes = 0 } @@ -136,30 +218,11 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - if w.err != nil { - return - } - w.bits |= uint64(b) << w.nbits +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n + w.writeOutBits() } } @@ -198,21 +261,23 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // numOffsets The number of offsets in offsetEncoding // litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { - clear(w.codegenFreq[:]) + for i := range w.codegenFreq { + w.codegenFreq[i] = 0 + } // Note that we are using codegen both as a temporary variable for holding // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. - codegen := w.codegen // cache + codegen := w.codegen[:] // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. 
cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -234,10 +299,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE w.codegenFreq[size]++ count-- for count >= 3 { - n := 6 - if n > count { - n = count - } + n := min(6, count) codegen[outIndex] = 16 outIndex++ codegen[outIndex] = uint8(n - 3) @@ -247,10 +309,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE } } else { for count >= 11 { - n := 138 - if n > count { - n = count - } + n := min(138, count) codegen[outIndex] = 18 outIndex++ codegen[outIndex] = uint8(n - 11) @@ -282,30 +341,61 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + return 3 + 5 + 5 + 4 + (3 * numCodegens) + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int(w.codegenFreq[16])*2 + int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) { + size = litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + return size +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + extraBits - return size, numCodegens } +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + // fixedSize returns the size of dynamically encoded data in bits. 
func (w *huffmanBitWriter) fixedSize(extraBits int) int { return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + extraBits } @@ -323,30 +413,35 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { } func (w *huffmanBitWriter) writeCode(c hcode) { - if w.err != nil { - return - } - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) + // The function does not get inlined if we "& 63" the shift. + w.bits |= c.code64() << (w.nbits & reg8SizeMask64) + w.nbits += c.len() if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + + // We overwrite, but faster... + storeLE64(w.bytes[n:], bits) + n += 6 + + if n >= bufferFlushSize { + if w.err != nil { n = 0 + return } - w.nbytes = n + w.write(w.bytes[:n]) + n = 0 } + + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. @@ -367,19 +462,19 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numOffsets-1), 5) w.writeBits(int32(numCodegens-4), 4) - for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + for i := range numCodegens { + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } i := 0 for { - var codeWord int = int(w.codegen[i]) + var codeWord = uint32(w.codegen[i]) i++ if codeWord == badCode { break } - w.writeCode(w.codegenEncoding.codes[uint32(codeWord)]) + w.writeCode(w.codegenEncoding.codes[codeWord]) switch codeWord { case 16: @@ -395,10 +490,28 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n } } +// writeStoredHeader will write a stored header. +// If the stored block is only used for EOF, +// it is replaced with a fixed huffman block. func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + // To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes. + if length == 0 && isEof { + w.writeFixedHeader(isEof) + // EOB: 7 bits, value: 0 + w.writeBits(0, 7) + w.flush() + return + } + var flag int32 if isEof { flag = 1 @@ -413,6 +526,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + // Indicate that we are a fixed Huffman block var value int32 = 2 if isEof { @@ -426,36 +545,33 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { // is larger than the original bytes, the data will be written as a // stored block. // If the input is nil, the tokens will always be Huffman encoded. 
-func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } numLiterals, numOffsets := w.indexTokens(tokens) - + w.generate() var extraBits int storedSize, storable := w.storedSize(input) if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) - } + extraBits = w.extraBitSize() } // Figure out smallest code. // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -473,7 +589,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Stored bytes? - if storable && storedSize < size { + if storable && storedSize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -487,7 +603,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) } // writeBlockDynamic encodes a block using a dynamic Huffman table. @@ -495,53 +611,153 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + sync = sync || eof + if sync { + tokens.AddEOB() + } + + // We cannot reuse pure huffman table, and must mark as EOF. + if (w.lastHuffMan || eof) && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + + if w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens) + extraBits := 0 + ssize, storable := w.storedSize(input) - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. 
- w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + var size int + + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table. + // Use the previous header size as the best estimate. + newSize := w.lastHeader + tokens.EstimatedBits() + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty + + // Calculate the size for reusing the current table. + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits + + // Check if a new table is better. + if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + + // Small blocks can be more efficient with fixed encoding. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } } - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + // We want a new block/table + if w.lastHeader == 0 { + w.literalFreq[endBlockMarker] = 1 + + w.generate() + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + + var numCodegens int + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined or raw, if we don't get a reasonable improvement. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + if !sync { + w.lastHeader, _ = w.headerSize() + } + w.lastHuffMan = false + } + if sync { + w.lastHeader = 0 + } // Write the tokens. 
- w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - clear(w.literalFreq) - clear(w.offsetFreq) +func (w *huffmanBitWriter) indexTokens(t *tokens) (numLiterals, numOffsets int) { + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist - for _, t := range tokens { - if t < matchType { - w.literalFreq[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - w.literalFreq[lengthCodesStart+lengthCode(length)]++ - w.offsetFreq[offsetCode(offset)]++ + if t.n == 0 { + return } - // get the number of literals numLiterals = len(w.literalFreq) for w.literalFreq[numLiterals-1] == 0 { @@ -558,41 +774,153 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets w.offsetFreq[0] = 1 numOffsets = 1 } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) return } +func (w *huffmanBitWriter) generate() { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { if w.err != nil { return } + if len(tokens) == 0 { + return + } + + // Only last token should be endBlockMarker. + var deferEOB bool + if tokens[len(tokens)-1] == endBlockMarker { + tokens = tokens[:len(tokens)-1] + deferEOB = true + } + + // Create slices up to the next power of two to avoid bounds checks. + lits := leCodes[:256] + offs := oeCodes[:32] + lengths := leCodes[lengthCodesStart:] + lengths = lengths[:32] + + // Go 1.16 LOVES having these on stack. 
+ bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + for _, t := range tokens { - if t < matchType { - w.writeCode(leCodes[t.literal()]) + if t < 256 { + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } continue } + // Write the length length := t.length() - lengthCode := lengthCode(length) - w.writeCode(leCodes[lengthCode+lengthCodesStart]) - extraLengthBits := uint(lengthExtraBits[lengthCode]) - if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode]) - w.writeBits(extraLength, extraLengthBits) + lenCode := lengthCode(length) & 31 + // inlined 'w.writeCode(lengths[lengthCode])' + c := lengths[lenCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if lenCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lenCode] + //w.writeBits(extraLength, extraLengthBits) + extraLength := int32(length - lengthBase[lenCode]) + bits |= uint64(extraLength) << (nbits & 63) + nbits += extraLengthBits + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } // Write the offset offset := t.offset() - offsetCode := offsetCode(offset) - w.writeCode(oeCodes[offsetCode]) - extraOffsetBits := uint(offsetExtraBits[offsetCode]) - if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode]) - w.writeBits(extraOffset, extraOffsetBits) + offCode := (offset >> 16) & 31 + // inlined 'w.writeCode(offs[offCode])' + c = offs[offCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if offCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offCode] + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if deferEOB { + w.writeCode(leCodes[endBlockMarker]) + } } // huffOffset is a static offset encoder used for huffman only encoding. 
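// Illustrative sketch, not part of the patch: the 48-bit accumulator pattern
// that the inlined loops above repeat. Each Huffman code (at most 15 bits) or
// run of extra bits (at most 13) is OR'ed into a 64-bit register while fewer
// than 48 bits are pending, so the OR can never overflow; once 48 bits are
// reached, six whole bytes are flushed. Names and the append-based output are
// assumptions; the real code stores into w.bytes with storeLE64 and tracks
// nbytes instead of growing a slice.
func appendBits(dst []byte, bits uint64, nbits uint8, code uint64, codeLen uint8) ([]byte, uint64, uint8) {
	bits |= code << (nbits & 63) // nbits < 48 and codeLen <= 16, so this fits
	nbits += codeLen
	if nbits >= 48 {
		// Flush the low 48 bits as six little-endian bytes.
		dst = append(dst,
			byte(bits), byte(bits>>8), byte(bits>>16),
			byte(bits>>24), byte(bits>>32), byte(bits>>40))
		bits >>= 48
		nbits -= 48
	}
	return dst, bits, nbits
}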
@@ -600,94 +928,168 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) var huffOffset *huffmanEncoder func init() { - offsetFreq := make([]int32, offsetCodeCount) - offsetFreq[0] = 1 + w := newHuffmanBitWriter(nil) + w.offsetFreq[0] = 1 huffOffset = newHuffmanEncoder(offsetCodeCount) - huffOffset.generate(offsetFreq, 15) + huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15) } // writeBlockHuff encodes a block of bytes as either // Huffman encoded literals or uncompressed bytes if the // results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if w.err != nil { return } // Clear histogram - clear(w.literalFreq) - - // Add everything as literals - histogram(input, w.literalFreq) - - w.literalFreq[endBlockMarker] = 1 + for i := range w.literalFreq[:] { + w.literalFreq[i] = 0 + } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } const numLiterals = endBlockMarker + 1 - w.offsetFreq[0] = 1 const numOffsets = 1 - w.literalEncoding.generate(w.literalFreq, 15) - - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + // Add everything as literals + // We have to estimate the header size. + // Assume header is around 70 bytes: + // https://stackoverflow.com/a/25454430 + const guessHeaderSizeBits = 70 * 8 + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) + estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty + } // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if storable && ssize <= estBits { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } - // Huffman. 
- w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - encoding := w.literalEncoding.codes[:257] - n := w.nbytes - for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue + if w.lastHeader > 0 { + reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256]) + + if estBits < reuseSize { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue + } + + if w.lastHeader == 0 { + // Use the temp encoding, so swap. + w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() + + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() + } + + encoding := w.literalEncoding.codes[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. + bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + storeLE64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += n } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } - n = 0 + a, b := encoding[input[0]], encoding[input[1]] + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) + c := encoding[input[2]] + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() + input = input[3:] } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) -} -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ + // Remaining... + for _, t := range input { + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + // Flush if needed to have space. 
+ if w.nbits >= 48 { + w.writeOutBits() + } + + if eof || sync { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false } } diff --git a/src/compress/flate/huffman_bit_writer_test.go b/src/compress/flate/huffman_bit_writer_test.go index a57799cae02685..dfb93e326c0871 100644 --- a/src/compress/flate/huffman_bit_writer_test.go +++ b/src/compress/flate/huffman_bit_writer_test.go @@ -32,7 +32,9 @@ func TestBlockHuff(t *testing.T) { if strings.HasSuffix(in, ".in") { out = in[:len(in)-len(".in")] + ".golden" } - testBlockHuff(t, in, out) + t.Run(in, func(t *testing.T) { + testBlockHuff(t, in, out) + }) } } @@ -44,7 +46,8 @@ func testBlockHuff(t *testing.T, in, out string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - bw.writeBlockHuff(false, all) + bw.logNewTablePenalty = 8 + bw.writeBlockHuff(false, all, false) bw.flush() got := buf.Bytes() @@ -79,7 +82,7 @@ func testBlockHuff(t *testing.T, in, out string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - bw.writeBlockHuff(false, all) + bw.writeBlockHuff(false, all, false) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { @@ -175,13 +178,23 @@ func TestWriteBlockDynamic(t *testing.T) { } } +// TestWriteBlockDynamic tests if the writeBlockDynamic encoding has changed. +// To update the reference files use the "-update" flag on the test. +func TestWriteBlockDynamicSync(t *testing.T) { + for _, test := range writeBlockTests { + testBlock(t, test, "sync") + } +} + // testBlock tests a block against its references, // or regenerate the references, if "-update" flag is set. func testBlock(t *testing.T, test huffTest, ttype string) { if test.want != "" { test.want = fmt.Sprintf(test.want, ttype) } + const gotSuffix = ".got" test.wantNoInput = fmt.Sprintf(test.wantNoInput, ttype) + tokens := indexTokens(test.tokens) if *update { if test.input != "" { t.Logf("Updating %q", test.want) @@ -198,7 +211,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) } t.Logf("Updating %q", test.wantNoInput) @@ -209,7 +222,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) return } @@ -227,12 +240,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) got := buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+gotSuffix) + if err := os.WriteFile(test.want+gotSuffix, got, 0666); err != nil { t.Error(err) } } @@ -241,12 +254,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("reset: writeBlock did not yield expected result for file %q with input. 
See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".reset"+gotSuffix) + if err := os.WriteFile(test.want+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -262,12 +275,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) got := buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+gotSuffix) + if err := os.WriteFile(test.wantNoInput+gotSuffix, got, 0666); err != nil { t.Error(err) } } else if got[0]&1 == 1 { @@ -280,12 +293,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) bw.flush() got = buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.wantNoInput, test.wantNoInput+".reset"+gotSuffix) + if err := os.WriteFile(test.wantNoInput+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -294,12 +307,14 @@ func testBlock(t *testing.T, test huffTest, ttype string) { testWriterEOF(t, "wb", test, false) } -func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok []token, input []byte) { +func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok tokens, input []byte) { switch ttype { case "wb": - bw.writeBlock(tok, false, input) + bw.writeBlock(&tok, false, input) case "dyn": - bw.writeBlockDynamic(tok, false, input) + bw.writeBlockDynamic(&tok, false, input, false) + case "sync": + bw.writeBlockDynamic(&tok, false, input, true) default: panic("unknown test type") } @@ -332,13 +347,14 @@ func testWriterEOF(t *testing.T, ttype string, test huffTest, useInput bool) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) + tokens := indexTokens(test.tokens) switch ttype { case "wb": - bw.writeBlock(test.tokens, true, input) + bw.writeBlock(&tokens, true, input) case "dyn": - bw.writeBlockDynamic(test.tokens, true, input) + bw.writeBlockDynamic(&tokens, true, input, true) case "huff": - bw.writeBlockHuff(true, input) + bw.writeBlockHuff(true, input, true) default: panic("unknown test type") } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index 6f69cabfd060d4..f3e202430736d3 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -7,25 +7,42 @@ package flate import ( "math" "math/bits" - "sort" +) + +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 ) // hcode is a huffman code with a bit code and bit length. 
-type hcode struct { - code, len uint16 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { - codes []hcode - freqcache []literalNode - bitCount [17]int32 - lns byLiteral // stored to avoid repeated allocation in generate - lfs byFreq // stored to avoid repeated allocation in generate + codes []hcode + bitCount [17]int32 + + // Allocate a reusable buffer with the longest possible frequency table. + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. + freqcache [literalCount + 1]literalNode } type literalNode struct { literal uint16 - freq int32 + freq uint16 } // A levelInfo describes the state of the constructed tree for a given depth. @@ -49,25 +66,34 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) +} + +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) } -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } func newHuffmanEncoder(size int) *huffmanEncoder { - return &huffmanEncoder{codes: make([]hcode, size)} + // Make capacity to next power of two. + c := uint(bits.Len32(uint32(size - 1))) + return &huffmanEncoder{codes: make([]hcode, size, 1<= 3. +// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused. +func (h *huffmanEncoder) canReuseBits(freq []uint16) int { + var total int + for i, f := range freq { + if f != 0 { + code := h.codes[i] + if code.zero() { + return math.MaxInt32 + } + total += int(f) * int(code.len()) + } + } + return total +} + +// Return the number of literals assigned to each bit size in the Huffman encoding +// +// This method is only called when list.length >= 3 // The cases of 0, 1, and 2 literals are handled by special case code. // -// list is an array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency and has as its last element a special element with frequency -// MaxInt32. +// list An array of the literals with non-zero frequencies +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// +// maxBits The maximum number of bits that should be used to encode any literal. +// +// Must be less than 16. // -// maxBits is the maximum number of bits that should be used to encode any literal. -// It must be less than 16. +// return An integer array in which array[i] indicates the number of literals // -// bitCounts returns an integer slice in which slice[i] indicates the number of literals -// that should be encoded in i bits. +// that should be encoded in i bits. 
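canReuseBits above is what makes continuing a block with the previous table possible: it prices the pending frequencies against the codes already in effect and reports "impossible" when a needed symbol has no code. Below is a hedged sketch of that calculation on plain slices; the real method works directly on the encoder's own code table.

// estimateReuseBits returns the exact output size, in bits, of encoding the
// given frequencies with an existing set of code lengths, or ok=false when a
// symbol with a nonzero frequency has no code in the previous table.
func estimateReuseBits(freq []uint16, codeLen []uint8) (bits int, ok bool) {
	for i, f := range freq {
		if f == 0 {
			continue
		}
		if codeLen[i] == 0 {
			return 0, false // previous table cannot represent this symbol
		}
		bits += int(f) * int(codeLen[i])
	}
	return bits, true
}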
func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") @@ -154,14 +205,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -172,11 +228,11 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { - // We've run out of both leaves and pairs. + // We've run out of both leafs and pairs. // End all calculations for this level. // To make sure we never come back to this level or any lower level, // set nextPairFreq impossibly large. @@ -193,14 +249,21 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { l.lastFreq = l.nextCharFreq // Lower leafCounts are the same of the previous node. leafCounts[level][level] = n - l.nextCharFreq = list[n].freq + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } } else { // The next item on this row is a pair from the previous row. // nextPairFreq isn't valid until we generate two // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save levels[l.level-1].needed = 2 } @@ -256,9 +319,9 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // assigned in literal order (not frequency order). chunk := list[len(list)-int(bits):] - h.lns.sort(chunk) + sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -268,15 +331,10 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // Update this Huffman Code object to be the minimum code for the specified frequency count. // // freq is an array of frequencies, in which freq[i] gives the frequency of literal i. -// maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { - if h.freqcache == nil { - // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. 
- h.freqcache = make([]literalNode, maxNumLit+1) - } +// maxBits is the maximum number of bits to use for any literal. +func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -285,9 +343,10 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - h.codes[i].len = 0 + codes[i] = 0 } } + list[count] = literalNode{} list = list[:count] if count <= 2 { @@ -299,7 +358,7 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { } return } - h.lfs.sort(list) + sortByFreq(list) // Get the number of literals for each bit count bitCount := h.bitCounts(list, maxBits) @@ -307,39 +366,43 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { h.assignEncodingAndSize(bitCount, list) } -type byLiteral []literalNode - -func (s *byLiteral) sort(a []literalNode) { - *s = byLiteral(a) - sort.Sort(s) +// atLeastOne clamps the result between 1 and 15. +func atLeastOne(v float32) float32 { + return min(15, max(1, v)) } -func (s byLiteral) Len() int { return len(s) } - -func (s byLiteral) Less(i, j int) bool { - return s[i].literal < s[j].literal -} - -func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -type byFreq []literalNode - -func (s *byFreq) sort(a []literalNode) { - *s = byFreq(a) - sort.Sort(s) -} - -func (s byFreq) Len() int { return len(s) } - -func (s byFreq) Less(i, j int) bool { - if s[i].freq == s[j].freq { - return s[i].literal < s[j].literal +func histogram(b []byte, h []uint16) { + if len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ + } } - return s[i].freq < s[j].freq } -func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -func reverseBits(number uint16, bitLength byte) uint16 { - return bits.Reverse16(number << (16 - bitLength)) +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. + h = h[:256] + // Make size divisible by 4 + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] + } + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ + } } diff --git a/src/compress/flate/huffman_sortByFreq.go b/src/compress/flate/huffman_sortByFreq.go new file mode 100644 index 00000000000000..6c05ba8c1c2e2a --- /dev/null +++ b/src/compress/flate/huffman_sortByFreq.go @@ -0,0 +1,159 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. +func sortByFreq(data []literalNode) { + n := len(data) + quickSortByFreq(data, 0, n, maxDepth(n)) +} + +func quickSortByFreq(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivotByFreq(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). 
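sortByFreq above (and the sortByLiteral variant added in the next file) is a copy of the standard quicksort specialized for []literalNode, presumably to keep the comparison inline on this hot path. The ordering it implements is ascending frequency with ties broken by ascending literal, i.e. it is functionally equivalent to the reference below, which is easier to read than the inlined composite comparisons (reference only, requires the standard library slices package, Go 1.21+).

// sortByFreqReference shows the ordering encoded by the specialized sort:
// ascending frequency, ties broken by ascending literal value.
func sortByFreqReference(data []literalNode) {
	slices.SortFunc(data, func(a, b literalNode) int {
		if a.freq != b.freq {
			return int(a.freq) - int(b.freq)
		}
		return int(a.literal) - int(b.literal)
	})
}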
+ if mlo-a < b-mhi { + quickSortByFreq(data, a, mlo, maxDepth) + a = mhi // i.e., quickSortByFreq(data, mhi, b) + } else { + quickSortByFreq(data, mhi, b, maxDepth) + b = mlo // i.e., quickSortByFreq(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSortByFreq(data, a, b) + } +} + +func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s) + medianOfThreeSortByFreq(data, m, m-s, m+s) + medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThreeSortByFreq(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { + } + b := a + for { + for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot + } + for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot + } + for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSortByFreq(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// quickSortByFreq, loosely following Bentley and McIlroy, +// ``Engineering a Sort Function,'' SP&E November 1993. + +// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/huffman_sortByLiteral.go b/src/compress/flate/huffman_sortByLiteral.go new file mode 100644 index 00000000000000..93f1aea109e123 --- /dev/null +++ b/src/compress/flate/huffman_sortByLiteral.go @@ -0,0 +1,201 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. 
+func sortByLiteral(data []literalNode) { + n := len(data) + quickSort(data, 0, n, maxDepth(n)) +} + +func quickSort(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivot(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). + if mlo-a < b-mhi { + quickSort(data, a, mlo, maxDepth) + a = mhi // i.e., quickSort(data, mhi, b) + } else { + quickSort(data, mhi, b, maxDepth) + b = mlo // i.e., quickSort(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].literal < data[i-6].literal { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSort(data, a, b) + } +} +func heapSort(data []literalNode, a, b int) { + first := a + lo := 0 + hi := b - a + + // Build heap with greatest element at top. + for i := (hi - 1) / 2; i >= 0; i-- { + siftDown(data, i, hi, first) + } + + // Pop elements, largest first, into end of data. + for i := hi - 1; i >= 0; i-- { + data[first], data[first+i] = data[first+i], data[first] + siftDown(data, lo, i, first) + } +} + +// siftDown implements the heap property on data[lo, hi). +// first is an offset into the array where the root of the heap lies. +func siftDown(data []literalNode, lo, hi, first int) { + root := lo + for { + child := 2*root + 1 + if child >= hi { + break + } + if child+1 < hi && data[first+child].literal < data[first+child+1].literal { + child++ + } + if data[first+root].literal > data[first+child].literal { + return + } + data[first+root], data[first+child] = data[first+child], data[first+root] + root = child + } +} +func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThree(data, lo, lo+s, lo+2*s) + medianOfThree(data, m, m-s, m+s) + medianOfThree(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThree(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && data[a].literal < data[pivot].literal; a++ { + } + b := a + for { + for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot + } + for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].literal > data[pivot].literal { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot + } + for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSort(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && data[j].literal < data[j-1].literal; j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// maxDepth returns a threshold at which quicksort should switch +// to heapsort. It returns 2*ceil(lg(n+1)). +func maxDepth(n int) int { + var depth int + for i := n; i > 0; i >>= 1 { + depth++ + } + return depth * 2 +} + +// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThree(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].literal < data[m1].literal { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/level1.go b/src/compress/flate/level1.go new file mode 100644 index 00000000000000..2195df4fa38f93 --- /dev/null +++ b/src/compress/flate/level1.go @@ -0,0 +1,197 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 1 uses a single small table with 5 byte hashes. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. 
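The wraparound guard at the top of Encode above (and repeated in the other level encoders) keeps stored table offsets valid when e.cur is rebased back to maxMatchOffset: any entry that could no longer produce a match within maxMatchOffset is cleared, everything else is shifted by the same amount as e.cur. The per-entry transformation, factored out as a standalone helper for illustration:

// rebaseOffset mirrors the shift-down loop above: v is a stored absolute
// offset, cur the encoder's current base, histLen the current history length.
// Entries that are out of reach are zeroed (treated as empty); the rest are
// rewritten so they remain correct once cur is reset to maxMatchOffset.
func rebaseOffset(v, cur, histLen int32) int32 {
	minOff := cur + histLen - maxMatchOffset
	if v <= minOff {
		return 0
	}
	return v - cur + maxMatchOffset
}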
+ dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + var t int32 + for { + nextHash := hashLen(cv, tableBits, hashBytes) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, tableBits, hashBytes) + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + cv = now + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && loadLE8(src, t-1) == loadLE8(src, s-1) { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Save the match found. Same as 'dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))' + xOffset := uint32(s - t - baseMatchOffset) + xLength := l + oc := offsetCode(xOffset) + xOffset |= oc << 16 + for xLength > 0 { + xl := xLength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xLength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. 
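The search loop above advances by nextS = s + doEvery + (s-nextEmit)>>skipLog, so the probe stride grows with the number of bytes since the last match: dense matches keep the stride at doEvery, while long unmatched runs are skipped through increasingly quickly. A toy simulation of the stride growth with level 1's constants (illustration only; the real loop also does a second probe per iteration):

// demoSkip advances a cursor with the same formula as the level 1 search
// loop, assuming every probe fails, and reports how far it has travelled.
func demoSkip(probes int) int32 {
	const skipLog = 5
	const doEvery = 2
	var s, nextEmit int32 // nextEmit stays 0: nothing has matched
	for i := 0; i < probes; i++ {
		s += doEvery + (s-nextEmit)>>skipLog
	}
	return s // grows roughly geometrically once s-nextEmit exceeds 32
}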
+ x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, tableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashLen(x, tableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + t = candidate.offset - e.cur + if s-t > maxMatchOffset || uint32(x) != loadLE32(src, t) { + cv = x >> 8 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level2.go b/src/compress/flate/level2.go new file mode 100644 index 00000000000000..7a2fdf7abe6ddb --- /dev/null +++ b/src/compress/flate/level2.go @@ -0,0 +1,187 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 2 uses a similar algorithm to level 1, but with a larger table. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, bTableBits, hashBytes) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, bTableBits, hashBytes) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + cv = now + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes match. + for { + // Extend the 4-byte match as long as possible. 
+ t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := loadLE64(src, i) + nextHash := hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. + x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + e.table[prevHash2] = tableEntry{offset: o + 1} + currHash := hashLen(x>>16, bTableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != loadLE32(src, candidate.offset-e.cur) { + cv = x >> 24 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level3.go b/src/compress/flate/level3.go new file mode 100644 index 00000000000000..adda8714879c8d --- /dev/null +++ b/src/compress/flate/level3.go @@ -0,0 +1,226 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 3 uses a similar algorithm to level 2, with a smaller table, +// but will check up two candidates for each iteration with more +// entries added to the table. +type fastEncL3 struct { + fastGen + table [1 << 16]tableEntryPrev +} + +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
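The "store every second hash in-between, but offset by 1" loop above indexes three positions per 7-byte stride of the match that was just emitted (i, i+2 and i+4, starting at s-l+2 and stopping before s-5), so long matches still seed the table without hashing every byte. A standalone helper that lists exactly which positions get indexed, for illustration:

// indexedPositions returns the positions inside a just-emitted match covering
// src[s-l:s] that the level 2 encoder above adds to its hash table.
func indexedPositions(s, l int32) []int32 {
	var out []int32
	for i := s - l + 2; i < s-5; i += 7 {
		out = append(out, i, i+2, i+4)
	}
	return out
}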
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. + cv := loadLE64(src, s) + for { + const skipLog = 7 + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, tableBits, hashBytes) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := loadLE64(src, nextS) + + // Safe offset distance until s + 4... + minOffset := e.cur + s - (maxMatchOffset - 4) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}} + + // Check both candidates + candidate = candidates.Cur + if candidate.offset < minOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != loadLE32(src, candidates.Prev.offset-e.cur) { + break + } + // Both match and are valid, pick longest. + offset := s - (candidate.offset - e.cur) + o2 := s - (candidates.Prev.offset - e.cur) + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + } + cv = now + } + + for { + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + // Emit literals. + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Emit match. + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+8) < len(src) && t > 0 { + cv = loadLE64(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t}, + } + } + goto emitRemainder + } + + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(loadLE64(src, i), tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} + } + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. 
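Level 3 (and levels 5 and 6 for their long tables) keeps two candidates per hash bucket: storing a new position moves the bucket's current entry into the Prev slot, so each bucket acts as a tiny two-entry FIFO and both entries are checked on lookup. The update pattern, factored out for illustration:

// pushCandidate shows the two-slot bucket update used above: the newest
// position becomes Cur and the old Cur is kept as a second candidate in Prev.
func pushCandidate(bucket *tableEntryPrev, offset int32) {
	bucket.Prev = bucket.Cur
	bucket.Cur = tableEntry{offset: offset}
}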
+ x := loadLE64(src, s-2) + prevHash := hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2}, + } + x >>= 8 + prevHash = hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1}, + } + x >>= 8 + currHash := hashLen(x, tableBits, hashBytes) + candidates := e.table[currHash] + cv = x + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur}, + } + + // Check both candidates + candidate = candidates.Cur + minOffset := e.cur + s - (maxMatchOffset - 4) + + if candidate.offset > minOffset { + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Found a match... + continue + } + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Match at prev... + continue + } + } + cv = x >> 8 + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level4.go b/src/compress/flate/level4.go new file mode 100644 index 00000000000000..f62168b64ed9e3 --- /dev/null +++ b/src/compress/flate/level4.go @@ -0,0 +1,204 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 4 uses two tables, one for short (4 bytes) and one for long (7 bytes) matches. +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. 
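Level 4 introduces the two-table scheme: every position is entered under both a 4-byte hash (table) and a 7-byte hash (bTable), and on lookup the long-hash candidate is tried first because a hit there likely yields a longer match, with the short-hash candidate as the fallback. A sketch of the double probe; hashLen and hash7 are the patch's hash helpers, defined outside this excerpt and used here with the same signatures as above.

// probeBoth returns the long-hash and short-hash candidates for the eight
// bytes cv loaded at the current position, mirroring the lookup in Encode.
func probeBoth(e *fastEncL4, cv uint64) (long, short tableEntry) {
	const hashShortBytes = 4
	long = e.bTable[hash7(cv, tableBits)]
	short = e.table[hashLen(cv, tableBits, hashShortBytes)]
	return long, short
}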
+ cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + lCandidate = e.bTable[hash7(next, tableBits)] + + // If the next long is a candidate, check if we should use that instead... + lOff := lCandidate.offset - e.cur + if nextS-lOff < maxMatchOffset && loadLE32(src, lOff) == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} + e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + i := nextS + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + e.bTable[prevHashL] = tableEntry{offset: o} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level5.go b/src/compress/flate/level5.go new file mode 100644 index 00000000000000..5ef342eae0e8a2 --- /dev/null +++ b/src/compress/flate/level5.go @@ -0,0 +1,291 @@ +// Copyright 2025 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 5 is similar to level 4, but for long matches two candidates are tested. +// Once a match is found, when it stops it will attempt to find a match that extends further. +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. 
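matchLen, matchlenLong and matchLenLimited are used throughout the new encoders but are defined outside this excerpt; conceptually they count how many leading bytes two positions have in common (the Limited variant presumably capping the result, e.g. at maxMatchLength). A plain reference version follows; the real helpers are presumably optimized to compare eight bytes at a time.

// matchLenRef returns the number of leading bytes that are equal in a and b.
// Reference only; optimized versions typically compare 8 bytes per step and
// use bits.TrailingZeros64 on the XOR of the mismatching words.
func matchLenRef(a, b []byte) int {
	n := 0
	for n < len(a) && n < len(b) && a[n] == b[n] {
		n++
	}
	return n
}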
+ sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + if l == 0 { + // Extend the 4-byte match as long as possible. + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end of best match... + if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 2 + eLong := e.bTable[hash7(loadLE64(src, sAt), tableBits)].Cur.offset + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + eLong = &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level6.go b/src/compress/flate/level6.go new file mode 100644 index 00000000000000..851a7155853eec --- /dev/null +++ b/src/compress/flate/level6.go @@ -0,0 +1,301 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 6 extends level 5, but does "repeat offset" check, +// as well as adding more hash entries to the tables. +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... 
+ l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if loadLE32(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchLenLimited(int(s+4+repOff), int(t2+4), src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end-of-match... + if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + eLong := &e.bTable[hash7(loadLE64(src, sAt), tableBits)] + // Test current + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if off < maxMatchOffset { + if off > 0 && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + // Test previous entry: + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 + if off > 0 && off < maxMatchOffset && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := loadLE64(src, i) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. 
+ for i := nextS + 1; i < s-1; i += 2 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + cv = loadLE64(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/regmask_amd64.go b/src/compress/flate/regmask_amd64.go new file mode 100644 index 00000000000000..cd1469a909173d --- /dev/null +++ b/src/compress/flate/regmask_amd64.go @@ -0,0 +1,14 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. + reg8SizeMask64 = 63 +) diff --git a/src/compress/flate/regmask_other.go b/src/compress/flate/regmask_other.go new file mode 100644 index 00000000000000..e25fc87af1b0d2 --- /dev/null +++ b/src/compress/flate/regmask_other.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 +// +build !amd64 + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // On other platforms the mask is ineffective so the AND can be removed by the compiler. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. 
+ reg8SizeMask64 = 0xff +) diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect b/src/compress/flate/testdata/huffman-null-max.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..c08165143f2c570013c4916cbac5addfe9622a55 GIT binary patch literal 78 ZcmaEJppgLx8W#LrDZUcKq5v#l0|1+Y23i0B literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput b/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..c08165143f2c570013c4916cbac5addfe9622a55 GIT binary patch literal 78 ZcmaEJppgLx8W#LrDZUcKq5v#l0|1+Y23i0B literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-pi.sync.expect b/src/compress/flate/testdata/huffman-pi.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..e4396ac6fe5e34609ccb7ea0bc359e6adb48c7f4 GIT binary patch literal 1696 zcmV;R24DFkmtE59U+}dp3M$E(%$UAJ`ff>rsvsiW8T+$6ZwCa`!Y=s-_luo9MajP$09#>I(F*#bYgkvGSvgH9cjqJxOtZL@E-R zxap{F9H>K0YPWsSkS2)R*aWKe{#|WqFIuv-wS}!bk75c(Z-9;7Wc4VnBBzs?752D& zc8p>URDdmkKtvR3uWp%l!&_EmTpc=NETFYQZ$(jWT@; zgN3|cc@&v*F@uVLa&KnX>Fd2bZUkkwfB)b_MW1tl319U*%S zvp^|A=dI~L9VRO0%SM^tpIF);2v& z2UTM|Eu;@^j|Ys3yuqcmNp8%xRb#N#JWo+RBgezuM69fAg{7zjhSjaxj9hCIS<|)) zTLN?jLt7gbXKG}iEUuqR-jG}(yN@N#B)wX z?|Hml6#3}s*c0K~nJep+6gLc-%e0Zx+0e0@vrzAOGcG64J5tD?3)Gal%l@md3K`X! zWHzzhS`E>KPF)C!q0$!IOpK<-WbmbF9QLE^nXFo~mu))PKI>??oiY z2eq0;6HL=Tt81EVym$AC{;?VPYEHwbEH44G@EQbW;L1XcSd)b||Ff@Ei(4Sj++jOm zBUh^KsO^kc_oqFUViJ1J^cG$3Tj{GxbaP=7I(EAlE=mRs3qthuA%e9rE-#PHFM(mQ zu6KhDd&6Mrg?qbky>)t9e~*^0hsbjfTxSkFOE@c#rEgM-#Z9ZTpaI9jc6f=dNhXc8 znW%G1wBBCANuz}>6H}+!y>*N6gKL$sTjqM=lH+`zajbQ|_!-Asw+~_~BPZz2`j$Kc zEhFt1TPE|&golz{9lnon*4~tBl|$aFu;^S(&T%XtkV=$yRZ5cBjJLTgxTv7rS!-y$2B``yh?Bd zU87(35T;+y=@n~to6Yow&?UtR3gMggy9M(CYsW0orRXZXb1;cR#nNz{C5S6uiE#A# z)e7C6h_D5sJRBg(Zy^5U!@dY0#$+}dp3M$E(%$UAJ`ff>rsvsiW8T+$6ZwCa`!Y=s-_luo9MajP$09#>I(F*#bYgkvGSvgH9cjqJxOtZL@E-R zxap{F9H>K0YPWsSkS2)R*aWKe{#|WqFIuv-wS}!bk75c(Z-9;7Wc4VnBBzs?752D& zc8p>URDdmkKtvR3uWp%l!&_EmTpc=NETFYQZ$(jWT@; zgN3|cc@&v*F@uVLa&KnX>Fd2bZUkkwfB)b_MW1tl319U*%S zvp^|A=dI~L9VRO0%SM^tpIF);2v& z2UTM|Eu;@^j|Ys3yuqcmNp8%xRb#N#JWo+RBgezuM69fAg{7zjhSjaxj9hCIS<|)) zTLN?jLt7gbXKG}iEUuqR-jG}(yN@N#B)wX z?|Hml6#3}s*c0K~nJep+6gLc-%e0Zx+0e0@vrzAOGcG64J5tD?3)Gal%l@md3K`X! 
zWHzzhS`E>KPF)C!q0$!IOpK<-WbmbF9QLE^nXFo~mu))PKI>??oiY z2eq0;6HL=Tt81EVym$AC{;?VPYEHwbEH44G@EQbW;L1XcSd)b||Ff@Ei(4Sj++jOm zBUh^KsO^kc_oqFUViJ1J^cG$3Tj{GxbaP=7I(EAlE=mRs3qthuA%e9rE-#PHFM(mQ zu6KhDd&6Mrg?qbky>)t9e~*^0hsbjfTxSkFOE@c#rEgM-#Z9ZTpaI9jc6f=dNhXc8 znW%G1wBBCANuz}>6H}+!y>*N6gKL$sTjqM=lH+`zajbQ|_!-Asw+~_~BPZz2`j$Kc zEhFt1TPE|&golz{9lnon*4~tBl|$aFu;^S(&T%XtkV=$yRZ5cBjJLTgxTv7rS!-y$2B``yh?Bd zU87(35T;+y=@n~to6Yow&?UtR3gMggy9M(CYsW0orRXZXb1;cR#nNz{C5S6uiE#A# z)e7C6h_D5sJRBg(Zy^5U!@dY0#$3S&tir-1&Xy^3rq$7WzjkH3oE5TPhoG|$Jj~G7Je1{gMAD^RtHK^{N zufoll6_IZ#Igui;)QE-iL)cr^(p6$f&T?BN!s4KwBwDM|h`zW0G3`5$X2Fl=+u6SL z$bbs5Whvee;g$C>RT%D-Dl1f-g_g`**F<)Ek@PCVoPL;K9`bu?lgzjHb%jJ)6&2?zP)IAw?)0ebClf!Z=Et)6wut_cD@v7o)NpS zsK_#(rtznQd!5{<5eH&uVED3}5?emSTvTf*_p#{4ge` zn(ID>yQn|Wo1TzEpWsAiwn-x2!aa=WR)#*V0|pdqh%CWATiUv6K5OS-0wY-7hCj?Tf`uBEd$hP2k_Ik53t_)WP?CL{3Dz=q58^>&Ha%fBIK{4daGq?sUcO$->)O zrL(={j>A#F8&cXoM~3O}e9Or|hmVKPIv?LKS!HKa#fdMJ!mb#z4eR(FdiJi1Cgqhq z-hw!drzD!qMHn$Jh)LAM;gcxRzEji-SpMM7p*fDjNTi0qIQ%9_MQ7pjZQ#@|e-9ZH zr~VKicRyt`JQ%8fm6;9D;%&5h`hA3XWyEnO2wPwf#ty}*9$nY^N{EaB-QN0{37CZY z$q)aOl;;m)>;+-AznRG38SG@jWRbkZ?`y-mSC;XhcuB8cm3z(6n*y_AgZY3d&mt3* zn><`VT-|Xtb%b$?w{37^ql(=Zu?n)?a;t|0D}bE`Dvf7dz4(QJ4yCTrQT2k^9Pg(5 zPaRK{q+s7o@$GdxX8%j#mu~+9FDS;suFhYN3wXU}FF0v%9YUkIe{5qUD{xJIy+(37 cib^4s>$zE)GBdYpa@8P@=jF5wYA^% zs8Z((qWRNUqWGOOj&@G}PCC-a6ZBHX*s}@}+i-%H7u2<(yeGk_p}IQpkY=q{=8%5j zCh1&k#k?{By|N{2W8kxb1k)Ko<>Y*X-AF6cx)OZF!wG|b|A^sp&39;F`tdn`T7&BT z`6}F;SrPe`k`pNcOO04KKZLzyEnOv+N#+B{By^cDULDE6Dg;&& zO=u{r#qY9CX2q~>M2)v~oJjxXwYfA4W6UEykUq9QGg?N01rigUU44BE!qnW&8XUe) zez=s$?Lpl~RS(YSo`<)8wH6IiQ0ey+mMwU6mI5|$jdg{XyiM1gG;p;#L9s@?8yM9bG|EGU{)i7>&=1y1ao-Dkr zRXW>C?l>F;ydkCib7Ytf&$pZmbohAqtn={=lT~&$Rh;-jDeQ_d+pvz`p=a;9Xi{F; z<1L8OcuJz#T!azxf|x{296pI6?K?%ifaMSF9Gc@ej6`Y(jKgn|RCE?T-v&>d|$ruY|}L(Cw|CnSe>S zpZxGoNqPP-#$FI+`k@ zxyi!?#MK>VQ%4xLc-sayHmcZt5vw5EEw_4Dumae5pwf6<){9>l=uqk^9aS%w&GBx^ z|J3nRNecGe6yILAWA?u!e(Cl<@PcA2?CSjWxPaGt_JWfJ*C8~T`^Pp$vI5uS*J~uV cqo@>8xt^P)DKm4sCRYuzNKydW#Fu~kA7M59P5=M^ diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect b/src/compress/flate/testdata/huffman-rand-1k.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..09dc798ee37df82176b8b7c9998c88a14207c1ad GIT binary patch literal 1005 zcmVlcQ}x5eHD>U|iC=ROD8-~ViL-puE1 zjRYA__oQ{&>YEB=3*aLuz4zyXJp13Xu1};#Rhix|mTnwF zOo!rp*PZhF=TqnOy;6>9pEFaaeUqI8B!YL)2W zP7ZdtNvU6;rei#QejpQ1yJnKOE~NTM%dWXRuhSpl)r~@J@cfJn0Ny~Wi$|AEsLzhu zri&m6gnDM>m?;94<~TB71LK+=ROn-XNSxENOU6sujQmH^hn%vbF>Y9-Bf>bg4ep_N_banGD$o@)BlG0~`IFf*!A z7ZZY+$P{3oO)_oT873jzel8_va>@^q&Gy#Imx?o3b8wLzzbGT44Do}*$X0h~ljl$J4Xnb zbD&&|U+WJ#!b4}YW@ms{4#Dg|)FPD1`RJ15X*j-TWXe#-24_NUqwu$E^5|c&ujkvl zceVJ-2*h=M!1)}1Jc%#TSUTePk+ypzC+V()i{5ms{n@u^D(o_E@REe_Kn#k!Ic_d< z)NYD&D%@ZnqX*t~i*(5TV|DgDW2`fY!|?bmYqXwpi(E6b%BbX-wveIk57S|?#u}7- zL{;=f|DL5<#-Qjb!HsV;5xKrj*@u^N&pjiq)f!%|U1|gQA`KAPM`;y5?oy)&(mYZ0 z_?_gKiO6R;)m}AtC+IwYu6c3Nlk}=l5*$k#%8*z(mO5DYDWih#pN0k_;dS~5vECO-S0Dj5 literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..0c24742fde2487e3a454ec3364f15e541693c37c GIT binary patch literal 1054 zcmV+(1mXJxzzaAU2mk=!nayIXbMy_f)7H$mL&SF;F?3`%k8@)&&%@Oe(UOiioadDG zS>BI}35WJ&PF@*1*&LbA=aF5pFj3x*HIFRrKcto>d1~bp8)vlgPG~al`sLh_uD4>f zwcquqQs)bz`O{dU_?0E5ZyfOj3vL$R|;Io1R(-}eKi+pE+?-hv`IeFsDFRE4SU5j~y=5(3C6?qYw^br64(dswwJMG1iwh9bz5{6%{CK{d 
z?OTrws1RG0;tdgAc^^}S;a;h-Le*Jl$;@?4WVbi2?}j$(yZ8P0lo@^JyA?I@?GEt7oU6m&;AhmaN!WN2o4Ue&a8T%J8g~M#1p4zh)_hxG4z2`Ogny za;mxRW4Md@6TRsPIrIrmbY`0*@-5uMh;C)*<4Qh|=G6i5GP){GL)z@9EkaXFMahfN zv?c%P&)d;?j&h!ypwqm%P^YHL3jM3}%*^0B)TTYwcr0m+>#+B{By^cDULDE6Dg;&& zO=u{r#qY9CX2q~>M2)v~oJjxXwYfA4W6UEykUq9QGg?N01rigUU44BE!qnW&8XUe) zez=s$?Lpl~RS(YSo`<)!77bHS>Gu?tEqHX60yc4tb%nr56)BI?!K^R=U-@BSOT=w5 zsVIvXfM*tHgqR0EakjC|{oW&au|@y5MGR8cGC-Yn06%^E0PC$!Cb-Z0wr}jN>)ms9 zL;@;_wIK!J>p%w{0>eRLG6F9RY`9EcFXkV~=#m(_eoQp~r+?KjZg}QSSL~iFyscF_ z+e_{^90j}~rTuecm=4dkoD6jMc=)XI@ePwzb~aU<_(Cb{iZR=;j^CkY@49GGUfJU< zh|_pVqS;)85%YqWL`@t%i6ZSgMZJLK5AGbA<2Z~&Y6y(OZ<17W7CzqwPW|%tkU??k z4*_!bQ%1vsp<0>Q04?4|yQkkrm{&#|cY?4524U<_tm@Hqt*?a07|`vlpP7J3xS#y+ zPf2l=uqk^9aS%w&GBx^|J3nR zNecGe6yILAWA?u!e(Cl<@PcA2?CSjWxPaGt_JWfJ*C8~T`^Pp$vI5uS*J~uVqo@>8 Yxt^P)DKm4sCRYuzNKydW#Fu~kAGX;UBLDyZ literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect index 2d6527934e98300d744c7558a025250f67e0f1c9..881e59c9ab9bb356c5f1b8f2e188818bd42dbcf0 100644 GIT binary patch literal 186 zcmV;r07d^wq#oe<(LJrqgR6ClYQy?N|9W32ycTaex&7!pwpX+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 229 zcmV+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 229 zcmV-LRpuX`%(tEwHyI;4Gxl-|h3T{N z6-jI@ns~S?%F^$acmD1V*M!+Qrys9lzyJ+Dd}Vank36fZwwe0V*y~gD<*;wR&aErE zK1x=B^waSM#UtI25yeIe3z8TRIdp4hX rzbfU)&SL#*50@_2z4oXRI~IhW&XxGJNu5- o$p8QV diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect b/src/compress/flate/testdata/huffman-rand-limit.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..881e59c9ab9bb356c5f1b8f2e188818bd42dbcf0 GIT binary patch literal 186 zcmV;r07d^wq#oe<(LJrqgR6ClYQy?N|9W32ycTaex&7!pwpX+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..881e59c9ab9bb356c5f1b8f2e188818bd42dbcf0 GIT binary patch literal 186 zcmV;r07d^wq#oe<(LJrqgR6ClYQy?N|9W32ycTaex&7!pwpX+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect b/src/compress/flate/testdata/huffman-shifts.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..7812c1c62da3cbaeb6399e9aa8ab65ae7efa9b08 GIT binary patch literal 32 ocmaEJ(2|$IfP>+{UeCQBetd7^G}D{T$iTpm^J~2nL&Iw}0NYm#xc~qF literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput b/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..7812c1c62da3cbaeb6399e9aa8ab65ae7efa9b08 GIT binary patch literal 32 ocmaEJ(2|$IfP>+{UeCQBetd7^G}D{T$iTpm^J~2nL&Iw}0NYm#xc~qF literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-text-shift.sync.expect b/src/compress/flate/testdata/huffman-text-shift.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..71ce3aeb75a86e8375d9ac4350b7d83b9229a3ed GIT binary patch literal 231 zcmVb2j)h-%-Q8H+K zIkmg!?Y-=9be1Hi$&iwP9DQ6&foC2grh=5#ja@KiZ1-F{b`bob2j)h-%-Q8H+K zIkmg!?Y-=9be1Hi$&iwP9DQ6&foC2grh=5#ja@KiZ1-F{b`bo4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git 
a/src/compress/flate/testdata/huffman-text.sync.expect-noinput b/src/compress/flate/testdata/huffman-text.sync.expect-noinput new file mode 100644 index 00000000000000..d448727c323caf --- /dev/null +++ b/src/compress/flate/testdata/huffman-text.sync.expect-noinput @@ -0,0 +1 @@ +_K0`K0Aasě)^HIɟb߻_>4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect b/src/compress/flate/testdata/huffman-zero.dyn.expect index 830348a79ad9ab38d0edc449e8335c056f7d185f..dbe401c54c4b6f45f3169376185a476dcf00dde9 100644 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 17 XcmaEJU?T$%G#D)X^D^m0zK$>eMUV%O diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput b/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput index 830348a79ad9ab38d0edc449e8335c056f7d185f..dbe401c54c4b6f45f3169376185a476dcf00dde9 100644 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 17 XcmaEJU?T$%G#D)X^D^m0zK$>eMUV%O diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect b/src/compress/flate/testdata/huffman-zero.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..dbe401c54c4b6f45f3169376185a476dcf00dde9 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect-noinput b/src/compress/flate/testdata/huffman-zero.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..dbe401c54c4b6f45f3169376185a476dcf00dde9 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/null-long-match.sync.expect-noinput b/src/compress/flate/testdata/null-long-match.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..8b92d9fc20f1ee1fea5e4cc84d18aeea26a6fdaa GIT binary patch literal 206 ccmaEJz>txFf#HzC@8#d3xFvwhAq<`X0E^!Sx&QzG literal 0 HcmV?d00001 diff --git a/src/compress/flate/token.go b/src/compress/flate/token.go index fc0e4941e7bcd2..3f0d1c358077b8 100644 --- a/src/compress/flate/token.go +++ b/src/compress/flate/token.go @@ -4,20 +4,26 @@ package flate +import ( + "math" +) + const ( - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal - lengthShift = 22 - offsetMask = 1<maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize + tokens [65536]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nFilled = 0 + clear(t.litHist[:]) + clear(t.extraHist[:]) + clear(t.offHist[:]) +} + +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.AddLiteral(tok.literal()) + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) + } +} + +// emitLiterals writes a literal chunk and returns the number of bytes written. 
+func emitLiterals(dst *tokens, lit []byte) { + for _, v := range lit { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } +} + +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ +} + +// from https://stackoverflow.com/a/28730362 +func mFastLog2(val float32) float32 { + ux := int32(math.Float32bits(val)) + log2 := (float32)(((ux >> 23) & 255) - 128) + ux &= -0x7f800001 + ux += 127 << 23 + uval := math.Float32frombits(uint32(ux)) + log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 + return log2 +} -// Convert a < xlength, xoffset > pair into a match token. -func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength< 0 { + invTotal := 1.0 / float32(total) + for _, v := range t.litHist[:] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + } + } + // Just add 15 for EOB + shannon += 15 + for i, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(lengthExtraBits[i&31]) * int(v) + nMatches += int(v) + } + } + } + if nMatches > 0 { + invTotal := 1.0 / float32(nMatches) + for i, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(offsetExtraBits[i&31]) * int(v) + } + } + } + return int(shannon) + bits } -// Returns the literal of a literal token. -func (t token) literal() uint32 { return uint32(t - literalType) } +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. +func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + oCode := offsetCode(xoffset) + xoffset |= oCode << 16 -// Returns the extra offset of a match token. + t.extraHist[lengthCodes1[uint8(xlength)]]++ + t.offHist[oCode&31]++ + t.tokens[t.n] = token(matchType | xlength< 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + t.extraHist[lengthCodes1[uint8(xl)]]++ + t.offHist[oc&31]++ + t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } +func (t token) length() uint8 { return uint8(t >> lengthShift) } -func lengthCode(len uint32) uint32 { return lengthCodes[len] } +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } -// Returns the offset code corresponding to a specific offset. +// Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { if off < uint32(len(offsetCodes)) { - return offsetCodes[off] - } - if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[off>>7] + 14 + return offsetCodes[uint8(off)] } - return offsetCodes[off>>14] + 28 + return offsetCodes14[uint8(off>>7)] } diff --git a/src/compress/flate/unsafe_disabled.go b/src/compress/flate/unsafe_disabled.go new file mode 100644 index 00000000000000..1444494693468e --- /dev/null +++ b/src/compress/flate/unsafe_disabled.go @@ -0,0 +1,33 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "internal/byteorder" +) + +type indexer interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} + +// loadLE8 will load from b at index i. 
+func loadLE8[I indexer](b []byte, i I) byte { + return b[i] +} + +// loadLE32 will load from b at index i. +func loadLE32[I indexer](b []byte, i I) uint32 { + return byteorder.LEUint32(b[i:]) +} + +// loadLE64 will load from b at index i. +func loadLE64[I indexer](b []byte, i I) uint64 { + return byteorder.LEUint64(b[i:]) +} + +// storeLE64 will store v at start of b. +func storeLE64(b []byte, v uint64) { + byteorder.LEPutUint64(b, v) +} diff --git a/src/compress/flate/writer_test.go b/src/compress/flate/writer_test.go index c413735cd2c9f3..43815b2e4787fd 100644 --- a/src/compress/flate/writer_test.go +++ b/src/compress/flate/writer_test.go @@ -8,6 +8,7 @@ import ( "bytes" "fmt" "io" + "math" "math/rand" "runtime" "testing" @@ -40,6 +41,34 @@ func BenchmarkEncode(b *testing.B) { }) } +func TestWriterMemUsage(t *testing.T) { + testMem := func(t *testing.T, fn func()) { + var before, after runtime.MemStats + runtime.GC() + runtime.ReadMemStats(&before) + fn() + runtime.GC() + runtime.ReadMemStats(&after) + t.Logf("%s: Memory Used: %dKB, %d allocs", t.Name(), (after.HeapInuse-before.HeapInuse)/1024, after.HeapObjects-before.HeapObjects) + } + data := make([]byte, 100000) + + for level := HuffmanOnly; level <= BestCompression; level++ { + t.Run(fmt.Sprint("level-", level), func(t *testing.T) { + var zr *Writer + var err error + testMem(t, func() { + zr, err = NewWriter(io.Discard, level) + if err != nil { + t.Fatal(err) + } + zr.Write(data) + }) + zr.Close() + }) + } +} + // errorWriter is a writer that fails after N writes. type errorWriter struct { N int @@ -67,7 +96,7 @@ func TestWriteError(t *testing.T) { in := buf.Bytes() // We create our own buffer to control number of writes. copyBuffer := make([]byte, 128) - for l := 0; l < 10; l++ { + for l := range 10 { for fail := 1; fail <= 256; fail *= 2 { // Fail after 'fail' writes ew := &errorWriter{N: fail} @@ -110,6 +139,75 @@ func TestWriteError(t *testing.T) { } } +// Test if errors from the underlying writer is passed upwards. +func TestWriter_Reset(t *testing.T) { + buf := new(bytes.Buffer) + n := 65536 + if !testing.Short() { + n *= 4 + } + for i := 0; i < n; i++ { + fmt.Fprintf(buf, "asdasfasf%d%dfghfgujyut%dyutyu\n", i, i, i) + } + in := buf.Bytes() + for l := range 10 { + l := l + if testing.Short() && l > 1 { + continue + } + t.Run(fmt.Sprintf("level-%d", l), func(t *testing.T) { + t.Parallel() + offset := 1 + if testing.Short() { + offset = 256 + } + for ; offset <= 256; offset *= 2 { + // Fail after 'fail' writes + w, err := NewWriter(io.Discard, l) + if err != nil { + t.Fatalf("NewWriter: level %d: %v", l, err) + } + if w.d.fast == nil { + t.Skip("Not Fast...") + return + } + for i := 0; i < (bufferReset-len(in)-offset-maxMatchOffset)/maxMatchOffset; i++ { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range 50 { + // skip ahead again... This should wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range (math.MaxUint32 - bufferReset) / maxMatchOffset { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + err = w.Close() + if err != nil { + t.Fatal(err) + } + } + }) + } +} + // Test if two runs produce identical results // even when writing different sizes to the Writer. 
func TestDeterministic(t *testing.T) { @@ -171,6 +269,24 @@ func testDeterministic(i int, t *testing.T) { if !bytes.Equal(b1b, b2b) { t.Errorf("level %d did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b2b)) } + + // Test using io.WriterTo interface. + var b3 bytes.Buffer + br = bytes.NewBuffer(t1) + w, err = NewWriter(&b3, i) + if err != nil { + t.Fatal(err) + } + _, err = br.WriteTo(w) + if err != nil { + t.Fatal(err) + } + w.Close() + + b3b := b3.Bytes() + if !bytes.Equal(b1b, b3b) { + t.Errorf("level %d (io.WriterTo) did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b3b)) + } } // TestDeflateFast_Reset will test that encoding is consistent From 374779b85f6ae7f5ad5f87f7a78811612c74c953 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 27 Sep 2025 16:27:38 +0200 Subject: [PATCH 2/5] [klauspost/deflate-improve-comp] don't use internal/byteorder Change-Id: I0ac5571da9585daba9491b360c9a6b4e0cecbcee --- src/compress/flate/unsafe_disabled.go | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/compress/flate/unsafe_disabled.go b/src/compress/flate/unsafe_disabled.go index 1444494693468e..c4ecd0fd0a9bb1 100644 --- a/src/compress/flate/unsafe_disabled.go +++ b/src/compress/flate/unsafe_disabled.go @@ -4,10 +4,6 @@ package flate -import ( - "internal/byteorder" -) - type indexer interface { int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 } @@ -19,15 +15,26 @@ func loadLE8[I indexer](b []byte, i I) byte { // loadLE32 will load from b at index i. func loadLE32[I indexer](b []byte, i I) uint32 { - return byteorder.LEUint32(b[i:]) + b = b[i : i+4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 } // loadLE64 will load from b at index i. func loadLE64[I indexer](b []byte, i I) uint64 { - return byteorder.LEUint64(b[i:]) + b = b[i : i+8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 } // storeLE64 will store v at start of b. func storeLE64(b []byte, v uint64) { - byteorder.LEPutUint64(b, v) + _ = b[7] // early bounds check to guarantee safety of writes below + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) } From 2c5d12a16952010142017cecdf431180163bc2af Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 27 Sep 2025 16:54:32 +0200 Subject: [PATCH 3/5] [klauspost/deflate-improve-comp] Remove hash7 and use const for long table bytes. 
Change-Id: Ia141c7ec888bf51ceb6351d2a1c3f1501c2c4e12 --- src/compress/flate/deflatefast.go | 7 +------ src/compress/flate/huffman_bit_writer.go | 3 ++- src/compress/flate/level4.go | 16 ++++++++-------- src/compress/flate/level5.go | 14 +++++++------- src/compress/flate/level6.go | 12 ++++++------ 5 files changed, 24 insertions(+), 28 deletions(-) diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go index e132c55951b5ef..eef1896b6f5c63 100644 --- a/src/compress/flate/deflatefast.go +++ b/src/compress/flate/deflatefast.go @@ -35,6 +35,7 @@ func newFastEnc(level int) fastEnc { const ( tableBits = 15 // Bits used in the table tableSize = 1 << tableBits // Size of the table + hashLongBytes = 7 // Bytes used for long table hash baseMatchOffset = 1 // The smallest match offset baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 maxMatchOffset = 1 << 15 // The largest match offset @@ -93,12 +94,6 @@ type tableEntryPrev struct { Prev tableEntry } -// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash7(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) -} - // hashLen returns a hash of the lowest mls bytes of with length output bits. // mls must be >=3 and <=8. Any other value will return hash for 4 bytes. // length should always be < 32. diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index f5e50925db8802..585a9b4cf19032 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -412,8 +412,9 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { return 0, false } +// writeCode writes 'c' to the stream. +// Inline manually when performance is critical. func (w *huffmanBitWriter) writeCode(c hcode) { - // The function does not get inlined if we "& 63" the shift. w.bits |= c.code64() << (w.nbits & reg8SizeMask64) w.nbits += c.len() if w.nbits >= 48 { diff --git a/src/compress/flate/level4.go b/src/compress/flate/level4.go index f62168b64ed9e3..ceb899793e3148 100644 --- a/src/compress/flate/level4.go +++ b/src/compress/flate/level4.go @@ -82,7 +82,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { var t int32 for { nextHashS := hashLen(cv, tableBits, hashShortBytes) - nextHashL := hash7(cv, tableBits) + nextHashL := hashLen(cv, tableBits, hashLongBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -106,7 +106,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t = sCandidate.offset - e.cur if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { // Found a 4 match... - lCandidate = e.bTable[hash7(next, tableBits)] + lCandidate = e.bTable[hashLen(next, tableBits, hashLongBytes)] // If the next long is a candidate, check if we should use that instead... 
lOff := lCandidate.offset - e.cur @@ -155,7 +155,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { if int(s+8) < len(src) { cv := loadLE64(src, s) e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} - e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -166,8 +166,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - e.bTable[hash7(cv, tableBits)] = t - e.bTable[hash7(cv>>8, tableBits)] = t2 + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 i += 3 @@ -175,8 +175,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - e.bTable[hash7(cv, tableBits)] = t - e.bTable[hash7(cv>>8, tableBits)] = t2 + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } @@ -186,7 +186,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { x := loadLE64(src, s-1) o := e.cur + s - 1 prevHashS := hashLen(x, tableBits, hashShortBytes) - prevHashL := hash7(x, tableBits) + prevHashL := hashLen(x, tableBits, hashLongBytes) e.table[prevHashS] = tableEntry{offset: o} e.bTable[prevHashL] = tableEntry{offset: o} cv = x >> 8 diff --git a/src/compress/flate/level5.go b/src/compress/flate/level5.go index 5ef342eae0e8a2..29f1df27413b82 100644 --- a/src/compress/flate/level5.go +++ b/src/compress/flate/level5.go @@ -92,7 +92,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { var t int32 for { nextHashS := hashLen(cv, tableBits, hashShortBytes) - nextHashL := hash7(cv, tableBits) + nextHashL := hashLen(cv, tableBits, hashLongBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -109,7 +109,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { eLong.Cur, eLong.Prev = entry, eLong.Cur nextHashS = hashLen(next, tableBits, hashShortBytes) - nextHashL = hash7(next, tableBits) + nextHashL = hashLen(next, tableBits, hashLongBytes) t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { @@ -196,7 +196,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // The skipped bytes are tested in Extend backwards, // and still picked up as part of the match if they do. 
const skipBeginning = 2 - eLong := e.bTable[hash7(loadLE64(src, sAt), tableBits)].Cur.offset + eLong := e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)].Cur.offset t2 := eLong - e.cur - l + skipBeginning s2 := s + skipBeginning off := s2 - t2 @@ -241,13 +241,13 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} e.table[hashLen(cv, tableBits, hashShortBytes)] = t - eLong := &e.bTable[hash7(cv, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = t, eLong.Cur // Do an long at i+1 cv >>= 8 t = tableEntry{offset: t.offset + 1} - eLong = &e.bTable[hash7(cv, tableBits)] + eLong = &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = t, eLong.Cur // We only have enough bits for a short entry at i+2 @@ -261,7 +261,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - eLong := &e.bTable[hash7(cv, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = t, eLong.Cur e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } @@ -272,7 +272,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { x := loadLE64(src, s-1) o := e.cur + s - 1 prevHashS := hashLen(x, tableBits, hashShortBytes) - prevHashL := hash7(x, tableBits) + prevHashL := hashLen(x, tableBits, hashLongBytes) e.table[prevHashS] = tableEntry{offset: o} eLong := &e.bTable[prevHashL] eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur diff --git a/src/compress/flate/level6.go b/src/compress/flate/level6.go index 851a7155853eec..d709f31e21fc42 100644 --- a/src/compress/flate/level6.go +++ b/src/compress/flate/level6.go @@ -92,7 +92,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { var t int32 for { nextHashS := hashLen(cv, tableBits, hashShortBytes) - nextHashL := hash7(cv, tableBits) + nextHashL := hashLen(cv, tableBits, hashLongBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -109,7 +109,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Calculate hashes of 'next' nextHashS = hashLen(next, tableBits, hashShortBytes) - nextHashL = hash7(next, tableBits) + nextHashL = hashLen(next, tableBits, hashLongBytes) t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { @@ -216,7 +216,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // The skipped bytes are tested in extend backwards, // and still picked up as part of the match if they do. 
const skipBeginning = 2 - eLong := &e.bTable[hash7(loadLE64(src, sAt), tableBits)] + eLong := &e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)] // Test current t2 := eLong.Cur.offset - e.cur - l + skipBeginning s2 := s + skipBeginning @@ -269,7 +269,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { for i := nextS + 1; i < int32(len(src))-8; i += 2 { cv := loadLE64(src, i) e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} - eLong := &e.bTable[hash7(cv, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur } goto emitRemainder @@ -280,8 +280,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - eLong := &e.bTable[hash7(cv, tableBits)] - eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong2 := &e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong.Cur, eLong.Prev = t, eLong.Cur eLong2.Cur, eLong2.Prev = t2, eLong2.Cur From f09b893f4892a4950daa68713908bc1e9f7c91b4 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 27 Sep 2025 17:01:25 +0200 Subject: [PATCH 4/5] [klauspost/deflate-improve-comp] update expected zlib output Change-Id: I1cef87da8cf7a2f2b330115f8eeecb7bf825af76 --- src/compress/zlib/example_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compress/zlib/example_test.go b/src/compress/zlib/example_test.go index 70408895ffd5a0..7052973355eb92 100644 --- a/src/compress/zlib/example_test.go +++ b/src/compress/zlib/example_test.go @@ -19,7 +19,7 @@ func ExampleNewWriter() { w.Write([]byte("hello, world\n")) w.Close() fmt.Println(b.Bytes()) - // Output: [120 156 202 72 205 201 201 215 81 40 207 47 202 73 225 2 4 0 0 255 255 33 231 4 147] + // Output: [120 156 0 13 0 242 255 104 101 108 108 111 44 32 119 111 114 108 100 10 3 0 33 231 4 147] } func ExampleNewReader() { From f5d855e43f730c5b44760059fbf00fd153b1ff3e Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 28 Sep 2025 17:19:14 +0200 Subject: [PATCH 5/5] [klauspost/deflate-improve-comp] Use pre-compressed bytes for test. 
Change-Id: Ie3630fc4b51f30108909a3d5930ffe17851f4a94 --- src/debug/elf/file_test.go | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/debug/elf/file_test.go b/src/debug/elf/file_test.go index 0c1a7cf18aeb6e..733daae57772c6 100644 --- a/src/debug/elf/file_test.go +++ b/src/debug/elf/file_test.go @@ -7,7 +7,6 @@ package elf import ( "bytes" "compress/gzip" - "compress/zlib" "debug/dwarf" "encoding/binary" "errors" @@ -1560,18 +1559,9 @@ func TestIssue59208(t *testing.T) { zoffset := sec.Offset + uint64(sec.compressionOffset) copy(dn, data[:zoffset]) - ozd, err := sec.Data() - if err != nil { - t.Fatal(err) - } - buf := bytes.NewBuffer(nil) - wr := zlib.NewWriter(buf) // corrupt origin data same as COMPRESS_ZLIB - copy(ozd, []byte{1, 0, 0, 0}) - wr.Write(ozd) - wr.Close() - - copy(dn[zoffset:], buf.Bytes()) + // Insert zlib compressed sec.Data() block with `[]byte{1, 0, 0, 0}` as the first 4 bytes + copy(dn[zoffset:], []byte{0x78, 0x9c, 0x5c, 0x4d, 0xb9, 0xd, 0x80, 0x30, 0xc, 0x3c, 0x7, 0x27, 0xdc, 0xe, 0xc, 0x46, 0x4b, 0x8b, 0x14, 0x51, 0x20, 0x16, 0xa1, 0x67, 0x8b, 0x2c, 0x88, 0xec, 0x44, 0xc2, 0xe2, 0x8a, 0xdc, 0x1b, 0x59, 0x0, 0x28, 0xc, 0x34, 0x9, 0x7f, 0x22, 0x96, 0xa0, 0x13, 0x67, 0x27, 0xa1, 0x53, 0xea, 0x4e, 0x47, 0x58, 0x7a, 0x98, 0x8d, 0x26, 0xcd, 0xfb, 0x71, 0x21, 0x31, 0x87, 0x7f, 0xca, 0xf3, 0x1b, 0x7a, 0x21, 0xfa, 0x3f, 0x23, 0x4f, 0x3, 0x50, 0x7a, 0xb9, 0xda, 0xfc, 0xae, 0xc3, 0x35, 0x77, 0x1b, 0x94, 0xd5, 0x82, 0x37, 0x0, 0x0, 0xff, 0xff, 0x65, 0xfb, 0x7, 0x6e}) copy(dn[sec.Offset+sec.FileSize:], data[sec.Offset+sec.FileSize:]) nf, err := NewFile(bytes.NewReader(dn))