From 0a5dc67e9ddeb1618fecdaefd847c085f01de2d9 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Sun, 21 Sep 2025 11:13:53 +0200
Subject: [PATCH 1/5] [klauspost/deflate-improve-comp] compress/flate: improve compression speed

Fixes #75532

This improves the compression speed of the flate package. It is a
cleaned-up version of github.com/klauspost/compress/flate.

Overall changes:

* Compression levels 2-6 are custom implementations.
* Compression levels 7-9 are tweaked to match levels 2-6, with minor
  improvements.
* Tokens are encoded and indexed when added.
* Huffman encoding attempts to continue blocks instead of always
  starting a new one.
* Loads/stores are in separate functions and can be made to use unsafe.

Overall, this attempts to better balance the compression levels, which
previously had very little spread at the top. The intention is to place
"default" at the point where performance drops off considerably without
a proportional improvement in compression ratio. In my package I set 5
as the default, but this change keeps the default at level 6.

Results from the standard library's built-in benchmarks are below. I do
not think they are a particularly good representation of different data
types, so I have also run benchmarks on various other data types and
compiled the results at https://stdeflate.klauspost.com/

The main focus has been on level 1 (fastest), levels 5 and 6 (default),
and level 9 (smallest). Levels outside of these are rarely used, but
they should still fill their roles reasonably. Level 9 attempts more
aggressive compression and will typically also be slightly slower than
before. I hope the linked graphs show that focusing on a few data types
doesn't always give the full picture.

My own observations: Levels 1 and 2 often trade places depending on the
data type. Since level 1 usually compresses the least of the two - and
is mostly slightly faster, with lower memory usage - it is placed as the
lowest level. The switchover between levels 6 and 7 is not always
smooth, since the search method changes significantly.

Random data is now ~100x faster on levels 2-6 and ~3x faster on levels
7-9. Pre-compressed data can be fed in with no significant speed
penalty.

"Unsafe" operations have been removed for now; they can trivially be
added back. Leaving them out costs approximately 10% in speed.
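To illustrate the "tokens are encoded and indexed when added" point, the
sketch below shows the general idea, not the CL's actual token type:
histograms are maintained as tokens are appended, so estimating block
cost or generating Huffman tables needs no second pass over the tokens.
The type, field, and method names here are placeholders.

    // Sketch only: keep literal statistics up to date while tokens are added.
    package main

    import "fmt"

    type tokenAccumulator struct {
            tokens  []uint32    // queued tokens (encoding is illustrative)
            litHist [256]uint16 // literal histogram, updated on every AddLiteral
            n       int         // number of tokens in the current block
    }

    func (t *tokenAccumulator) AddLiteral(b byte) {
            t.tokens = append(t.tokens, uint32(b))
            t.litHist[b]++
            t.n++
    }

    // Reset clears the queued tokens and the histogram between blocks.
    func (t *tokenAccumulator) Reset() {
            t.tokens = t.tokens[:0]
            t.litHist = [256]uint16{}
            t.n = 0
    }

    func main() {
            var t tokenAccumulator
            for _, b := range []byte("hello") {
                    t.AddLiteral(b)
            }
            fmt.Println(t.n, t.litHist['l']) // 5 2
    }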
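The numbers below were produced with the package's built-in benchmarks.
A rough external reproduction, for anyone wanting to compare levels on
their own data, could look like the following sketch (placed in a
_test.go file); the helper name and input path are placeholders and not
part of this CL.

    package flate_test

    import (
            "compress/flate"
            "io"
            "os"
            "testing"
    )

    // benchLevel measures compression throughput for one level on data.
    func benchLevel(b *testing.B, level int, data []byte) {
            w, err := flate.NewWriter(io.Discard, level)
            if err != nil {
                    b.Fatal(err)
            }
            b.SetBytes(int64(len(data)))
            b.ResetTimer()
            for i := 0; i < b.N; i++ {
                    w.Reset(io.Discard)
                    if _, err := w.Write(data); err != nil {
                            b.Fatal(err)
                    }
                    if err := w.Close(); err != nil {
                            b.Fatal(err)
                    }
            }
    }

    func BenchmarkFlateDefault(b *testing.B) {
            data, err := os.ReadFile("../testdata/e.txt") // any representative input
            if err != nil {
                    b.Skip(err)
            }
            benchLevel(b, flate.DefaultCompression, data)
    }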
benchmark                                      old ns/op     new ns/op     delta
BenchmarkEncode/Digits/Huffman/1e4-32          11431         8001          -30.01%
BenchmarkEncode/Digits/Huffman/1e5-32          123175        74780         -39.29%
BenchmarkEncode/Digits/Huffman/1e6-32          1260402       750022        -40.49%
BenchmarkEncode/Digits/Speed/1e4-32            35100         23758         -32.31%
BenchmarkEncode/Digits/Speed/1e5-32            675355        385954        -42.85%
BenchmarkEncode/Digits/Speed/1e6-32            6878375       4873784       -29.14%
BenchmarkEncode/Digits/Default/1e4-32          63411         40974         -35.38%
BenchmarkEncode/Digits/Default/1e5-32          1815762       801563        -55.86%
BenchmarkEncode/Digits/Default/1e6-32          18875894      8101836       -57.08%
BenchmarkEncode/Digits/Compression/1e4-32      63859         85275         +33.54%
BenchmarkEncode/Digits/Compression/1e5-32      1803745       2752174       +52.58%
BenchmarkEncode/Digits/Compression/1e6-32      18931995      30727403      +62.30%
BenchmarkEncode/Newton/Huffman/1e4-32          15770         11108         -29.56%
BenchmarkEncode/Newton/Huffman/1e5-32          134567        85103         -36.76%
BenchmarkEncode/Newton/Huffman/1e6-32          1663889       1030186       -38.09%
BenchmarkEncode/Newton/Speed/1e4-32            32749         22934         -29.97%
BenchmarkEncode/Newton/Speed/1e5-32            565609        336750        -40.46%
BenchmarkEncode/Newton/Speed/1e6-32            5996011       3815437       -36.37%
BenchmarkEncode/Newton/Default/1e4-32          70505         34148         -51.57%
BenchmarkEncode/Newton/Default/1e5-32          2374066       570673        -75.96%
BenchmarkEncode/Newton/Default/1e6-32          24562355      5975917       -75.67%
BenchmarkEncode/Newton/Compression/1e4-32      71505         77670         +8.62%
BenchmarkEncode/Newton/Compression/1e5-32      3345768       3730804       +11.51%
BenchmarkEncode/Newton/Compression/1e6-32      35770364      39768939      +11.18%

benchmark                                      old MB/s      new MB/s      speedup
BenchmarkEncode/Digits/Huffman/1e4-32          874.80        1249.91       1.43x
BenchmarkEncode/Digits/Huffman/1e5-32          811.86        1337.25       1.65x
BenchmarkEncode/Digits/Huffman/1e6-32          793.40        1333.29       1.68x
BenchmarkEncode/Digits/Speed/1e4-32            284.90        420.91        1.48x
BenchmarkEncode/Digits/Speed/1e5-32            148.07        259.10        1.75x
BenchmarkEncode/Digits/Speed/1e6-32            145.38        205.18        1.41x
BenchmarkEncode/Digits/Default/1e4-32          157.70        244.06        1.55x
BenchmarkEncode/Digits/Default/1e5-32          55.07         124.76        2.27x
BenchmarkEncode/Digits/Default/1e6-32          52.98         123.43        2.33x
BenchmarkEncode/Digits/Compression/1e4-32      156.59        117.27        0.75x
BenchmarkEncode/Digits/Compression/1e5-32      55.44         36.33         0.66x
BenchmarkEncode/Digits/Compression/1e6-32      52.82         32.54         0.62x
BenchmarkEncode/Newton/Huffman/1e4-32          634.13        900.25        1.42x
BenchmarkEncode/Newton/Huffman/1e5-32          743.12        1175.04       1.58x
BenchmarkEncode/Newton/Huffman/1e6-32          601.00        970.70        1.62x
BenchmarkEncode/Newton/Speed/1e4-32            305.35        436.03        1.43x
BenchmarkEncode/Newton/Speed/1e5-32            176.80        296.96        1.68x
BenchmarkEncode/Newton/Speed/1e6-32            166.78        262.09        1.57x
BenchmarkEncode/Newton/Default/1e4-32          141.83        292.84        2.06x
BenchmarkEncode/Newton/Default/1e5-32          42.12         175.23        4.16x
BenchmarkEncode/Newton/Default/1e6-32          40.71         167.34        4.11x
BenchmarkEncode/Newton/Compression/1e4-32      139.85        128.75        0.92x
BenchmarkEncode/Newton/Compression/1e5-32      29.89         26.80         0.90x
BenchmarkEncode/Newton/Compression/1e6-32      27.96         25.15         0.90x

Static Memory Usage:

Before:
Level -2: Memory Used: 704KB, 8 allocs
Level -1: Memory Used: 776KB, 7 allocs
Level 0: Memory Used: 704KB, 7 allocs
Level 1: Memory Used: 1160KB, 13 allocs
Level 2: Memory Used: 776KB, 8 allocs
Level 3: Memory Used: 776KB, 8 allocs
Level 4: Memory Used: 776KB, 8 allocs
Level 5: Memory Used: 776KB, 8 allocs
Level 6: Memory Used: 776KB, 8 allocs
Level 7: Memory Used: 776KB, 8 allocs
Level 8: Memory Used: 776KB, 9 allocs
Level 9: Memory Used: 776KB, 8 allocs

After:
Level -2: Memory Used: 272KB, 12 allocs
Level -1: Memory Used: 1016KB, 7 allocs
Level 0: Memory Used: 304KB, 6 allocs
Level 1: Memory Used: 760KB, 13 allocs
Level 2: Memory Used: 1144KB, 8 allocs
Level 3: Memory Used: 1144KB, 8 allocs
Level 4: Memory Used: 888KB, 14 allocs
Level 5: Memory Used: 1016KB, 8 allocs
Level 6: Memory Used: 1016KB, 8 allocs
Level 7: Memory Used: 952KB, 7 allocs
Level 8: Memory Used: 952KB, 7 allocs
Level 9: Memory Used: 1080KB, 9 allocs

This package has been fuzz tested for about 24 hours. Currently, there
is about 1h between new "interesting" finds.

Change-Id: Icb4c9839dc8f1bb96fd6d548038679a7554a559b
---
 src/compress/flate/deflate.go                 | 858 +++++++++++-------
 src/compress/flate/deflate_test.go            | 703 +++-----------
 src/compress/flate/deflatefast.go             | 392 +++-----
 src/compress/flate/dict_decoder.go            |  11 +-
 src/compress/flate/example_test.go            |   3 +-
 src/compress/flate/fuzz_test.go               | 111 +++
 src/compress/flate/huffman_bit_writer.go      | 854 ++++++++++++-----
 src/compress/flate/huffman_bit_writer_test.go |  62 +-
 src/compress/flate/huffman_code.go            | 233 +++--
 src/compress/flate/huffman_sortByFreq.go      | 159 ++++
 src/compress/flate/huffman_sortByLiteral.go   | 201 ++++
 src/compress/flate/level1.go                  | 197 ++++
 src/compress/flate/level2.go                  | 187 ++++
 src/compress/flate/level3.go                  | 226 +++++
 src/compress/flate/level4.go                  | 204 +++++
 src/compress/flate/level5.go                  | 291 ++++++
 src/compress/flate/level6.go                  | 301 ++++++
 src/compress/flate/regmask_amd64.go           |  14 +
 src/compress/flate/regmask_other.go           |  18 +
 .../testdata/huffman-null-max.sync.expect     | Bin 0 -> 78 bytes
 .../huffman-null-max.sync.expect-noinput      | Bin 0 -> 78 bytes
 .../flate/testdata/huffman-pi.sync.expect     | Bin 0 -> 1696 bytes
 .../testdata/huffman-pi.sync.expect-noinput   | Bin 0 -> 1696 bytes
 .../huffman-rand-1k.dyn.expect-noinput        | Bin 1054 -> 1054 bytes
 .../testdata/huffman-rand-1k.sync.expect      | Bin 0 -> 1005 bytes
 .../huffman-rand-1k.sync.expect-noinput       | Bin 0 -> 1054 bytes
 .../testdata/huffman-rand-limit.dyn.expect    | Bin 229 -> 186 bytes
 .../huffman-rand-limit.dyn.expect-noinput     | Bin 229 -> 186 bytes
 .../flate/testdata/huffman-rand-limit.golden  | Bin 252 -> 246 bytes
 .../testdata/huffman-rand-limit.sync.expect   | Bin 0 -> 186 bytes
 .../huffman-rand-limit.sync.expect-noinput    | Bin 0 -> 186 bytes
 .../flate/testdata/huffman-shifts.sync.expect | Bin 0 -> 32 bytes
 .../huffman-shifts.sync.expect-noinput        | Bin 0 -> 32 bytes
 .../testdata/huffman-text-shift.sync.expect   | Bin 0 -> 231 bytes
 .../huffman-text-shift.sync.expect-noinput    | Bin 0 -> 231 bytes
 .../flate/testdata/huffman-text.sync.expect   |   1 +
 .../testdata/huffman-text.sync.expect-noinput |   1 +
 .../flate/testdata/huffman-zero.dyn.expect    | Bin 17 -> 6 bytes
 .../testdata/huffman-zero.dyn.expect-noinput  | Bin 17 -> 6 bytes
 .../flate/testdata/huffman-zero.sync.expect   | Bin 0 -> 6 bytes
 .../testdata/huffman-zero.sync.expect-noinput | Bin 0 -> 6 bytes
 .../null-long-match.sync.expect-noinput       | Bin 0 -> 206 bytes
 src/compress/flate/token.go                   | 253 +++++-
 src/compress/flate/unsafe_disabled.go         |  33 +
 src/compress/flate/writer_test.go             | 118 ++-
 45 files changed, 3894 insertions(+), 1537 deletions(-)
 create mode 100644 src/compress/flate/fuzz_test.go
 create mode 100644 src/compress/flate/huffman_sortByFreq.go
 create mode 100644 src/compress/flate/huffman_sortByLiteral.go
 create mode 100644 src/compress/flate/level1.go
 create mode 100644 src/compress/flate/level2.go
 create mode 100644 src/compress/flate/level3.go
 create mode 100644 src/compress/flate/level4.go
 create mode 100644 src/compress/flate/level5.go
 create mode 100644 src/compress/flate/level6.go
 create mode 100644 src/compress/flate/regmask_amd64.go
create mode 100644 src/compress/flate/regmask_other.go create mode 100644 src/compress/flate/testdata/huffman-null-max.sync.expect create mode 100644 src/compress/flate/testdata/huffman-null-max.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-pi.sync.expect create mode 100644 src/compress/flate/testdata/huffman-pi.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-rand-1k.sync.expect create mode 100644 src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-rand-limit.sync.expect create mode 100644 src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-shifts.sync.expect create mode 100644 src/compress/flate/testdata/huffman-shifts.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-text-shift.sync.expect create mode 100644 src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-text.sync.expect create mode 100644 src/compress/flate/testdata/huffman-text.sync.expect-noinput create mode 100644 src/compress/flate/testdata/huffman-zero.sync.expect create mode 100644 src/compress/flate/testdata/huffman-zero.sync.expect-noinput create mode 100644 src/compress/flate/testdata/null-long-match.sync.expect-noinput create mode 100644 src/compress/flate/unsafe_disabled.go diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go index 6697f3a7913cd5..3819f2e1eae81d 100644 --- a/src/compress/flate/deflate.go +++ b/src/compress/flate/deflate.go @@ -27,132 +27,121 @@ const ( // RFC 1951 compliant. That is, any valid DEFLATE decompressor will // continue to be able to decompress this output. HuffmanOnly = -2 -) -const ( - logWindowSize = 15 - windowSize = 1 << logWindowSize - windowMask = windowSize - 1 - - // The LZ77 step produces a sequence of literal tokens and - // pair tokens. The offset is also known as distance. The underlying wire - // format limits the range of lengths and offsets. For example, there are - // 256 legitimate lengths: those in the range [3, 258]. This package's - // compressor uses a higher minimum match length, enabling optimizations - // such as finding matches via 32-bit loads and compares. - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - minMatchLength = 4 // The smallest match length that the compressor actually emits - maxMatchLength = 258 // The largest match length - baseMatchOffset = 1 // The smallest match offset - maxMatchOffset = 1 << 15 // The largest match offset - - // The maximum number of tokens we put into a single flate block, just to - // stop things from getting too large. - maxFlateBlockTokens = 1 << 14 + logWindowSize = 15 + windowSize = 1 << logWindowSize + windowMask = windowSize - 1 + minMatchLength = 4 // The smallest match that the compressor looks for + maxMatchLength = 258 // The longest match for the compressor + minOffsetSize = 1 // The shortest offset that makes any sense + + // The maximum number of tokens we will encode at the time. + // Smaller sizes usually creates less optimal blocks. + // Bigger can make context switching slow. + // We use this for levels 7-9, so we make it big. 
+ maxFlateBlockTokens = 1 << 15 maxStoreBlockSize = 65535 hashBits = 17 // After 17 performance degrades hashSize = 1 << hashBits hashMask = (1 << hashBits) - 1 - maxHashOffset = 1 << 24 + maxHashOffset = 1 << 28 skipNever = math.MaxInt32 ) type compressionLevel struct { - level, good, lazy, nice, chain, fastSkipHashing int + good, lazy, nice, chain, level int } var levels = []compressionLevel{ - {0, 0, 0, 0, 0, 0}, // NoCompression. - {1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go. - // For levels 2-3 we don't bother trying with lazy matches. - {2, 4, 0, 16, 8, 5}, - {3, 4, 0, 32, 32, 6}, - // Levels 4-9 use increasingly more lazy matching + {}, // 0 + // Level 1-6 uses specialized algorithm - values not used + {0, 0, 0, 0, 1}, + {0, 0, 0, 0, 2}, + {0, 0, 0, 0, 3}, + {0, 0, 0, 0, 4}, + {0, 0, 0, 0, 5}, + {0, 0, 0, 0, 6}, + // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {4, 4, 4, 16, 16, skipNever}, - {5, 8, 16, 32, 32, skipNever}, - {6, 8, 16, 128, 128, skipNever}, - {7, 8, 32, 128, 256, skipNever}, - {8, 32, 128, 258, 1024, skipNever}, - {9, 32, 258, 258, 4096, skipNever}, + {8, 12, 16, 24, 7}, + {16, 30, 40, 64, 8}, + {32, 258, 258, 1024, 9}, } -type compressor struct { - compressionLevel +// advancedState contains state for the advanced levels, with bigger hash tables, etc. +type advancedState struct { + // deflate state + length int + offset int + maxInsertIndex int + chainHead int + hashOffset int - w *huffmanBitWriter - bulkHasher func([]byte, []uint32) + ii uint16 // position of last match, intended to overflow to reset. - // compression algorithm - fill func(*compressor, []byte) int // copy data to window - step func(*compressor) // process window - bestSpeed *deflateFast // Encoder for BestSpeed + // input window: unprocessed data is window[index:windowEnd] + index int + hashMatch [maxMatchLength + minMatchLength]uint32 // Input hash chains // hashHead[hashValue] contains the largest inputIndex with the specified hash value // If hashHead[hashValue] is within the current window, then // hashPrev[hashHead[hashValue] & windowMask] contains the previous index // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 +} - // input window: unprocessed data is window[index:windowEnd] - index int - window []byte - windowEnd int - blockStart int // window index where current tokens start - byteAvailable bool // if true, still need to process window[index-1]. +type compressor struct { + compressionLevel - sync bool // requesting flush + h *huffmanEncoder + w *huffmanBitWriter - // queued output tokens - tokens []token + // compression algorithm + fill func(*compressor, []byte) int // copy data to window + step func(*compressor) // process window - // deflate state - length int - offset int - maxInsertIndex int - err error + window []byte + windowEnd int + blockStart int // window index where current tokens start + err error + + // queued output tokens + tokens tokens + fast fastEnc + state *advancedState - // hashMatch must be able to contain hashes for the maximum match length. - hashMatch [maxMatchLength - 1]uint32 + sync bool // requesting flush + byteAvailable bool // if true, still need to process window[index-1]. 
} func (d *compressor) fillDeflate(b []byte) int { - if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) { + s := d.state + if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window, d.window[windowSize:2*windowSize]) - d.index -= windowSize + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) + s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { d.blockStart -= windowSize } else { d.blockStart = math.MaxInt32 } - d.hashOffset += windowSize - if d.hashOffset > maxHashOffset { - delta := d.hashOffset - 1 - d.hashOffset -= delta - d.chainHead -= delta - + s.hashOffset += windowSize + if s.hashOffset > maxHashOffset { + delta := s.hashOffset - 1 + s.hashOffset -= delta + s.chainHead -= delta // Iterate over slices instead of arrays to avoid copying // the entire table onto the stack (Issue #18625). - for i, v := range d.hashPrev[:] { - if int(v) > delta { - d.hashPrev[i] = uint32(int(v) - delta) - } else { - d.hashPrev[i] = 0 - } + for i, v := range s.hashPrev[:] { + s.hashPrev[i] = uint32(max(int(v)-delta, 0)) } - for i, v := range d.hashHead[:] { - if int(v) > delta { - d.hashHead[i] = uint32(int(v) - delta) - } else { - d.hashHead[i] = 0 - } + for i, v := range s.hashHead[:] { + s.hashHead[i] = uint32(max(int(v)-delta, 0)) } } } @@ -161,14 +150,38 @@ func (d *compressor) fillDeflate(b []byte) int { return n } -func (d *compressor) writeBlock(tokens []token, index int) error { - if index > 0 { +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { + if index > 0 || eof { var window []byte if d.blockStart <= index { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tokens, false, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) + return d.w.err + } + return nil +} + +// writeBlockSkip writes the current block and uses the number of tokens +// to determine if the block should be stored on no matches, or +// only huffman encoded. +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { + if index > 0 || eof { + if d.blockStart <= index { + window := d.window[d.blockStart:index] + // If we removed less than a 64th of all literals + // we huffman compress the block. + if int(tok.n) > len(window)-int(tok.n>>6) { + d.w.writeBlockHuff(eof, window, d.sync) + } else { + // Write a dynamic huffman block. + d.w.writeBlockDynamic(tok, eof, window, d.sync) + } + } else { + d.w.writeBlock(tok, eof, nil) + } + d.blockStart = index return d.w.err } return nil @@ -177,103 +190,139 @@ func (d *compressor) writeBlock(tokens []token, index int) error { // fillWindow will fill the current window with the supplied // dictionary and calculate all hashes. // This is much faster than doing a full encode. -// Should only be used after a reset. +// Should only be used after a start/reset. func (d *compressor) fillWindow(b []byte) { - // Do not fill window if we are in store-only mode. - if d.compressionLevel.level < 2 { + // Do not fill window if we are in store-only or huffman mode. + if d.level <= 0 { return } - if d.index != 0 || d.windowEnd != 0 { - panic("internal error: fillWindow called with stale data") + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() + return } - + s := d.state // If we are given too much, cut it. 
if len(b) > windowSize { b = b[len(b)-windowSize:] } // Add all to window. - n := copy(d.window, b) + n := copy(d.window[d.windowEnd:], b) // Calculate 256 hashes at the time (more L1 cache hits) loops := (n + 256 - minMatchLength) / 256 - for j := 0; j < loops; j++ { - index := j * 256 - end := index + 256 + minMatchLength - 1 - if end > n { - end = n - } - toCheck := d.window[index:end] - dstSize := len(toCheck) - minMatchLength + 1 + for j := range loops { + startindex := j * 256 + end := min(startindex+256+minMatchLength-1, n) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 if dstSize <= 0 { continue } - dst := d.hashMatch[:dstSize] - d.bulkHasher(toCheck, dst) + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 for i, val := range dst { - di := i + index - hh := &d.hashHead[val&hashMask] + di := i + startindex + newH = val & hashMask // Get previous value with the same hash. // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = *hh + s.hashPrev[di&windowMask] = s.hashHead[newH] // Set the head of the hash chain to us. - *hh = uint32(di + d.hashOffset) + s.hashHead[newH] = uint32(di + s.hashOffset) } } // Update window information. - d.windowEnd = n - d.index = n + d.windowEnd += n + s.index = n } // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. -func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } +func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) { + minMatchLook := min(lookahead, maxMatchLength) win := d.window[0 : pos+minMatchLook] // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } + nice := min(d.nice, len(win)-pos) // If we've got a match that's good enough, only look in 1/4 the chain. tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } + length = minMatchLength - 1 wEnd := win[pos+length] wPos := win[pos:] - minIndex := pos - windowSize + minIndex := max(pos-windowSize, 0) + offset = 0 + + if d.chain < 100 { + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + length = n + offset = pos - i + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return + } + + // Minimum gain to accept a match. + cGain := 4 + + // Some like it higher (CSV), some like it lower (JSON) + const baseCost = 3 + // Base is 4 bytes at with an additional cost. + // Matches must be better than this. for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + // Calculate gain. Estimates the gains of the new match compared to emitting as literals. 
+ newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) + + if newGain > cGain { + length = n + offset = pos - i + cGain = newGain + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] } - wEnd = win[pos+n] } } - if i == minIndex { + if i <= minIndex { // hashPrev[i & windowMask] has already been overwritten, so stop now. break } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { break } } @@ -288,235 +337,272 @@ func (d *compressor) writeStoredBlock(buf []byte) error { return d.w.err } -const hashmul = 0x1e35a7bd - // hash4 returns a hash representation of the first 4 bytes // of the supplied slice. // The caller must ensure that len(b) >= 4. func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) + return hash4u(loadLE32(b, 0), hashBits) +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) } // bulkHash4 will compute hashes using the same -// algorithm as hash4. +// algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { + if len(b) < 4 { return } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 - for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) - } -} - -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } - } - return max -} - -// encSpeed will compress and store the currently added data, -// if enough has been accumulated or we at the end of the stream. -// Any error that occurred will be in d.err -func (d *compressor) encSpeed() { - // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { - if !d.sync { - return - } - - // Handle small sizes. - if d.windowEnd < 128 { - switch { - case d.windowEnd == 0: - return - case d.windowEnd <= 16: - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - default: - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - } - d.windowEnd = 0 - d.bestSpeed.reset() - return - } - - } - // Encode the block. - d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd]) + hb := loadLE32(b, 0) - // If we removed less than 1/16th, Huffman compress the block. 
- if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - } else { - d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd]) + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 + for i := 1; i < end; i++ { + hb = (hb >> 8) | uint32(b[i+3])<<24 + dst[i] = hash4u(hb, hashBits) } - d.err = d.w.err - d.windowEnd = 0 } func (d *compressor) initDeflate() { d.window = make([]byte, 2*windowSize) - d.hashOffset = 1 - d.tokens = make([]token, 0, maxFlateBlockTokens+1) - d.length = minMatchLength - 1 - d.offset = 0 d.byteAvailable = false - d.index = 0 - d.chainHead = -1 - d.bulkHasher = bulkHash4 + d.err = nil + if d.state == nil { + return + } + s := d.state + s.index = 0 + s.hashOffset = 1 + s.length = minMatchLength - 1 + s.offset = 0 + s.chainHead = -1 } -func (d *compressor) deflate() { - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { +// deflateLazy does encoding with lazy matching. +func (d *compressor) deflateLazy() { + s := d.state + + if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + // This is used to estimate the cost of emitting a literal. + if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) + } - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) + s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) -Loop: for { - if d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index + lookahead := d.windowEnd - s.index if lookahead < minMatchLength+maxMatchLength { if !d.sync { - break Loop - } - if d.index > d.windowEnd { - panic("index > windowEnd") + return } if lookahead == 0 { // Flush current output block if any. 
if d.byteAvailable { // There is still one pending token that needs to be flushed - d.tokens = append(d.tokens, literalToken(uint32(d.window[d.index-1]))) + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false } - if len(d.tokens) > 0 { - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.tokens.n > 0 { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - break Loop + return } } - if d.index < d.maxInsertIndex { + if s.index < s.maxInsertIndex { // Update the hash - hash := hash4(d.window[d.index : d.index+minMatchLength]) - hh := &d.hashHead[hash&hashMask] - d.chainHead = int(*hh) - d.hashPrev[d.index&windowMask] = uint32(d.chainHead) - *hh = uint32(d.index + d.hashOffset) + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[hash] = uint32(s.index + s.hashOffset) } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 + prevLength := s.length + prevOffset := s.offset + s.length = minMatchLength - 1 + s.offset = 0 + minIndex := max(s.index-windowSize, 0) + + if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { + s.length = newLength + s.offset = newOffset + } } - if d.chainHead-d.hashOffset >= minIndex && - (d.fastSkipHashing != skipNever && lookahead > minMatchLength-1 || - d.fastSkipHashing == skipNever && lookahead > prevLength && prevLength < d.lazy) { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset + if prevLength >= minMatchLength && s.length <= prevLength { + // No better match, but check for better match at end... + // + // Skip forward a number of bytes. + // Offset of 2 seems to yield the best results. 3 is sometimes better. + const checkOff = 2 + + // Check all, except full length + if prevLength < maxMatchLength-checkOff { + prevIndex := s.index - 1 + if prevIndex+prevLength < s.maxInsertIndex { + end := min(lookahead, maxMatchLength+checkOff) + end += prevIndex + + // Hash at match end. + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + + // Extend back... 
+ for i := checkOff - 1; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } + } + } } - } - if d.fastSkipHashing != skipNever && d.length >= minMatchLength || - d.fastSkipHashing == skipNever && prevLength >= minMatchLength && d.length <= prevLength { // There was a match at the previous step, and the current match is // not better. Output the previous match. - if d.fastSkipHashing != skipNever { - d.tokens = append(d.tokens, matchToken(uint32(d.length-baseMatchLength), uint32(d.offset-baseMatchOffset))) - } else { - d.tokens = append(d.tokens, matchToken(uint32(prevLength-baseMatchLength), uint32(prevOffset-baseMatchOffset))) - } + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) + // Insert in the hash table all strings up to the end of the match. // index and index-1 are already inserted. If there is not enough // lookahead, the last two strings are not inserted into the hash // table. - if d.length <= d.fastSkipHashing { - var newIndex int - if d.fastSkipHashing != skipNever { - newIndex = d.index + d.length - } else { - newIndex = d.index + prevLength - 1 - } - index := d.index - for index++; index < newIndex; index++ { - if index < d.maxInsertIndex { - hash := hash4(d.window[index : index+minMatchLength]) - // Get previous value with the same hash. - // Our chain should point to the previous value. - hh := &d.hashHead[hash&hashMask] - d.hashPrev[index&windowMask] = *hh - // Set the head of the hash chain to us. - *hh = uint32(index + d.hashOffset) - } + newIndex := s.index + prevLength - 1 + // Calculate missing hashes + end := min(newIndex, s.maxInsertIndex) + end += minMatchLength - 1 + startindex := min(s.index+1, s.maxInsertIndex) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 + if dstSize > 0 { + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 + for i, val := range dst { + di := i + startindex + newH = val & hashMask + // Get previous value with the same hash. + // Our chain should point to the previous value. + s.hashPrev[di&windowMask] = s.hashHead[newH] + // Set the head of the hash chain to us. + s.hashHead[newH] = uint32(di + s.hashOffset) } - d.index = index - - if d.fastSkipHashing == skipNever { - d.byteAvailable = false - d.length = minMatchLength - 1 - } - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. 
- d.index += d.length } - if len(d.tokens) == maxFlateBlockTokens { + + s.index = newIndex + d.byteAvailable = false + s.length = minMatchLength - 1 + if d.tokens.n == maxFlateBlockTokens { // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } + s.ii = 0 } else { - if d.fastSkipHashing != skipNever || d.byteAvailable { - i := d.index - 1 - if d.fastSkipHashing != skipNever { - i = d.index - } - d.tokens = append(d.tokens, literalToken(uint32(d.window[i]))) - if len(d.tokens) == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, i+1); d.err != nil { + // Reset, if we got a match this run. + if s.length >= minMatchLength { + s.ii = 0 + } + // We have a byte waiting. Emit it. + if d.byteAvailable { + s.ii++ + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - } - d.index++ - if d.fastSkipHashing == skipNever { + s.index++ + + // If we have a long run of no matches, skip additional bytes + // Resets when s.ii overflows after 64KB. + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) + for j := 0; j < n; j++ { + if s.index >= d.windowEnd-1 { + break + } + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + // Index... + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + s.index++ + } + // Flush last byte + d.tokens.AddLiteral(d.window[s.index-1]) + d.byteAvailable = false + // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + } + } else { + s.index++ d.byteAvailable = true } } } } -func (d *compressor) fillStore(b []byte) int { - n := copy(d.window[d.windowEnd:], b) - d.windowEnd += n - return n -} - func (d *compressor) store() { if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) @@ -524,38 +610,93 @@ func (d *compressor) store() { } } -// storeHuff compresses and stores the currently added data -// when the d.window is full or we are at the end of the stream. +// fillWindow will fill the buffer with data for huffman-only compression. +// The number of bytes copied is returned. +func (d *compressor) fillBlock(b []byte) int { + n := copy(d.window[d.windowEnd:], b) + d.windowEnd += n + return n +} + +// storeHuff will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. // Any error that occurred will be in d.err func (d *compressor) storeHuff() { if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { return } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err d.windowEnd = 0 } +// storeFast will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. 
+// Any error that occurred will be in d.err +func (d *compressor) storeFast() { + // We only compress if we have maxStoreBlockSize. + if d.windowEnd < len(d.window) { + if !d.sync { + return + } + // Handle extremely small sizes. + if d.windowEnd < 128 { + if d.windowEnd == 0 { + return + } + if d.windowEnd <= 32 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + } else { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 + d.fast.Reset() + return + } + } + + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) + // If we made zero matches, store the block as is. + if d.tokens.n == 0 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + // If we removed less than 1/16th, huffman compress the block. + } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } else { + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 +} + +// write will add input byte to the stream. +// Unless an error occurs all bytes will be consumed. func (d *compressor) write(b []byte) (n int, err error) { if d.err != nil { return 0, d.err } n = len(b) for len(b) > 0 { - d.step(d) + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } b = b[d.fill(d, b):] if d.err != nil { return 0, d.err } } - return n, nil + return n, d.err } func (d *compressor) syncFlush() error { + d.sync = true if d.err != nil { return d.err } - d.sync = true d.step(d) if d.err == nil { d.w.writeStoredHeader(0, false) @@ -572,30 +713,33 @@ func (d *compressor) init(w io.Writer, level int) (err error) { switch { case level == NoCompression: d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == HuffmanOnly: - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.w.logNewTablePenalty = 10 + d.window = make([]byte, 32<<10) + d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff - case level == BestSpeed: - d.compressionLevel = levels[level] - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore - d.step = (*compressor).encSpeed - d.bestSpeed = newDeflateFast() - d.tokens = make([]token, maxStoreBlockSize) case level == DefaultCompression: level = 6 fallthrough - case 2 <= level && level <= 9: + case level >= 1 && level <= 6: + d.w.logNewTablePenalty = 7 + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logNewTablePenalty = 8 + d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate - d.step = (*compressor).deflate + d.step = (*compressor).deflateLazy default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } + d.level = level return nil } @@ -603,27 +747,39 @@ func (d *compressor) reset(w io.Writer) { d.w.reset(w) d.sync = false d.err = nil - switch d.compressionLevel.level { - case NoCompression: + // We only need to reset a few things for Snappy. + if d.fast != nil { + d.fast.Reset() d.windowEnd = 0 - case BestSpeed: + d.tokens.Reset() + return + } + switch d.compressionLevel.chain { + case 0: + // level was NoCompression or ConstantCompression. 
d.windowEnd = 0 - d.tokens = d.tokens[:0] - d.bestSpeed.reset() default: - d.chainHead = -1 - clear(d.hashHead[:]) - clear(d.hashPrev[:]) - d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 + s := d.state + s.chainHead = -1 + for i := range s.hashHead { + s.hashHead[i] = 0 + } + for i := range s.hashPrev { + s.hashPrev[i] = 0 + } + s.hashOffset = 1 + s.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false - d.tokens = d.tokens[:0] - d.length = minMatchLength - 1 - d.offset = 0 - d.maxInsertIndex = 0 + d.tokens.Reset() + s.length = minMatchLength - 1 + s.offset = 0 + s.ii = 0 + s.maxInsertIndex = 0 } } +var errWriterClosed = errors.New("flate: closed writer") + func (d *compressor) close() error { if d.err == errWriterClosed { return nil @@ -644,6 +800,7 @@ func (d *compressor) close() error { return d.w.err } d.err = errWriterClosed + d.w.reset(nil) return nil } @@ -674,26 +831,15 @@ func NewWriter(w io.Writer, level int) (*Writer, error) { // can only be decompressed by a reader initialized with the // same dictionary (see [NewReaderDict]). func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { - dw := &dictWriter{w} - zw, err := NewWriter(dw, level) + zw, err := NewWriter(w, level) if err != nil { return nil, err } zw.d.fillWindow(dict) zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method. - return zw, nil -} - -type dictWriter struct { - w io.Writer + return zw, err } -func (w *dictWriter) Write(b []byte) (n int, err error) { - return w.w.Write(b) -} - -var errWriterClosed = errors.New("flate: closed writer") - // A Writer takes data written to it and writes the compressed // form of that data to an underlying writer (see [NewWriter]). type Writer struct { @@ -728,16 +874,26 @@ func (w *Writer) Close() error { } // Reset discards the writer's state and makes it equivalent to -// the result of [NewWriter] or [NewWriterDict] called with dst +// the result of NewWriter or NewWriterDict called with dst // and w's level and dictionary. func (w *Writer) Reset(dst io.Writer) { - if dw, ok := w.d.w.writer.(*dictWriter); ok { + if len(w.dict) > 0 { // w was created with NewWriterDict - dw.w = dst - w.d.reset(dw) - w.d.fillWindow(w.dict) + w.d.reset(dst) + if dst != nil { + w.d.fillWindow(w.dict) + } } else { // w was created with NewWriter w.d.reset(dst) } } + +// ResetDict discards the writer's state and makes it equivalent to +// the result of NewWriter or NewWriterDict called with dst +// and w's level, but sets a specific dictionary. 
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) { + w.dict = dict + w.d.reset(dst) + w.d.fillWindow(w.dict) +} diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go index 3610c7bf8763df..4bb89c61dcad0c 100644 --- a/src/compress/flate/deflate_test.go +++ b/src/compress/flate/deflate_test.go @@ -6,14 +6,11 @@ package flate import ( "bytes" - "errors" "fmt" - "internal/testenv" "io" - "math/rand" "os" "reflect" - "runtime/debug" + "strings" "sync" "testing" ) @@ -35,24 +32,24 @@ type reverseBitsTest struct { } var deflateTests = []*deflateTest{ - {[]byte{}, 0, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, -1, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, DefaultCompression, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, 4, []byte{18, 4, 4, 0, 0, 255, 255}}, - - {[]byte{0x11}, 0, []byte{0, 1, 0, 254, 255, 17, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 0, []byte{0, 2, 0, 253, 255, 17, 18, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, - []byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255}, + 0: {[]byte{}, 0, []byte{0x3, 0x0}}, + 1: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 2: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 3: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + + 4: {[]byte{0x11}, 0, []byte{0x0, 0x1, 0x0, 0xfe, 0xff, 0x11, 0x3, 0x0}}, + 5: {[]byte{0x11, 0x12}, 0, []byte{0x0, 0x2, 0x0, 0xfd, 0xff, 0x11, 0x12, 0x3, 0x0}}, + 6: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, + []byte{0x0, 0x8, 0x0, 0xf7, 0xff, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x3, 0x0}, }, - {[]byte{}, 2, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, - {[]byte{}, 9, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, + 7: {[]byte{}, 1, []byte{0x3, 0x0}}, + 8: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 9: {[]byte{0x11, 0x12}, BestCompression, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 10: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, BestCompression, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, + 11: {[]byte{}, 9, []byte{0x3, 0x0}}, + 12: {[]byte{0x11}, 9, []byte{0x12, 0x4, 0xc, 0x0}}, + 13: {[]byte{0x11, 0x12}, 9, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 14: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, } var deflateInflateTests = []*deflateInflateTest{ @@ -86,23 +83,24 @@ func largeDataChunk() []byte { func TestBulkHash4(t *testing.T) { for _, x := range deflateTests { y := x.out - if len(y) < minMatchLength { - continue - } - y = append(y, y...) - for j := 4; j < len(y); j++ { - y := y[:j] - dst := make([]uint32, len(y)-minMatchLength+1) - for i := range dst { - dst[i] = uint32(i + 100) - } - bulkHash4(y, dst) - for i, got := range dst { - want := hash4(y[i:]) - if got != want && got == uint32(i)+100 { - t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want) - } else if got != want { - t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want) + if len(y) >= minMatchLength { + y = append(y, y...) 
+ for j := 4; j < len(y); j++ { + y := y[:j] + dst := make([]uint32, len(y)-minMatchLength+1) + for i := range dst { + dst[i] = uint32(i + 100) + } + bulkHash4(y, dst) + for i, got := range dst { + want := hash4(y[i:]) + if got != want && got == uint32(i)+100 { + t.Errorf("Len:%d Index:%d, expected 0x%08x but not modified", len(y), i, want) + } else if got != want { + t.Errorf("Len:%d Index:%d, got 0x%08x expected:0x%08x", len(y), i, got, want) + } else { + //t.Logf("Len:%d Index:%d OK (0x%08x)", len(y), i, got) + } } } } @@ -110,7 +108,7 @@ func TestBulkHash4(t *testing.T) { } func TestDeflate(t *testing.T) { - for _, h := range deflateTests { + for i, h := range deflateTests { var buf bytes.Buffer w, err := NewWriter(&buf, h.level) if err != nil { @@ -120,45 +118,11 @@ func TestDeflate(t *testing.T) { w.Write(h.in) w.Close() if !bytes.Equal(buf.Bytes(), h.out) { - t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out) + t.Errorf("%d: Deflate(%d, %x) got \n%#v, want \n%#v", i, h.level, h.in, buf.Bytes(), h.out) } } } -func TestWriterClose(t *testing.T) { - b := new(bytes.Buffer) - zw, err := NewWriter(b, 6) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - if c, err := zw.Write([]byte("Test")); err != nil || c != 4 { - t.Fatalf("Write to not closed writer: %s, %d", err, c) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - afterClose := b.Len() - - if c, err := zw.Write([]byte("Test")); err == nil || c != 0 { - t.Fatalf("Write to closed writer: %v, %d", err, c) - } - - if err := zw.Flush(); err == nil { - t.Fatalf("Flush to closed writer: %s", err) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - if afterClose != b.Len() { - t.Fatalf("Writer wrote data after close. After close: %d. After writes on closed stream: %d", afterClose, b.Len()) - } -} - // A sparseReader returns a stream consisting of 0s followed by 1<<16 1s. // This tests missing hash references in a very large input. type sparseReader struct { @@ -191,7 +155,8 @@ func TestVeryLongSparseChunk(t *testing.T) { if testing.Short() { t.Skip("skipping sparse chunk during short test") } - w, err := NewWriter(io.Discard, 1) + var buf bytes.Buffer + w, err := NewWriter(&buf, 1) if err != nil { t.Errorf("NewWriter: %v", err) return @@ -200,6 +165,7 @@ func TestVeryLongSparseChunk(t *testing.T) { t.Errorf("Compress failed: %v", err) return } + t.Log("Length:", buf.Len()) } type syncBuffer struct { @@ -270,7 +236,7 @@ func testSync(t *testing.T, level int, input []byte, name string) { r := NewReader(buf) // Write half the input and read back. 
- for i := 0; i < 2; i++ { + for i := range 2 { var lo, hi int if i == 0 { lo, hi = 0, (len(input)+1)/2 @@ -348,13 +314,13 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } w.Write(input) w.Close() + if limit > 0 { + t.Logf("level: %d - Size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) + } if limit > 0 && buffer.Len() > limit { t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit) - return - } - if limit > 0 { - t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) } + r := NewReader(&buffer) out, err := io.ReadAll(r) if err != nil { @@ -363,6 +329,8 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } r.Close() if !bytes.Equal(input, out) { + os.WriteFile("testdata/fails/"+t.Name()+".got", out, os.ModePerm) + os.WriteFile("testdata/fails/"+t.Name()+".want", input, os.ModePerm) t.Errorf("decompress(compress(data)) != data: level=%d input=%s", level, name) return } @@ -370,19 +338,14 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } func testToFromWithLimit(t *testing.T, input []byte, name string, limit [11]int) { - for i := 0; i < 10; i++ { + for i := range 10 { testToFromWithLevelAndLimit(t, i, input, name, limit[i]) } - // Test HuffmanCompression testToFromWithLevelAndLimit(t, -2, input, name, limit[10]) } func TestDeflateInflate(t *testing.T) { - t.Parallel() for i, h := range deflateInflateTests { - if testing.Short() && len(h.in) > 10000 { - continue - } testToFromWithLimit(t, h.in, fmt.Sprintf("#%d", i), [11]int{}) } } @@ -399,33 +362,38 @@ func TestReverseBits(t *testing.T) { type deflateInflateStringTest struct { filename string label string - limit [11]int + limit [11]int // Number 11 is ConstantCompression } var deflateInflateStringTests = []deflateInflateStringTest{ { "../testdata/e.txt", "2.718281828...", - [...]int{100018, 50650, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683}, + [...]int{100018, 67900, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683 + 100}, }, { "../../testdata/Isaac.Newton-Opticks.txt", "Isaac.Newton-Opticks", - [...]int{567248, 218338, 198211, 193152, 181100, 175427, 175427, 173597, 173422, 173422, 325240}, + [...]int{567248, 218338, 201354, 199101, 190627, 182587, 179765, 174982, 173422, 173422, 325240}, }, } func TestDeflateInflateString(t *testing.T) { - t.Parallel() - if testing.Short() && testenv.Builder() == "" { - t.Skip("skipping in short mode") - } for _, test := range deflateInflateStringTests { gold, err := os.ReadFile(test.filename) if err != nil { t.Error(err) } - testToFromWithLimit(t, gold, test.label, test.limit) + // Remove returns that may be present on Windows + neutral := strings.Map(func(r rune) rune { + if r != '\r' { + return r + } + return -1 + }, string(gold)) + + testToFromWithLimit(t, []byte(neutral), test.label, test.limit) + if testing.Short() { break } @@ -460,31 +428,36 @@ func TestReaderDict(t *testing.T) { func TestWriterDict(t *testing.T) { const ( - dict = "hello world" - text = "hello again world" + dict = "hello world Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." 
+ text = "hello world Lorem ipsum dolor sit amet" ) - var b bytes.Buffer - w, err := NewWriter(&b, 5) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - w.Write([]byte(dict)) - w.Flush() - b.Reset() - w.Write([]byte(text)) - w.Close() + // This test is sensitive to algorithm changes that skip + // data in favour of speed. Higher levels are less prone to this + // so we test level 4-9. + for l := 4; l < 9; l++ { + var b bytes.Buffer + w, err := NewWriter(&b, l) + if err != nil { + t.Fatalf("level %d, NewWriter: %v", l, err) + } + w.Write([]byte(dict)) + w.Flush() + b.Reset() + w.Write([]byte(text)) + w.Close() - var b1 bytes.Buffer - w, _ = NewWriterDict(&b1, 5, []byte(dict)) - w.Write([]byte(text)) - w.Close() + var b1 bytes.Buffer + w, _ = NewWriterDict(&b1, l, []byte(dict)) + w.Write([]byte(text)) + w.Close() - if !bytes.Equal(b1.Bytes(), b.Bytes()) { - t.Fatalf("writer wrote %q want %q", b1.Bytes(), b.Bytes()) + if !bytes.Equal(b1.Bytes(), b.Bytes()) { + t.Errorf("level %d, writer wrote\n%v\n want\n%v", l, b1.Bytes(), b.Bytes()) + } } } -// See https://golang.org/issue/2508 +// See http://code.google.com/p/go/issues/detail?id=2508 func TestRegression2508(t *testing.T) { if testing.Short() { t.Logf("test disabled with -short") @@ -495,7 +468,7 @@ func TestRegression2508(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := make([]byte, 1024) - for i := 0; i < 131072; i++ { + for range 131072 { if _, err := w.Write(buf); err != nil { t.Fatalf("writer failed: %v", err) } @@ -504,8 +477,10 @@ func TestRegression2508(t *testing.T) { } func TestWriterReset(t *testing.T) { - t.Parallel() - for level := 0; level <= 9; level++ { + for level := -2; level <= 9; level++ { + if level == -1 { + level++ + } if testing.Short() && level > 1 { break } @@ -514,11 +489,7 @@ func TestWriterReset(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := []byte("hello world") - n := 1024 - if testing.Short() { - n = 10 - } - for i := 0; i < n; i++ { + for range 1024 { w.Write(buf) } w.Reset(io.Discard) @@ -531,12 +502,12 @@ func TestWriterReset(t *testing.T) { // DeepEqual doesn't compare functions. w.d.fill, wref.d.fill = nil, nil w.d.step, wref.d.step = nil, nil - w.d.bulkHasher, wref.d.bulkHasher = nil, nil - w.d.bestSpeed, wref.d.bestSpeed = nil, nil + w.d.state, wref.d.state = nil, nil + w.d.fast, wref.d.fast = nil, nil + // hashMatch is always overwritten when used. - copy(w.d.hashMatch[:], wref.d.hashMatch[:]) - if len(w.d.tokens) != 0 { - t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens)) + if w.d.tokens.n != 0 { + t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, w.d.tokens.n) } // As long as the length is 0, we don't care about the content. 
w.d.tokens = wref.d.tokens @@ -548,76 +519,64 @@ func TestWriterReset(t *testing.T) { } } - levels := []int{0, 1, 2, 5, 9} - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("level-", i), func(w io.Writer) (*Writer, error) { return NewWriter(w, i) }) + } + dict := []byte(strings.Repeat("we are the world - how are you?", 3)) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-level-", i), func(w io.Writer) (*Writer, error) { return NewWriterDict(w, i, dict) }) + } + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-reset-level-", i), func(w io.Writer) (*Writer, error) { + w2, err := NewWriter(nil, i) + if err != nil { + return w2, err + } + w2.ResetDict(w, dict) + return w2, nil }) } - - t.Run("dict", func(t *testing.T) { - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) - }) - } - }) } -func testResetOutput(t *testing.T, level int, dict []byte) { - writeData := func(w *Writer) { - msg := []byte("now is the time for all good gophers") - w.Write(msg) - w.Flush() - - hello := []byte("hello world") - for i := 0; i < 1024; i++ { - w.Write(hello) +func testResetOutput(t *testing.T, name string, newWriter func(w io.Writer) (*Writer, error)) { + t.Run(name, func(t *testing.T) { + buf := new(bytes.Buffer) + w, err := newWriter(buf) + if err != nil { + t.Fatalf("NewWriter: %v", err) } + b := []byte("hello world - how are you doing?") + for range 1024 { + w.Write(b) + } + w.Close() + out1 := buf.Bytes() - fill := bytes.Repeat([]byte("x"), 65000) - w.Write(fill) - } - - buf := new(bytes.Buffer) - var w *Writer - var err error - if dict == nil { - w, err = NewWriter(buf, level) - } else { - w, err = NewWriterDict(buf, level, dict) - } - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - writeData(w) - w.Close() - out1 := buf.Bytes() - - buf2 := new(bytes.Buffer) - w.Reset(buf2) - writeData(w) - w.Close() - out2 := buf2.Bytes() + buf2 := new(bytes.Buffer) + w.Reset(buf2) + for range 1024 { + w.Write(b) + } + w.Close() + out2 := buf2.Bytes() - if len(out1) != len(out2) { - t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) - return - } - if !bytes.Equal(out1, out2) { - mm := 0 - for i, b := range out1[:len(out2)] { - if b != out2[i] { - t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b) - } - mm++ - if mm == 10 { - t.Fatal("Stopping") + if len(out1) != len(out2) { + t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) + } + if !bytes.Equal(out1, out2) { + mm := 0 + for i, b := range out1[:len(out2)] { + if b != out2[i] { + t.Errorf("mismatch index %d: %02x, expected %02x", i, out2[i], b) + } + mm++ + if mm == 10 { + t.Fatal("Stopping") + } } } - } - t.Logf("got %d bytes", len(out1)) + t.Logf("got %d bytes", len(out1)) + }) } // TestBestSpeed tests that round-tripping through deflate and then inflate @@ -625,7 +584,6 @@ func testResetOutput(t *testing.T, level int, dict []byte) { // compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize // (65535). 
func TestBestSpeed(t *testing.T) { - t.Parallel() abc := make([]byte, 128) for i := range abc { abc[i] = byte(i) @@ -653,8 +611,8 @@ func TestBestSpeed(t *testing.T) { } for i, tc := range testCases { - if i >= 3 && testing.Short() { - break + if testing.Short() && i > 5 { + t.Skip() } for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} { tc[0] = firstN @@ -703,368 +661,3 @@ func TestBestSpeed(t *testing.T) { } } } - -var errIO = errors.New("IO error") - -// failWriter fails with errIO exactly at the nth call to Write. -type failWriter struct{ n int } - -func (w *failWriter) Write(b []byte) (int, error) { - w.n-- - if w.n == -1 { - return 0, errIO - } - return len(b), nil -} - -func TestWriterPersistentWriteError(t *testing.T) { - t.Parallel() - d, err := os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") - if err != nil { - t.Fatalf("ReadFile: %v", err) - } - d = d[:10000] // Keep this test short - - zw, err := NewWriter(nil, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - // Sweep over the threshold at which an error is returned. - // The variable i makes it such that the ith call to failWriter.Write will - // return errIO. Since failWriter errors are not persistent, we must ensure - // that flate.Writer errors are persistent. - for i := 0; i < 1000; i++ { - fw := &failWriter{i} - zw.Reset(fw) - - _, werr := zw.Write(d) - cerr := zw.Close() - ferr := zw.Flush() - if werr != errIO && werr != nil { - t.Errorf("test %d, mismatching Write error: got %v, want %v", i, werr, errIO) - } - if cerr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Close error: got %v, want %v", i, cerr, errIO) - } - if ferr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Flush error: got %v, want %v", i, ferr, errIO) - } - if fw.n >= 0 { - // At this point, the failure threshold was sufficiently high enough - // that we wrote the whole stream without any errors. - return - } - } -} -func TestWriterPersistentFlushError(t *testing.T) { - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - flushErr := zw.Flush() - closeErr := zw.Close() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) -} - -func TestWriterPersistentCloseError(t *testing.T) { - // If underlying writer return error on closing stream we should persistent this error across all writer calls. - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - closeErr := zw.Close() - flushErr := zw.Flush() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) - - // After closing writer we should persistent "write after close" error across Flush and Write calls, but return nil - // on next Close calls. 
- var b bytes.Buffer - zw.Reset(&b) - err = zw.Close() - if err != nil { - t.Fatalf("First call to close returned error: %s", err) - } - err = zw.Close() - if err != nil { - t.Fatalf("Second call to close returned error: %s", err) - } - - flushErr = zw.Flush() - _, writeErr = zw.Write([]byte("Test")) - checkErrors([]error{flushErr, writeErr}, errWriterClosed, t) -} - -func checkErrors(got []error, want error, t *testing.T) { - t.Helper() - for _, err := range got { - if err != want { - t.Errorf("Error doesn't match\nWant: %s\nGot: %s", want, got) - } - } -} - -func TestBestSpeedMatch(t *testing.T) { - t.Parallel() - cases := []struct { - previous, current []byte - t, s, want int32 - }{{ - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 6, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 3, - }, { - previous: []byte{0, 0, 0, 1, 1}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 2, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 4, - }, { - previous: []byte{0, 0, 0, 1, 2, 3, 4, 5, 2, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -7, - s: 4, - want: 5, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 0, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -5, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 3, - }, { - previous: []byte{3, 4, 5}, - current: []byte{3, 4, 5}, - t: -3, - s: 0, - want: 3, - }, { - previous: make([]byte, 1000), - current: make([]byte, 1000), - t: -1000, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: 0, - s: 1, - want: maxMatchLength - 4, - }, { - previous: make([]byte, maxMatchLength-4), - current: make([]byte, 500), - t: -(maxMatchLength - 4), - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 400, - want: 100, - }, { - previous: make([]byte, 10), - current: make([]byte, 500), - t: 200, - s: 400, - want: 100, - }} - for i, c := range cases { - e := deflateFast{prev: c.previous} - got := e.matchLen(c.s, c.t, c.current) - if got != c.want { - t.Errorf("Test %d: match length, want %d, got %d", i, c.want, got) - } - } -} - -func TestBestSpeedMaxMatchOffset(t *testing.T) { - t.Parallel() - const abc, xyz = "abcdefgh", "stuvwxyz" - for _, matchBefore := range []bool{false, true} { - for _, extra := range []int{0, inputMargin - 1, inputMargin, inputMargin + 1, 2 * inputMargin} { - for offsetAdj := -5; offsetAdj <= +5; offsetAdj++ { - report := func(desc string, err error) { - t.Errorf("matchBefore=%t, extra=%d, offsetAdj=%d: %s%v", - matchBefore, extra, offsetAdj, desc, err) - } - - offset := maxMatchOffset + offsetAdj - - // Make src to be a []byte of the form - // "%s%s%s%s%s" % (abc, zeros0, xyzMaybe, abc, zeros1) - // where: - // zeros0 is approximately maxMatchOffset zeros. - // xyzMaybe is either xyz or the empty string. 
- // zeros1 is between 0 and 30 zeros. - // The difference between the two abc's will be offset, which - // is maxMatchOffset plus or minus a small adjustment. - src := make([]byte, offset+len(abc)+extra) - copy(src, abc) - if !matchBefore { - copy(src[offset-len(xyz):], xyz) - } - copy(src[offset:], abc) - - buf := new(bytes.Buffer) - w, err := NewWriter(buf, BestSpeed) - if err != nil { - report("NewWriter: ", err) - continue - } - if _, err := w.Write(src); err != nil { - report("Write: ", err) - continue - } - if err := w.Close(); err != nil { - report("Writer.Close: ", err) - continue - } - - r := NewReader(buf) - dst, err := io.ReadAll(r) - r.Close() - if err != nil { - report("ReadAll: ", err) - continue - } - - if !bytes.Equal(dst, src) { - report("", fmt.Errorf("bytes differ after round-tripping")) - continue - } - } - } - } -} - -func TestBestSpeedShiftOffsets(t *testing.T) { - // Test if shiftoffsets properly preserves matches and resets out-of-range matches - // seen in https://github.com/golang/go/issues/4142 - enc := newDeflateFast() - - // testData may not generate internal matches. - testData := make([]byte, 32) - rng := rand.New(rand.NewSource(0)) - for i := range testData { - testData[i] = byte(rng.Uint32()) - } - - // Encode the testdata with clean state. - // Second part should pick up matches from the first block. - wantFirstTokens := len(enc.encode(nil, testData)) - wantSecondTokens := len(enc.encode(nil, testData)) - - if wantFirstTokens <= wantSecondTokens { - t.Fatalf("test needs matches between inputs to be generated") - } - // Forward the current indicator to before wraparound. - enc.cur = bufferReset - int32(len(testData)) - - // Part 1 before wrap, should match clean state. - got := len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } - - // Verify we are about to wrap. - if enc.cur != bufferReset { - t.Errorf("got %d, want e.cur to be at bufferReset (%d)", enc.cur, bufferReset) - } - - // Part 2 should match clean state as well even if wrapped. - got = len(enc.encode(nil, testData)) - if wantSecondTokens != got { - t.Errorf("got %d, want %d token", got, wantSecondTokens) - } - - // Verify that we wrapped. - if enc.cur >= bufferReset { - t.Errorf("want e.cur to be < bufferReset (%d), got %d", bufferReset, enc.cur) - } - - // Forward the current buffer, leaving the matches at the bottom. - enc.cur = bufferReset - enc.shiftOffsets() - - // Ensure that no matches were picked up. - got = len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } -} - -func TestMaxStackSize(t *testing.T) { - // This test must not run in parallel with other tests as debug.SetMaxStack - // affects all goroutines. - n := debug.SetMaxStack(1 << 16) - defer debug.SetMaxStack(n) - - var wg sync.WaitGroup - defer wg.Wait() - - b := make([]byte, 1<<20) - for level := HuffmanOnly; level <= BestCompression; level++ { - // Run in separate goroutine to increase probability of stack regrowth. 
- wg.Add(1) - go func(level int) { - defer wg.Done() - zw, err := NewWriter(io.Discard, level) - if err != nil { - t.Errorf("level %d, NewWriter() = %v, want nil", level, err) - } - if n, err := zw.Write(b); n != len(b) || err != nil { - t.Errorf("level %d, Write() = (%d, %v), want (%d, nil)", level, n, err, len(b)) - } - if err := zw.Close(); err != nil { - t.Errorf("level %d, Close() = %v, want nil", level, err) - } - zw.Reset(io.Discard) - }(level) - } -} diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go index e5554d6fb40842..e132c55951b5ef 100644 --- a/src/compress/flate/deflatefast.go +++ b/src/compress/flate/deflatefast.go @@ -4,304 +4,170 @@ package flate -import "math" - -// This encoding algorithm, which prioritizes speed over output size, is -// based on Snappy's LZ77-style encoder: github.com/golang/snappy - -const ( - tableBits = 14 // Bits used in the table. - tableSize = 1 << tableBits // Size of the table. - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - - // Reset the buffer offset when reaching this. - // Offsets are stored between blocks as int32 values. - // Since the offset we are checking against is at the beginning - // of the buffer, we need to subtract the current and input - // buffer to not risk overflowing the int32. - bufferReset = math.MaxInt32 - maxStoreBlockSize*2 +import ( + "math/bits" ) -func load32(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() } -func load64(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } } -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} +const ( + tableBits = 15 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 17 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxStoreBlockSize * 5 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. +) -// These constants are defined by the Snappy implementation so that its -// assembly implementation can fast-path some 16-bytes-at-a-time copies. 
They -// aren't necessary in the pure Go implementation, as we don't use those same -// optimizations, but using the same thresholds doesn't really hurt. const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 ) type tableEntry struct { - val uint32 // Value at destination offset int32 } -// deflateFast maintains the table for matches, -// and the previous byte block for cross block matching. -type deflateFast struct { - table [tableSize]tableEntry - prev []byte // Previous block, zero length if unknown. - cur int32 // Current match offset. -} - -func newDeflateFast() *deflateFast { - return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)} +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastGen struct { + hist []byte + cur int32 } -// encode encodes a block given in src and appends tokens -// to dst and returns the result. -func (e *deflateFast) encode(dst []token, src []byte) []token { - // Ensure that e.cur doesn't wrap. - if e.cur >= bufferReset { - e.shiftOffsets() - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return emitLiteral(dst, src) - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load32(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load32(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. 
- dst = emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchLen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))) - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] } } + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s +} -emitRemainder: - if int(nextEmit) < len(src) { - dst = emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) - return dst +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry } -func emitLiteral(dst []token, lit []byte) []token { - for _, v := range lit { - dst = append(dst, literalToken(uint32(v))) - } - return dst +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) } -// matchLen returns the match length between src[s:] and src[t:]. -// t can be negative to indicate the match is starting in e.prev. -// We assume that src[s-4:s] and src[t-4:t] already match. -func (e *deflateFast) matchLen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. 
Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably, length and mls should be a constant for inlining. +func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) } +} - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } +// matchLenLimited will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchLenLimited(s, t int, src []byte) int32 { + a := src[s:min(s+maxMatchLength-4, len(src))] + b := src[t:] + return int32(matchLen(a, b)) +} - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int, src []byte) int32 { + return int32(matchLen(src[s:], src[t:])) +} - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < allocHistory { + e.hist = make([]byte, 0, allocHistory) } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } + // We offset current position so everything will be out of reach. + // If we are above the buffer reset it will be cleared anyway since len(hist) == 0. + if e.cur <= bufferReset { + e.cur += maxMatchOffset + int32(len(e.hist)) } + e.hist = e.hist[:0] +} - // If we reached our limit, we matched everything we are - // allowed to in the previous block and we return. - n := int32(len(b)) - if int(s+n) == s1 { - return n +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. +func matchLen(a, b []byte) (n int) { + left := len(a) + for left >= 8 { + diff := loadLE64(a, n) ^ loadLE64(b, n) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + left -= 8 } - // Continue looking for more matches in the current block. - a = src[s+n : s1] - b = src[:len(a)] + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset resets the encoding history. -// This ensures that no matches are made to the previous block. -func (e *deflateFast) reset() { - e.prev = e.prev[:0] - // Bump the offset, so all matches will fail distance check. - // Nothing should be >= e.cur in the table. - e.cur += maxMatchOffset - - // Protect against e.cur wraparound. - if e.cur >= bufferReset { - e.shiftOffsets() - } -} - -// shiftOffsets will shift down all match offset. -// This is only called in rare situations to prevent integer overflow. -// -// See https://golang.org/issue/18636 and https://github.com/golang/go/issues/34121. 
-func (e *deflateFast) shiftOffsets() { - if len(e.prev) == 0 { - // We have no history; just clear the table. - clear(e.table[:]) - e.cur = maxMatchOffset + 1 - return - } - - // Shift down everything in the table that isn't already too far away. - for i := range e.table[:] { - v := e.table[i].offset - e.cur + maxMatchOffset + 1 - if v < 0 { - // We want to reset e.cur to maxMatchOffset + 1, so we need to shift - // all table entries down by (e.cur - (maxMatchOffset + 1)). - // Because we ignore matches > maxMatchOffset, we can cap - // any negative offsets at 0. - v = 0 + break } - e.table[i].offset = v + n++ } - e.cur = maxMatchOffset + 1 + return n } diff --git a/src/compress/flate/dict_decoder.go b/src/compress/flate/dict_decoder.go index d2c19040f54f53..cb855abc4ba1d7 100644 --- a/src/compress/flate/dict_decoder.go +++ b/src/compress/flate/dict_decoder.go @@ -104,10 +104,7 @@ func (dd *dictDecoder) writeCopy(dist, length int) int { dstBase := dd.wrPos dstPos := dstBase srcPos := dstPos - dist - endPos := dstPos + length - if endPos > len(dd.hist) { - endPos = len(dd.hist) - } + endPos := min(dstPos+length, len(dd.hist)) // Copy non-overlapping section after destination position. // @@ -160,8 +157,10 @@ func (dd *dictDecoder) tryWriteCopy(dist, length int) int { srcPos := dstPos - dist // Copy possibly overlapping section before destination position. - for dstPos < endPos { - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) +loop: + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) + if dstPos < endPos { + goto loop // Avoid for-loop so that this function can be inlined } dd.wrPos = dstPos diff --git a/src/compress/flate/example_test.go b/src/compress/flate/example_test.go index 578009248f5704..3af5c1d95de1d1 100644 --- a/src/compress/flate/example_test.go +++ b/src/compress/flate/example_test.go @@ -93,7 +93,7 @@ func Example_dictionary() { var b bytes.Buffer // Compress the data using the specially crafted dictionary. - zw, err := flate.NewWriterDict(&b, flate.DefaultCompression, []byte(dict)) + zw, err := flate.NewWriterDict(&b, flate.BestCompression, []byte(dict)) if err != nil { log.Fatal(err) } @@ -168,6 +168,7 @@ func Example_synchronization() { wg.Add(1) go func() { defer wg.Done() + defer wp.Close() zw, err := flate.NewWriter(wp, flate.BestSpeed) if err != nil { diff --git a/src/compress/flate/fuzz_test.go b/src/compress/flate/fuzz_test.go new file mode 100644 index 00000000000000..1ea8cc49e54672 --- /dev/null +++ b/src/compress/flate/fuzz_test.go @@ -0,0 +1,111 @@ +package flate + +import ( + "bytes" + "flag" + "io" + "os" + "strconv" + "testing" +) + +// Fuzzing tweaks: +var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level") +var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)") +var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size") + +func TestMain(m *testing.M) { + flag.Parse() + os.Exit(m.Run()) +} + +// FuzzEncoding tests the fuzzer by doing roundtrips. +// Every input is run through the fuzzer at every level. +// Note: When running the fuzzer, it may hit the 10-second timeout on slower CPUs. 
+func FuzzEncoding(f *testing.F) { + startFuzz := *fuzzStartF + endFuzz := *fuzzEndF + maxSize := *fuzzMaxF + + decoder := NewReader(nil) + buf, buf2 := new(bytes.Buffer), new(bytes.Buffer) + encs := make([]*Writer, endFuzz-startFuzz+1) + for i := range encs { + var err error + encs[i], err = NewWriter(nil, i+startFuzz) + if err != nil { + f.Fatal(err.Error()) + } + } + + f.Fuzz(func(t *testing.T, data []byte) { + if len(data) > maxSize { + return + } + for level := startFuzz; level <= endFuzz; level++ { + if level == DefaultCompression { + continue // Already covered. + } + msg := "level " + strconv.Itoa(level) + ":" + buf.Reset() + fw := encs[level-startFuzz] + fw.Reset(buf) + n, err := fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed := buf.Bytes() + err = decoder.(Resetter).Reset(buf, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err := io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + + // Do it again... + msg = "level " + strconv.Itoa(level) + " (reset):" + buf2.Reset() + fw.Reset(buf2) + n, err = fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed2 := buf2.Bytes() + err = decoder.(Resetter).Reset(buf2, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err = io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + // Determinism checks will usually not be reproducible, + // since it often relies on the internal state of the compressor. + if !bytes.Equal(compressed, compressed2) { + t.Fatal(msg + "non-deterministic output") + } + } + }) +} diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index d68c77fb32e32a..f5e50925db8802 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -6,6 +6,7 @@ package flate import ( "io" + "math" ) const ( @@ -22,20 +23,22 @@ const ( codegenCodeCount = 19 badCode = 255 + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + // bufferFlushSize indicates the buffer size // after which bytes are flushed to the writer. // Should preferably be a multiple of 6, since // we accumulate 6 bytes between writes to the buffer. - bufferFlushSize = 240 - - // bufferSize is the actual output byte buffer size. - // It must have additional headroom for a flush - // which can contain up to 8 bytes. - bufferSize = bufferFlushSize + 8 + bufferFlushSize = 246 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = []int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -43,26 +46,47 @@ var lengthExtraBits = []int8{ } // The length indicated by length code X - LENGTH_CODES_START. 
-var lengthBase = []uint32{ +var lengthBase = [32]uint8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. -var offsetExtraBits = []int8{ +var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, + /* extended window */ + 14, 14, } -var offsetBase = []uint32{ - 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, - 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, - 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, - 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, - 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, - 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, +var offsetCombined = [32]uint32{} + +func init() { + var offsetBase = [32]uint32{ + /* normal deflate */ + 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, + 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, + 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, + 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, + 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, + 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, + + /* extended window */ + 0x008000, 0x00c000, + } + + for i := range offsetCombined[:] { + // Don't use extended window values... + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { + continue + } + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) + } } // The odd order in which the codegen code sizes are written. @@ -75,29 +99,49 @@ type huffmanBitWriter struct { writer io.Writer // Data waiting to be written is bytes[0:nbytes] - // and then the low nbits of bits. Data is always written - // sequentially into the bytes array. - bits uint64 - nbits uint - bytes [bufferSize]byte - codegenFreq [codegenCodeCount]int32 - nbytes int - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 - literalEncoding *huffmanEncoder - offsetEncoding *huffmanEncoder - codegenEncoding *huffmanEncoder - err error + // and then the low nbits of bits. + bits uint64 + nbits uint8 + nbytes uint8 + lastHuffMan bool + literalEncoding *huffmanEncoder + tmpLitEncoding *huffmanEncoder + offsetEncoding *huffmanEncoder + codegenEncoding *huffmanEncoder + err error + lastHeader int + logNewTablePenalty uint // Bigger values will reduce the penalty of a new table. + bytes [256 + 8]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 } +// The huffmanBitWriter supports reusing huffman tables and will combine +// blocks, if compression is less than creating a new table. +// +// This is controlled by several variables: +// +// If 'lastHeader' is non-zero the Huffman table can be reused. +// It also indicates that an EOB has not yet been emitted, so if a new table +// is generated, an EOB with the previous table must be written. +// +// If 'lastHuffMan' is set, a table for outputting literals +// has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a +// 'fresh' by calculating the optimal size and adding a penalty. +// A Huffman table is not optimal, which is why we add a penalty, +// and generating a new table is slower for both compression and decompression. 
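// Illustrative sketch, not part of the patch: the new-table-versus-reuse
// decision described in the comment above, with plain parameters standing in
// for w.lastHeader, tokens.EstimatedBits(), the owed EOB code length, and
// dynamicReuseSize()+extraBitSize(). The shift by logNewTablePenalty is the
// penalty the comment refers to; all names here are assumptions.
func preferNewTable(lastHeaderBits, estimatedBits, eobBits, reuseBits int, logNewTablePenalty uint) bool {
	// Optimistic cost of a fresh block: estimated optimal token cost plus the
	// previous header size as a stand-in for the new header.
	newSize := lastHeaderBits + estimatedBits
	// Add the EOB we owe for the open block and a penalty, since a real
	// Huffman table is rarely optimal and a new header costs time to emit
	// and to decode.
	newSize += eobBits + newSize>>logNewTablePenalty
	// Reuse wins unless the fresh block is estimated to be strictly smaller.
	return newSize < reuseBits
}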
+ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, - literalFreq: make([]int32, maxNumLit), - offsetFreq: make([]int32, offsetCodeCount), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), + literalEncoding: newHuffmanEncoder(literalCount), + tmpLitEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -106,6 +150,37 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { func (w *huffmanBitWriter) reset(writer io.Writer) { w.writer = writer w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { + a := t.offHist[:offsetCodeCount] + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalEncoding.codes[256:literalCount] + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + return true } func (w *huffmanBitWriter) flush() { @@ -113,6 +188,11 @@ func (w *huffmanBitWriter) flush() { w.nbits = 0 return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } n := w.nbytes for w.nbits != 0 { w.bytes[n] = byte(w.bits) @@ -125,7 +205,9 @@ func (w *huffmanBitWriter) flush() { n++ } w.bits = 0 - w.write(w.bytes[:n]) + if n > 0 { + w.write(w.bytes[:n]) + } w.nbytes = 0 } @@ -136,30 +218,11 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - if w.err != nil { - return - } - w.bits |= uint64(b) << w.nbits +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n + w.writeOutBits() } } @@ -198,21 +261,23 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // numOffsets The number of offsets in offsetEncoding // litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { - clear(w.codegenFreq[:]) + for i := range w.codegenFreq { + w.codegenFreq[i] = 0 + } // Note that we are using codegen both as a temporary variable for holding // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. - codegen := w.codegen // cache + codegen := w.codegen[:] // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. 
cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -234,10 +299,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE w.codegenFreq[size]++ count-- for count >= 3 { - n := 6 - if n > count { - n = count - } + n := min(6, count) codegen[outIndex] = 16 outIndex++ codegen[outIndex] = uint8(n - 3) @@ -247,10 +309,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE } } else { for count >= 11 { - n := 138 - if n > count { - n = count - } + n := min(138, count) codegen[outIndex] = 18 outIndex++ codegen[outIndex] = uint8(n - 11) @@ -282,30 +341,61 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + return 3 + 5 + 5 + 4 + (3 * numCodegens) + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int(w.codegenFreq[16])*2 + int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) { + size = litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + return size +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + extraBits - return size, numCodegens } +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + // fixedSize returns the size of dynamically encoded data in bits. 
func (w *huffmanBitWriter) fixedSize(extraBits int) int { return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + extraBits } @@ -323,30 +413,35 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { } func (w *huffmanBitWriter) writeCode(c hcode) { - if w.err != nil { - return - } - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) + // The function does not get inlined if we "& 63" the shift. + w.bits |= c.code64() << (w.nbits & reg8SizeMask64) + w.nbits += c.len() if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + + // We overwrite, but faster... + storeLE64(w.bytes[n:], bits) + n += 6 + + if n >= bufferFlushSize { + if w.err != nil { n = 0 + return } - w.nbytes = n + w.write(w.bytes[:n]) + n = 0 } + + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. @@ -367,19 +462,19 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numOffsets-1), 5) w.writeBits(int32(numCodegens-4), 4) - for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + for i := range numCodegens { + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } i := 0 for { - var codeWord int = int(w.codegen[i]) + var codeWord = uint32(w.codegen[i]) i++ if codeWord == badCode { break } - w.writeCode(w.codegenEncoding.codes[uint32(codeWord)]) + w.writeCode(w.codegenEncoding.codes[codeWord]) switch codeWord { case 16: @@ -395,10 +490,28 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n } } +// writeStoredHeader will write a stored header. +// If the stored block is only used for EOF, +// it is replaced with a fixed huffman block. func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + // To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes. + if length == 0 && isEof { + w.writeFixedHeader(isEof) + // EOB: 7 bits, value: 0 + w.writeBits(0, 7) + w.flush() + return + } + var flag int32 if isEof { flag = 1 @@ -413,6 +526,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + // Indicate that we are a fixed Huffman block var value int32 = 2 if isEof { @@ -426,36 +545,33 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { // is larger than the original bytes, the data will be written as a // stored block. // If the input is nil, the tokens will always be Huffman encoded. 
-func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } numLiterals, numOffsets := w.indexTokens(tokens) - + w.generate() var extraBits int storedSize, storable := w.storedSize(input) if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) - } + extraBits = w.extraBitSize() } // Figure out smallest code. // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -473,7 +589,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Stored bytes? - if storable && storedSize < size { + if storable && storedSize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -487,7 +603,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) } // writeBlockDynamic encodes a block using a dynamic Huffman table. @@ -495,53 +611,153 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + sync = sync || eof + if sync { + tokens.AddEOB() + } + + // We cannot reuse pure huffman table, and must mark as EOF. + if (w.lastHuffMan || eof) && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + + if w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens) + extraBits := 0 + ssize, storable := w.storedSize(input) - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. 
- w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + var size int + + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table. + // Use the previous header size as the best estimate. + newSize := w.lastHeader + tokens.EstimatedBits() + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty + + // Calculate the size for reusing the current table. + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits + + // Check if a new table is better. + if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + + // Small blocks can be more efficient with fixed encoding. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } } - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + // We want a new block/table + if w.lastHeader == 0 { + w.literalFreq[endBlockMarker] = 1 + + w.generate() + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + + var numCodegens int + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined or raw, if we don't get a reasonable improvement. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + if !sync { + w.lastHeader, _ = w.headerSize() + } + w.lastHuffMan = false + } + if sync { + w.lastHeader = 0 + } // Write the tokens. 
- w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - clear(w.literalFreq) - clear(w.offsetFreq) +func (w *huffmanBitWriter) indexTokens(t *tokens) (numLiterals, numOffsets int) { + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist - for _, t := range tokens { - if t < matchType { - w.literalFreq[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - w.literalFreq[lengthCodesStart+lengthCode(length)]++ - w.offsetFreq[offsetCode(offset)]++ + if t.n == 0 { + return } - // get the number of literals numLiterals = len(w.literalFreq) for w.literalFreq[numLiterals-1] == 0 { @@ -558,41 +774,153 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets w.offsetFreq[0] = 1 numOffsets = 1 } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) return } +func (w *huffmanBitWriter) generate() { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { if w.err != nil { return } + if len(tokens) == 0 { + return + } + + // Only last token should be endBlockMarker. + var deferEOB bool + if tokens[len(tokens)-1] == endBlockMarker { + tokens = tokens[:len(tokens)-1] + deferEOB = true + } + + // Create slices up to the next power of two to avoid bounds checks. + lits := leCodes[:256] + offs := oeCodes[:32] + lengths := leCodes[lengthCodesStart:] + lengths = lengths[:32] + + // Go 1.16 LOVES having these on stack. 
+ bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + for _, t := range tokens { - if t < matchType { - w.writeCode(leCodes[t.literal()]) + if t < 256 { + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } continue } + // Write the length length := t.length() - lengthCode := lengthCode(length) - w.writeCode(leCodes[lengthCode+lengthCodesStart]) - extraLengthBits := uint(lengthExtraBits[lengthCode]) - if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode]) - w.writeBits(extraLength, extraLengthBits) + lenCode := lengthCode(length) & 31 + // inlined 'w.writeCode(lengths[lengthCode])' + c := lengths[lenCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if lenCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lenCode] + //w.writeBits(extraLength, extraLengthBits) + extraLength := int32(length - lengthBase[lenCode]) + bits |= uint64(extraLength) << (nbits & 63) + nbits += extraLengthBits + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } // Write the offset offset := t.offset() - offsetCode := offsetCode(offset) - w.writeCode(oeCodes[offsetCode]) - extraOffsetBits := uint(offsetExtraBits[offsetCode]) - if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode]) - w.writeBits(extraOffset, extraOffsetBits) + offCode := (offset >> 16) & 31 + // inlined 'w.writeCode(offs[offCode])' + c = offs[offCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if offCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offCode] + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if deferEOB { + w.writeCode(leCodes[endBlockMarker]) + } } // huffOffset is a static offset encoder used for huffman only encoding. 
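// Illustrative sketch, not part of the patch: the 48-bit accumulator pattern
// that the inlined loops above repeat. Each Huffman code (at most 15 bits) or
// run of extra bits (at most 13) is OR'ed into a 64-bit register while fewer
// than 48 bits are pending, so the OR can never overflow; once 48 bits are
// reached, six whole bytes are flushed. Names and the append-based output are
// assumptions; the real code stores into w.bytes with storeLE64 and tracks
// nbytes instead of growing a slice.
func appendBits(dst []byte, bits uint64, nbits uint8, code uint64, codeLen uint8) ([]byte, uint64, uint8) {
	bits |= code << (nbits & 63) // nbits < 48 and codeLen <= 16, so this fits
	nbits += codeLen
	if nbits >= 48 {
		// Flush the low 48 bits as six little-endian bytes.
		dst = append(dst,
			byte(bits), byte(bits>>8), byte(bits>>16),
			byte(bits>>24), byte(bits>>32), byte(bits>>40))
		bits >>= 48
		nbits -= 48
	}
	return dst, bits, nbits
}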
@@ -600,94 +928,168 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) var huffOffset *huffmanEncoder func init() { - offsetFreq := make([]int32, offsetCodeCount) - offsetFreq[0] = 1 + w := newHuffmanBitWriter(nil) + w.offsetFreq[0] = 1 huffOffset = newHuffmanEncoder(offsetCodeCount) - huffOffset.generate(offsetFreq, 15) + huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15) } // writeBlockHuff encodes a block of bytes as either // Huffman encoded literals or uncompressed bytes if the // results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if w.err != nil { return } // Clear histogram - clear(w.literalFreq) - - // Add everything as literals - histogram(input, w.literalFreq) - - w.literalFreq[endBlockMarker] = 1 + for i := range w.literalFreq[:] { + w.literalFreq[i] = 0 + } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } const numLiterals = endBlockMarker + 1 - w.offsetFreq[0] = 1 const numOffsets = 1 - w.literalEncoding.generate(w.literalFreq, 15) - - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + // Add everything as literals + // We have to estimate the header size. + // Assume header is around 70 bytes: + // https://stackoverflow.com/a/25454430 + const guessHeaderSizeBits = 70 * 8 + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) + estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty + } // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if storable && ssize <= estBits { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } - // Huffman. 
- w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - encoding := w.literalEncoding.codes[:257] - n := w.nbytes - for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue + if w.lastHeader > 0 { + reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256]) + + if estBits < reuseSize { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue + } + + if w.lastHeader == 0 { + // Use the temp encoding, so swap. + w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() + + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() + } + + encoding := w.literalEncoding.codes[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. + bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + storeLE64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += n } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } - n = 0 + a, b := encoding[input[0]], encoding[input[1]] + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) + c := encoding[input[2]] + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() + input = input[3:] } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) -} -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ + // Remaining... + for _, t := range input { + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + // Flush if needed to have space. 
+ if w.nbits >= 48 { + w.writeOutBits() + } + + if eof || sync { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false } } diff --git a/src/compress/flate/huffman_bit_writer_test.go b/src/compress/flate/huffman_bit_writer_test.go index a57799cae02685..dfb93e326c0871 100644 --- a/src/compress/flate/huffman_bit_writer_test.go +++ b/src/compress/flate/huffman_bit_writer_test.go @@ -32,7 +32,9 @@ func TestBlockHuff(t *testing.T) { if strings.HasSuffix(in, ".in") { out = in[:len(in)-len(".in")] + ".golden" } - testBlockHuff(t, in, out) + t.Run(in, func(t *testing.T) { + testBlockHuff(t, in, out) + }) } } @@ -44,7 +46,8 @@ func testBlockHuff(t *testing.T, in, out string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - bw.writeBlockHuff(false, all) + bw.logNewTablePenalty = 8 + bw.writeBlockHuff(false, all, false) bw.flush() got := buf.Bytes() @@ -79,7 +82,7 @@ func testBlockHuff(t *testing.T, in, out string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - bw.writeBlockHuff(false, all) + bw.writeBlockHuff(false, all, false) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { @@ -175,13 +178,23 @@ func TestWriteBlockDynamic(t *testing.T) { } } +// TestWriteBlockDynamic tests if the writeBlockDynamic encoding has changed. +// To update the reference files use the "-update" flag on the test. +func TestWriteBlockDynamicSync(t *testing.T) { + for _, test := range writeBlockTests { + testBlock(t, test, "sync") + } +} + // testBlock tests a block against its references, // or regenerate the references, if "-update" flag is set. func testBlock(t *testing.T, test huffTest, ttype string) { if test.want != "" { test.want = fmt.Sprintf(test.want, ttype) } + const gotSuffix = ".got" test.wantNoInput = fmt.Sprintf(test.wantNoInput, ttype) + tokens := indexTokens(test.tokens) if *update { if test.input != "" { t.Logf("Updating %q", test.want) @@ -198,7 +211,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) } t.Logf("Updating %q", test.wantNoInput) @@ -209,7 +222,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) return } @@ -227,12 +240,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) got := buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+gotSuffix) + if err := os.WriteFile(test.want+gotSuffix, got, 0666); err != nil { t.Error(err) } } @@ -241,12 +254,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("reset: writeBlock did not yield expected result for file %q with input. 
See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".reset"+gotSuffix) + if err := os.WriteFile(test.want+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -262,12 +275,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) got := buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+gotSuffix) + if err := os.WriteFile(test.wantNoInput+gotSuffix, got, 0666); err != nil { t.Error(err) } } else if got[0]&1 == 1 { @@ -280,12 +293,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) bw.flush() got = buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.wantNoInput, test.wantNoInput+".reset"+gotSuffix) + if err := os.WriteFile(test.wantNoInput+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -294,12 +307,14 @@ func testBlock(t *testing.T, test huffTest, ttype string) { testWriterEOF(t, "wb", test, false) } -func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok []token, input []byte) { +func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok tokens, input []byte) { switch ttype { case "wb": - bw.writeBlock(tok, false, input) + bw.writeBlock(&tok, false, input) case "dyn": - bw.writeBlockDynamic(tok, false, input) + bw.writeBlockDynamic(&tok, false, input, false) + case "sync": + bw.writeBlockDynamic(&tok, false, input, true) default: panic("unknown test type") } @@ -332,13 +347,14 @@ func testWriterEOF(t *testing.T, ttype string, test huffTest, useInput bool) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) + tokens := indexTokens(test.tokens) switch ttype { case "wb": - bw.writeBlock(test.tokens, true, input) + bw.writeBlock(&tokens, true, input) case "dyn": - bw.writeBlockDynamic(test.tokens, true, input) + bw.writeBlockDynamic(&tokens, true, input, true) case "huff": - bw.writeBlockHuff(true, input) + bw.writeBlockHuff(true, input, true) default: panic("unknown test type") } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index 6f69cabfd060d4..f3e202430736d3 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -7,25 +7,42 @@ package flate import ( "math" "math/bits" - "sort" +) + +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 ) // hcode is a huffman code with a bit code and bit length. 
-type hcode struct { - code, len uint16 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { - codes []hcode - freqcache []literalNode - bitCount [17]int32 - lns byLiteral // stored to avoid repeated allocation in generate - lfs byFreq // stored to avoid repeated allocation in generate + codes []hcode + bitCount [17]int32 + + // Allocate a reusable buffer with the longest possible frequency table. + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. + freqcache [literalCount + 1]literalNode } type literalNode struct { literal uint16 - freq int32 + freq uint16 } // A levelInfo describes the state of the constructed tree for a given depth. @@ -49,25 +66,34 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) +} + +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) } -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } func newHuffmanEncoder(size int) *huffmanEncoder { - return &huffmanEncoder{codes: make([]hcode, size)} + // Make capacity to next power of two. + c := uint(bits.Len32(uint32(size - 1))) + return &huffmanEncoder{codes: make([]hcode, size, 1<= 3. +// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused. +func (h *huffmanEncoder) canReuseBits(freq []uint16) int { + var total int + for i, f := range freq { + if f != 0 { + code := h.codes[i] + if code.zero() { + return math.MaxInt32 + } + total += int(f) * int(code.len()) + } + } + return total +} + +// Return the number of literals assigned to each bit size in the Huffman encoding +// +// This method is only called when list.length >= 3 // The cases of 0, 1, and 2 literals are handled by special case code. // -// list is an array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency and has as its last element a special element with frequency -// MaxInt32. +// list An array of the literals with non-zero frequencies +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// +// maxBits The maximum number of bits that should be used to encode any literal. +// +// Must be less than 16. // -// maxBits is the maximum number of bits that should be used to encode any literal. -// It must be less than 16. +// return An integer array in which array[i] indicates the number of literals // -// bitCounts returns an integer slice in which slice[i] indicates the number of literals -// that should be encoded in i bits. +// that should be encoded in i bits. 
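canReuseBits above is what makes continuing a block with the previous table possible: it prices the pending frequencies against the codes already in effect and reports "impossible" when a needed symbol has no code. Below is a hedged sketch of that calculation on plain slices; the real method works directly on the encoder's own code table.

// estimateReuseBits returns the exact output size, in bits, of encoding the
// given frequencies with an existing set of code lengths, or ok=false when a
// symbol with a nonzero frequency has no code in the previous table.
func estimateReuseBits(freq []uint16, codeLen []uint8) (bits int, ok bool) {
	for i, f := range freq {
		if f == 0 {
			continue
		}
		if codeLen[i] == 0 {
			return 0, false // previous table cannot represent this symbol
		}
		bits += int(f) * int(codeLen[i])
	}
	return bits, true
}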
func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") @@ -154,14 +205,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -172,11 +228,11 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { - // We've run out of both leaves and pairs. + // We've run out of both leafs and pairs. // End all calculations for this level. // To make sure we never come back to this level or any lower level, // set nextPairFreq impossibly large. @@ -193,14 +249,21 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { l.lastFreq = l.nextCharFreq // Lower leafCounts are the same of the previous node. leafCounts[level][level] = n - l.nextCharFreq = list[n].freq + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } } else { // The next item on this row is a pair from the previous row. // nextPairFreq isn't valid until we generate two // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save levels[l.level-1].needed = 2 } @@ -256,9 +319,9 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // assigned in literal order (not frequency order). chunk := list[len(list)-int(bits):] - h.lns.sort(chunk) + sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -268,15 +331,10 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // Update this Huffman Code object to be the minimum code for the specified frequency count. // // freq is an array of frequencies, in which freq[i] gives the frequency of literal i. -// maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { - if h.freqcache == nil { - // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. 
- h.freqcache = make([]literalNode, maxNumLit+1) - } +// maxBits is the maximum number of bits to use for any literal. +func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -285,9 +343,10 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - h.codes[i].len = 0 + codes[i] = 0 } } + list[count] = literalNode{} list = list[:count] if count <= 2 { @@ -299,7 +358,7 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { } return } - h.lfs.sort(list) + sortByFreq(list) // Get the number of literals for each bit count bitCount := h.bitCounts(list, maxBits) @@ -307,39 +366,43 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { h.assignEncodingAndSize(bitCount, list) } -type byLiteral []literalNode - -func (s *byLiteral) sort(a []literalNode) { - *s = byLiteral(a) - sort.Sort(s) +// atLeastOne clamps the result between 1 and 15. +func atLeastOne(v float32) float32 { + return min(15, max(1, v)) } -func (s byLiteral) Len() int { return len(s) } - -func (s byLiteral) Less(i, j int) bool { - return s[i].literal < s[j].literal -} - -func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -type byFreq []literalNode - -func (s *byFreq) sort(a []literalNode) { - *s = byFreq(a) - sort.Sort(s) -} - -func (s byFreq) Len() int { return len(s) } - -func (s byFreq) Less(i, j int) bool { - if s[i].freq == s[j].freq { - return s[i].literal < s[j].literal +func histogram(b []byte, h []uint16) { + if len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ + } } - return s[i].freq < s[j].freq } -func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -func reverseBits(number uint16, bitLength byte) uint16 { - return bits.Reverse16(number << (16 - bitLength)) +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. + h = h[:256] + // Make size divisible by 4 + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] + } + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ + } } diff --git a/src/compress/flate/huffman_sortByFreq.go b/src/compress/flate/huffman_sortByFreq.go new file mode 100644 index 00000000000000..6c05ba8c1c2e2a --- /dev/null +++ b/src/compress/flate/huffman_sortByFreq.go @@ -0,0 +1,159 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. +func sortByFreq(data []literalNode) { + n := len(data) + quickSortByFreq(data, 0, n, maxDepth(n)) +} + +func quickSortByFreq(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivotByFreq(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). 
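sortByFreq above (and the sortByLiteral variant added in the next file) is a copy of the standard quicksort specialized for []literalNode, presumably to keep the comparison inline on this hot path. The ordering it implements is ascending frequency with ties broken by ascending literal, i.e. it is functionally equivalent to the reference below, which is easier to read than the inlined composite comparisons (reference only, requires the standard library slices package, Go 1.21+).

// sortByFreqReference shows the ordering encoded by the specialized sort:
// ascending frequency, ties broken by ascending literal value.
func sortByFreqReference(data []literalNode) {
	slices.SortFunc(data, func(a, b literalNode) int {
		if a.freq != b.freq {
			return int(a.freq) - int(b.freq)
		}
		return int(a.literal) - int(b.literal)
	})
}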
+ if mlo-a < b-mhi { + quickSortByFreq(data, a, mlo, maxDepth) + a = mhi // i.e., quickSortByFreq(data, mhi, b) + } else { + quickSortByFreq(data, mhi, b, maxDepth) + b = mlo // i.e., quickSortByFreq(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSortByFreq(data, a, b) + } +} + +func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s) + medianOfThreeSortByFreq(data, m, m-s, m+s) + medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThreeSortByFreq(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { + } + b := a + for { + for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot + } + for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot + } + for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSortByFreq(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// quickSortByFreq, loosely following Bentley and McIlroy, +// ``Engineering a Sort Function,'' SP&E November 1993. + +// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/huffman_sortByLiteral.go b/src/compress/flate/huffman_sortByLiteral.go new file mode 100644 index 00000000000000..93f1aea109e123 --- /dev/null +++ b/src/compress/flate/huffman_sortByLiteral.go @@ -0,0 +1,201 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. 
+func sortByLiteral(data []literalNode) { + n := len(data) + quickSort(data, 0, n, maxDepth(n)) +} + +func quickSort(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivot(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). + if mlo-a < b-mhi { + quickSort(data, a, mlo, maxDepth) + a = mhi // i.e., quickSort(data, mhi, b) + } else { + quickSort(data, mhi, b, maxDepth) + b = mlo // i.e., quickSort(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].literal < data[i-6].literal { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSort(data, a, b) + } +} +func heapSort(data []literalNode, a, b int) { + first := a + lo := 0 + hi := b - a + + // Build heap with greatest element at top. + for i := (hi - 1) / 2; i >= 0; i-- { + siftDown(data, i, hi, first) + } + + // Pop elements, largest first, into end of data. + for i := hi - 1; i >= 0; i-- { + data[first], data[first+i] = data[first+i], data[first] + siftDown(data, lo, i, first) + } +} + +// siftDown implements the heap property on data[lo, hi). +// first is an offset into the array where the root of the heap lies. +func siftDown(data []literalNode, lo, hi, first int) { + root := lo + for { + child := 2*root + 1 + if child >= hi { + break + } + if child+1 < hi && data[first+child].literal < data[first+child+1].literal { + child++ + } + if data[first+root].literal > data[first+child].literal { + return + } + data[first+root], data[first+child] = data[first+child], data[first+root] + root = child + } +} +func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThree(data, lo, lo+s, lo+2*s) + medianOfThree(data, m, m-s, m+s) + medianOfThree(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThree(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && data[a].literal < data[pivot].literal; a++ { + } + b := a + for { + for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot + } + for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].literal > data[pivot].literal { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot + } + for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSort(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && data[j].literal < data[j-1].literal; j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// maxDepth returns a threshold at which quicksort should switch +// to heapsort. It returns 2*ceil(lg(n+1)). +func maxDepth(n int) int { + var depth int + for i := n; i > 0; i >>= 1 { + depth++ + } + return depth * 2 +} + +// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThree(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].literal < data[m1].literal { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/level1.go b/src/compress/flate/level1.go new file mode 100644 index 00000000000000..2195df4fa38f93 --- /dev/null +++ b/src/compress/flate/level1.go @@ -0,0 +1,197 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 1 uses a single small table with 5 byte hashes. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. 
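The wraparound guard at the top of Encode above (and repeated in the other level encoders) keeps stored table offsets valid when e.cur is rebased back to maxMatchOffset: any entry that could no longer produce a match within maxMatchOffset is cleared, everything else is shifted by the same amount as e.cur. The per-entry transformation, factored out as a standalone helper for illustration:

// rebaseOffset mirrors the shift-down loop above: v is a stored absolute
// offset, cur the encoder's current base, histLen the current history length.
// Entries that are out of reach are zeroed (treated as empty); the rest are
// rewritten so they remain correct once cur is reset to maxMatchOffset.
func rebaseOffset(v, cur, histLen int32) int32 {
	minOff := cur + histLen - maxMatchOffset
	if v <= minOff {
		return 0
	}
	return v - cur + maxMatchOffset
}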
+ dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + var t int32 + for { + nextHash := hashLen(cv, tableBits, hashBytes) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, tableBits, hashBytes) + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + cv = now + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && loadLE8(src, t-1) == loadLE8(src, s-1) { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Save the match found. Same as 'dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))' + xOffset := uint32(s - t - baseMatchOffset) + xLength := l + oc := offsetCode(xOffset) + xOffset |= oc << 16 + for xLength > 0 { + xl := xLength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xLength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. 
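The search loop above advances by nextS = s + doEvery + (s-nextEmit)>>skipLog, so the probe stride grows with the number of bytes since the last match: dense matches keep the stride at doEvery, while long unmatched runs are skipped through increasingly quickly. A toy simulation of the stride growth with level 1's constants (illustration only; the real loop also does a second probe per iteration):

// demoSkip advances a cursor with the same formula as the level 1 search
// loop, assuming every probe fails, and reports how far it has travelled.
func demoSkip(probes int) int32 {
	const skipLog = 5
	const doEvery = 2
	var s, nextEmit int32 // nextEmit stays 0: nothing has matched
	for i := 0; i < probes; i++ {
		s += doEvery + (s-nextEmit)>>skipLog
	}
	return s // grows roughly geometrically once s-nextEmit exceeds 32
}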
+ x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, tableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashLen(x, tableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + t = candidate.offset - e.cur + if s-t > maxMatchOffset || uint32(x) != loadLE32(src, t) { + cv = x >> 8 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level2.go b/src/compress/flate/level2.go new file mode 100644 index 00000000000000..7a2fdf7abe6ddb --- /dev/null +++ b/src/compress/flate/level2.go @@ -0,0 +1,187 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 2 uses a similar algorithm to level 1, but with a larger table. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, bTableBits, hashBytes) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, bTableBits, hashBytes) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + cv = now + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes match. + for { + // Extend the 4-byte match as long as possible. 
+ t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := loadLE64(src, i) + nextHash := hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. + x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + e.table[prevHash2] = tableEntry{offset: o + 1} + currHash := hashLen(x>>16, bTableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != loadLE32(src, candidate.offset-e.cur) { + cv = x >> 24 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level3.go b/src/compress/flate/level3.go new file mode 100644 index 00000000000000..adda8714879c8d --- /dev/null +++ b/src/compress/flate/level3.go @@ -0,0 +1,226 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 3 uses a similar algorithm to level 2, with a smaller table, +// but will check up two candidates for each iteration with more +// entries added to the table. +type fastEncL3 struct { + fastGen + table [1 << 16]tableEntryPrev +} + +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
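The "store every second hash in-between, but offset by 1" loop above indexes three positions per 7-byte stride of the match that was just emitted (i, i+2 and i+4, starting at s-l+2 and stopping before s-5), so long matches still seed the table without hashing every byte. A standalone helper that lists exactly which positions get indexed, for illustration:

// indexedPositions returns the positions inside a just-emitted match covering
// src[s-l:s] that the level 2 encoder above adds to its hash table.
func indexedPositions(s, l int32) []int32 {
	var out []int32
	for i := s - l + 2; i < s-5; i += 7 {
		out = append(out, i, i+2, i+4)
	}
	return out
}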
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. + cv := loadLE64(src, s) + for { + const skipLog = 7 + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, tableBits, hashBytes) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := loadLE64(src, nextS) + + // Safe offset distance until s + 4... + minOffset := e.cur + s - (maxMatchOffset - 4) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}} + + // Check both candidates + candidate = candidates.Cur + if candidate.offset < minOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != loadLE32(src, candidates.Prev.offset-e.cur) { + break + } + // Both match and are valid, pick longest. + offset := s - (candidate.offset - e.cur) + o2 := s - (candidates.Prev.offset - e.cur) + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + } + cv = now + } + + for { + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + // Emit literals. + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Emit match. + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+8) < len(src) && t > 0 { + cv = loadLE64(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t}, + } + } + goto emitRemainder + } + + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(loadLE64(src, i), tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} + } + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. 
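Level 3 (and levels 5 and 6 for their long tables) keeps two candidates per hash bucket: storing a new position moves the bucket's current entry into the Prev slot, so each bucket acts as a tiny two-entry FIFO and both entries are checked on lookup. The update pattern, factored out for illustration:

// pushCandidate shows the two-slot bucket update used above: the newest
// position becomes Cur and the old Cur is kept as a second candidate in Prev.
func pushCandidate(bucket *tableEntryPrev, offset int32) {
	bucket.Prev = bucket.Cur
	bucket.Cur = tableEntry{offset: offset}
}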
+ x := loadLE64(src, s-2) + prevHash := hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2}, + } + x >>= 8 + prevHash = hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1}, + } + x >>= 8 + currHash := hashLen(x, tableBits, hashBytes) + candidates := e.table[currHash] + cv = x + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur}, + } + + // Check both candidates + candidate = candidates.Cur + minOffset := e.cur + s - (maxMatchOffset - 4) + + if candidate.offset > minOffset { + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Found a match... + continue + } + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Match at prev... + continue + } + } + cv = x >> 8 + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level4.go b/src/compress/flate/level4.go new file mode 100644 index 00000000000000..f62168b64ed9e3 --- /dev/null +++ b/src/compress/flate/level4.go @@ -0,0 +1,204 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 4 uses two tables, one for short (4 bytes) and one for long (7 bytes) matches. +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. 
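Level 4 introduces the two-table scheme: every position is entered under both a 4-byte hash (table) and a 7-byte hash (bTable), and on lookup the long-hash candidate is tried first because a hit there likely yields a longer match, with the short-hash candidate as the fallback. A sketch of the double probe; hashLen and hash7 are the patch's hash helpers, defined outside this excerpt and used here with the same signatures as above.

// probeBoth returns the long-hash and short-hash candidates for the eight
// bytes cv loaded at the current position, mirroring the lookup in Encode.
func probeBoth(e *fastEncL4, cv uint64) (long, short tableEntry) {
	const hashShortBytes = 4
	long = e.bTable[hash7(cv, tableBits)]
	short = e.table[hashLen(cv, tableBits, hashShortBytes)]
	return long, short
}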
+ cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + lCandidate = e.bTable[hash7(next, tableBits)] + + // If the next long is a candidate, check if we should use that instead... + lOff := lCandidate.offset - e.cur + if nextS-lOff < maxMatchOffset && loadLE32(src, lOff) == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} + e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + i := nextS + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + e.bTable[prevHashL] = tableEntry{offset: o} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level5.go b/src/compress/flate/level5.go new file mode 100644 index 00000000000000..5ef342eae0e8a2 --- /dev/null +++ b/src/compress/flate/level5.go @@ -0,0 +1,291 @@ +// Copyright 2025 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 5 is similar to level 4, but for long matches two candidates are tested. +// Once a match is found, when it stops it will attempt to find a match that extends further. +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. 
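matchLen, matchlenLong and matchLenLimited are used throughout the new encoders but are defined outside this excerpt; conceptually they count how many leading bytes two positions have in common (the Limited variant presumably capping the result, e.g. at maxMatchLength). A plain reference version follows; the real helpers are presumably optimized to compare eight bytes at a time.

// matchLenRef returns the number of leading bytes that are equal in a and b.
// Reference only; optimized versions typically compare 8 bytes per step and
// use bits.TrailingZeros64 on the XOR of the mismatching words.
func matchLenRef(a, b []byte) int {
	n := 0
	for n < len(a) && n < len(b) && a[n] == b[n] {
		n++
	}
	return n
}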
+ sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + if l == 0 { + // Extend the 4-byte match as long as possible. + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end of best match... + if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 2 + eLong := e.bTable[hash7(loadLE64(src, sAt), tableBits)].Cur.offset + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + eLong = &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level6.go b/src/compress/flate/level6.go new file mode 100644 index 00000000000000..851a7155853eec --- /dev/null +++ b/src/compress/flate/level6.go @@ -0,0 +1,301 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 6 extends level 5, but does "repeat offset" check, +// as well as adding more hash entries to the tables. +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... 
+ l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if loadLE32(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchLenLimited(int(s+4+repOff), int(t2+4), src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end-of-match... + if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + eLong := &e.bTable[hash7(loadLE64(src, sAt), tableBits)] + // Test current + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if off < maxMatchOffset { + if off > 0 && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + // Test previous entry: + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 + if off > 0 && off < maxMatchOffset && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := loadLE64(src, i) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. 
+ for i := nextS + 1; i < s-1; i += 2 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + cv = loadLE64(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/regmask_amd64.go b/src/compress/flate/regmask_amd64.go new file mode 100644 index 00000000000000..cd1469a909173d --- /dev/null +++ b/src/compress/flate/regmask_amd64.go @@ -0,0 +1,14 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. + reg8SizeMask64 = 63 +) diff --git a/src/compress/flate/regmask_other.go b/src/compress/flate/regmask_other.go new file mode 100644 index 00000000000000..e25fc87af1b0d2 --- /dev/null +++ b/src/compress/flate/regmask_other.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 +// +build !amd64 + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // On other platforms the mask is ineffective so the AND can be removed by the compiler. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. 
+ reg8SizeMask64 = 0xff +) diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect b/src/compress/flate/testdata/huffman-null-max.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..c08165143f2c570013c4916cbac5addfe9622a55 GIT binary patch literal 78 ZcmaEJppgLx8W#LrDZUcKq5v#l0|1+Y23i0B literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput b/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..c08165143f2c570013c4916cbac5addfe9622a55 GIT binary patch literal 78 ZcmaEJppgLx8W#LrDZUcKq5v#l0|1+Y23i0B literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-pi.sync.expect b/src/compress/flate/testdata/huffman-pi.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..e4396ac6fe5e34609ccb7ea0bc359e6adb48c7f4 GIT binary patch literal 1696 zcmV;R24DFkmtE59U+}dp3M$E(%$UAJ`ff>rsvsiW8T+$6ZwCa`!Y=s-_luo9MajP$09#>I(F*#bYgkvGSvgH9cjqJxOtZL@E-R zxap{F9H>K0YPWsSkS2)R*aWKe{#|WqFIuv-wS}!bk75c(Z-9;7Wc4VnBBzs?752D& zc8p>URDdmkKtvR3uWp%l!&_EmTpc=NETFYQZ$(jWT@; zgN3|cc@&v*F@uVLa&KnX>Fd2bZUkkwfB)b_MW1tl319U*%S zvp^|A=dI~L9VRO0%SM^tpIF);2v& z2UTM|Eu;@^j|Ys3yuqcmNp8%xRb#N#JWo+RBgezuM69fAg{7zjhSjaxj9hCIS<|)) zTLN?jLt7gbXKG}iEUuqR-jG}(yN@N#B)wX z?|Hml6#3}s*c0K~nJep+6gLc-%e0Zx+0e0@vrzAOGcG64J5tD?3)Gal%l@md3K`X! zWHzzhS`E>KPF)C!q0$!IOpK<-WbmbF9QLE^nXFo~mu))PKI>??oiY z2eq0;6HL=Tt81EVym$AC{;?VPYEHwbEH44G@EQbW;L1XcSd)b||Ff@Ei(4Sj++jOm zBUh^KsO^kc_oqFUViJ1J^cG$3Tj{GxbaP=7I(EAlE=mRs3qthuA%e9rE-#PHFM(mQ zu6KhDd&6Mrg?qbky>)t9e~*^0hsbjfTxSkFOE@c#rEgM-#Z9ZTpaI9jc6f=dNhXc8 znW%G1wBBCANuz}>6H}+!y>*N6gKL$sTjqM=lH+`zajbQ|_!-Asw+~_~BPZz2`j$Kc zEhFt1TPE|&golz{9lnon*4~tBl|$aFu;^S(&T%XtkV=$yRZ5cBjJLTgxTv7rS!-y$2B``yh?Bd zU87(35T;+y=@n~to6Yow&?UtR3gMggy9M(CYsW0orRXZXb1;cR#nNz{C5S6uiE#A# z)e7C6h_D5sJRBg(Zy^5U!@dY0#$+}dp3M$E(%$UAJ`ff>rsvsiW8T+$6ZwCa`!Y=s-_luo9MajP$09#>I(F*#bYgkvGSvgH9cjqJxOtZL@E-R zxap{F9H>K0YPWsSkS2)R*aWKe{#|WqFIuv-wS}!bk75c(Z-9;7Wc4VnBBzs?752D& zc8p>URDdmkKtvR3uWp%l!&_EmTpc=NETFYQZ$(jWT@; zgN3|cc@&v*F@uVLa&KnX>Fd2bZUkkwfB)b_MW1tl319U*%S zvp^|A=dI~L9VRO0%SM^tpIF);2v& z2UTM|Eu;@^j|Ys3yuqcmNp8%xRb#N#JWo+RBgezuM69fAg{7zjhSjaxj9hCIS<|)) zTLN?jLt7gbXKG}iEUuqR-jG}(yN@N#B)wX z?|Hml6#3}s*c0K~nJep+6gLc-%e0Zx+0e0@vrzAOGcG64J5tD?3)Gal%l@md3K`X! 
zWHzzhS`E>KPF)C!q0$!IOpK<-WbmbF9QLE^nXFo~mu))PKI>??oiY z2eq0;6HL=Tt81EVym$AC{;?VPYEHwbEH44G@EQbW;L1XcSd)b||Ff@Ei(4Sj++jOm zBUh^KsO^kc_oqFUViJ1J^cG$3Tj{GxbaP=7I(EAlE=mRs3qthuA%e9rE-#PHFM(mQ zu6KhDd&6Mrg?qbky>)t9e~*^0hsbjfTxSkFOE@c#rEgM-#Z9ZTpaI9jc6f=dNhXc8 znW%G1wBBCANuz}>6H}+!y>*N6gKL$sTjqM=lH+`zajbQ|_!-Asw+~_~BPZz2`j$Kc zEhFt1TPE|&golz{9lnon*4~tBl|$aFu;^S(&T%XtkV=$yRZ5cBjJLTgxTv7rS!-y$2B``yh?Bd zU87(35T;+y=@n~to6Yow&?UtR3gMggy9M(CYsW0orRXZXb1;cR#nNz{C5S6uiE#A# z)e7C6h_D5sJRBg(Zy^5U!@dY0#$3S&tir-1&Xy^3rq$7WzjkH3oE5TPhoG|$Jj~G7Je1{gMAD^RtHK^{N zufoll6_IZ#Igui;)QE-iL)cr^(p6$f&T?BN!s4KwBwDM|h`zW0G3`5$X2Fl=+u6SL z$bbs5Whvee;g$C>RT%D-Dl1f-g_g`**F<)Ek@PCVoPL;K9`bu?lgzjHb%jJ)6&2?zP)IAw?)0ebClf!Z=Et)6wut_cD@v7o)NpS zsK_#(rtznQd!5{<5eH&uVED3}5?emSTvTf*_p#{4ge` zn(ID>yQn|Wo1TzEpWsAiwn-x2!aa=WR)#*V0|pdqh%CWATiUv6K5OS-0wY-7hCj?Tf`uBEd$hP2k_Ik53t_)WP?CL{3Dz=q58^>&Ha%fBIK{4daGq?sUcO$->)O zrL(={j>A#F8&cXoM~3O}e9Or|hmVKPIv?LKS!HKa#fdMJ!mb#z4eR(FdiJi1Cgqhq z-hw!drzD!qMHn$Jh)LAM;gcxRzEji-SpMM7p*fDjNTi0qIQ%9_MQ7pjZQ#@|e-9ZH zr~VKicRyt`JQ%8fm6;9D;%&5h`hA3XWyEnO2wPwf#ty}*9$nY^N{EaB-QN0{37CZY z$q)aOl;;m)>;+-AznRG38SG@jWRbkZ?`y-mSC;XhcuB8cm3z(6n*y_AgZY3d&mt3* zn><`VT-|Xtb%b$?w{37^ql(=Zu?n)?a;t|0D}bE`Dvf7dz4(QJ4yCTrQT2k^9Pg(5 zPaRK{q+s7o@$GdxX8%j#mu~+9FDS;suFhYN3wXU}FF0v%9YUkIe{5qUD{xJIy+(37 cib^4s>$zE)GBdYpa@8P@=jF5wYA^% zs8Z((qWRNUqWGOOj&@G}PCC-a6ZBHX*s}@}+i-%H7u2<(yeGk_p}IQpkY=q{=8%5j zCh1&k#k?{By|N{2W8kxb1k)Ko<>Y*X-AF6cx)OZF!wG|b|A^sp&39;F`tdn`T7&BT z`6}F;SrPe`k`pNcOO04KKZLzyEnOv+N#+B{By^cDULDE6Dg;&& zO=u{r#qY9CX2q~>M2)v~oJjxXwYfA4W6UEykUq9QGg?N01rigUU44BE!qnW&8XUe) zez=s$?Lpl~RS(YSo`<)8wH6IiQ0ey+mMwU6mI5|$jdg{XyiM1gG;p;#L9s@?8yM9bG|EGU{)i7>&=1y1ao-Dkr zRXW>C?l>F;ydkCib7Ytf&$pZmbohAqtn={=lT~&$Rh;-jDeQ_d+pvz`p=a;9Xi{F; z<1L8OcuJz#T!azxf|x{296pI6?K?%ifaMSF9Gc@ej6`Y(jKgn|RCE?T-v&>d|$ruY|}L(Cw|CnSe>S zpZxGoNqPP-#$FI+`k@ zxyi!?#MK>VQ%4xLc-sayHmcZt5vw5EEw_4Dumae5pwf6<){9>l=uqk^9aS%w&GBx^ z|J3nRNecGe6yILAWA?u!e(Cl<@PcA2?CSjWxPaGt_JWfJ*C8~T`^Pp$vI5uS*J~uV cqo@>8xt^P)DKm4sCRYuzNKydW#Fu~kA7M59P5=M^ diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect b/src/compress/flate/testdata/huffman-rand-1k.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..09dc798ee37df82176b8b7c9998c88a14207c1ad GIT binary patch literal 1005 zcmVlcQ}x5eHD>U|iC=ROD8-~ViL-puE1 zjRYA__oQ{&>YEB=3*aLuz4zyXJp13Xu1};#Rhix|mTnwF zOo!rp*PZhF=TqnOy;6>9pEFaaeUqI8B!YL)2W zP7ZdtNvU6;rei#QejpQ1yJnKOE~NTM%dWXRuhSpl)r~@J@cfJn0Ny~Wi$|AEsLzhu zri&m6gnDM>m?;94<~TB71LK+=ROn-XNSxENOU6sujQmH^hn%vbF>Y9-Bf>bg4ep_N_banGD$o@)BlG0~`IFf*!A z7ZZY+$P{3oO)_oT873jzel8_va>@^q&Gy#Imx?o3b8wLzzbGT44Do}*$X0h~ljl$J4Xnb zbD&&|U+WJ#!b4}YW@ms{4#Dg|)FPD1`RJ15X*j-TWXe#-24_NUqwu$E^5|c&ujkvl zceVJ-2*h=M!1)}1Jc%#TSUTePk+ypzC+V()i{5ms{n@u^D(o_E@REe_Kn#k!Ic_d< z)NYD&D%@ZnqX*t~i*(5TV|DgDW2`fY!|?bmYqXwpi(E6b%BbX-wveIk57S|?#u}7- zL{;=f|DL5<#-Qjb!HsV;5xKrj*@u^N&pjiq)f!%|U1|gQA`KAPM`;y5?oy)&(mYZ0 z_?_gKiO6R;)m}AtC+IwYu6c3Nlk}=l5*$k#%8*z(mO5DYDWih#pN0k_;dS~5vECO-S0Dj5 literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..0c24742fde2487e3a454ec3364f15e541693c37c GIT binary patch literal 1054 zcmV+(1mXJxzzaAU2mk=!nayIXbMy_f)7H$mL&SF;F?3`%k8@)&&%@Oe(UOiioadDG zS>BI}35WJ&PF@*1*&LbA=aF5pFj3x*HIFRrKcto>d1~bp8)vlgPG~al`sLh_uD4>f zwcquqQs)bz`O{dU_?0E5ZyfOj3vL$R|;Io1R(-}eKi+pE+?-hv`IeFsDFRE4SU5j~y=5(3C6?qYw^br64(dswwJMG1iwh9bz5{6%{CK{d 
z?OTrws1RG0;tdgAc^^}S;a;h-Le*Jl$;@?4WVbi2?}j$(yZ8P0lo@^JyA?I@?GEt7oU6m&;AhmaN!WN2o4Ue&a8T%J8g~M#1p4zh)_hxG4z2`Ogny za;mxRW4Md@6TRsPIrIrmbY`0*@-5uMh;C)*<4Qh|=G6i5GP){GL)z@9EkaXFMahfN zv?c%P&)d;?j&h!ypwqm%P^YHL3jM3}%*^0B)TTYwcr0m+>#+B{By^cDULDE6Dg;&& zO=u{r#qY9CX2q~>M2)v~oJjxXwYfA4W6UEykUq9QGg?N01rigUU44BE!qnW&8XUe) zez=s$?Lpl~RS(YSo`<)!77bHS>Gu?tEqHX60yc4tb%nr56)BI?!K^R=U-@BSOT=w5 zsVIvXfM*tHgqR0EakjC|{oW&au|@y5MGR8cGC-Yn06%^E0PC$!Cb-Z0wr}jN>)ms9 zL;@;_wIK!J>p%w{0>eRLG6F9RY`9EcFXkV~=#m(_eoQp~r+?KjZg}QSSL~iFyscF_ z+e_{^90j}~rTuecm=4dkoD6jMc=)XI@ePwzb~aU<_(Cb{iZR=;j^CkY@49GGUfJU< zh|_pVqS;)85%YqWL`@t%i6ZSgMZJLK5AGbA<2Z~&Y6y(OZ<17W7CzqwPW|%tkU??k z4*_!bQ%1vsp<0>Q04?4|yQkkrm{&#|cY?4524U<_tm@Hqt*?a07|`vlpP7J3xS#y+ zPf2l=uqk^9aS%w&GBx^|J3nR zNecGe6yILAWA?u!e(Cl<@PcA2?CSjWxPaGt_JWfJ*C8~T`^Pp$vI5uS*J~uVqo@>8 Yxt^P)DKm4sCRYuzNKydW#Fu~kAGX;UBLDyZ literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect index 2d6527934e98300d744c7558a025250f67e0f1c9..881e59c9ab9bb356c5f1b8f2e188818bd42dbcf0 100644 GIT binary patch literal 186 zcmV;r07d^wq#oe<(LJrqgR6ClYQy?N|9W32ycTaex&7!pwpX+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 229 zcmV+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 229 zcmV-LRpuX`%(tEwHyI;4Gxl-|h3T{N z6-jI@ns~S?%F^$acmD1V*M!+Qrys9lzyJ+Dd}Vank36fZwwe0V*y~gD<*;wR&aErE zK1x=B^waSM#UtI25yeIe3z8TRIdp4hX rzbfU)&SL#*50@_2z4oXRI~IhW&XxGJNu5- o$p8QV diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect b/src/compress/flate/testdata/huffman-rand-limit.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..881e59c9ab9bb356c5f1b8f2e188818bd42dbcf0 GIT binary patch literal 186 zcmV;r07d^wq#oe<(LJrqgR6ClYQy?N|9W32ycTaex&7!pwpX+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..881e59c9ab9bb356c5f1b8f2e188818bd42dbcf0 GIT binary patch literal 186 zcmV;r07d^wq#oe<(LJrqgR6ClYQy?N|9W32ycTaex&7!pwpX+&C|&*fKV2Rd8oFPOxbQ)>6c^slqt_a&vbUd`qL0Dk3ZG5`Po literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect b/src/compress/flate/testdata/huffman-shifts.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..7812c1c62da3cbaeb6399e9aa8ab65ae7efa9b08 GIT binary patch literal 32 ocmaEJ(2|$IfP>+{UeCQBetd7^G}D{T$iTpm^J~2nL&Iw}0NYm#xc~qF literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput b/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..7812c1c62da3cbaeb6399e9aa8ab65ae7efa9b08 GIT binary patch literal 32 ocmaEJ(2|$IfP>+{UeCQBetd7^G}D{T$iTpm^J~2nL&Iw}0NYm#xc~qF literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-text-shift.sync.expect b/src/compress/flate/testdata/huffman-text-shift.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..71ce3aeb75a86e8375d9ac4350b7d83b9229a3ed GIT binary patch literal 231 zcmVb2j)h-%-Q8H+K zIkmg!?Y-=9be1Hi$&iwP9DQ6&foC2grh=5#ja@KiZ1-F{b`bob2j)h-%-Q8H+K zIkmg!?Y-=9be1Hi$&iwP9DQ6&foC2grh=5#ja@KiZ1-F{b`bo4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git 
a/src/compress/flate/testdata/huffman-text.sync.expect-noinput b/src/compress/flate/testdata/huffman-text.sync.expect-noinput new file mode 100644 index 00000000000000..d448727c323caf --- /dev/null +++ b/src/compress/flate/testdata/huffman-text.sync.expect-noinput @@ -0,0 +1 @@ +_K0`K0Aasě)^HIɟb߻_>4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect b/src/compress/flate/testdata/huffman-zero.dyn.expect index 830348a79ad9ab38d0edc449e8335c056f7d185f..dbe401c54c4b6f45f3169376185a476dcf00dde9 100644 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 17 XcmaEJU?T$%G#D)X^D^m0zK$>eMUV%O diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput b/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput index 830348a79ad9ab38d0edc449e8335c056f7d185f..dbe401c54c4b6f45f3169376185a476dcf00dde9 100644 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 17 XcmaEJU?T$%G#D)X^D^m0zK$>eMUV%O diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect b/src/compress/flate/testdata/huffman-zero.sync.expect new file mode 100644 index 0000000000000000000000000000000000000000..dbe401c54c4b6f45f3169376185a476dcf00dde9 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect-noinput b/src/compress/flate/testdata/huffman-zero.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..dbe401c54c4b6f45f3169376185a476dcf00dde9 GIT binary patch literal 6 NcmXq#U{zse0006o0CxZY literal 0 HcmV?d00001 diff --git a/src/compress/flate/testdata/null-long-match.sync.expect-noinput b/src/compress/flate/testdata/null-long-match.sync.expect-noinput new file mode 100644 index 0000000000000000000000000000000000000000..8b92d9fc20f1ee1fea5e4cc84d18aeea26a6fdaa GIT binary patch literal 206 ccmaEJz>txFf#HzC@8#d3xFvwhAq<`X0E^!Sx&QzG literal 0 HcmV?d00001 diff --git a/src/compress/flate/token.go b/src/compress/flate/token.go index fc0e4941e7bcd2..3f0d1c358077b8 100644 --- a/src/compress/flate/token.go +++ b/src/compress/flate/token.go @@ -4,20 +4,26 @@ package flate +import ( + "math" +) + const ( - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal - lengthShift = 22 - offsetMask = 1<maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize + tokens [65536]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nFilled = 0 + clear(t.litHist[:]) + clear(t.extraHist[:]) + clear(t.offHist[:]) +} + +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.AddLiteral(tok.literal()) + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) + } +} + +// emitLiterals writes a literal chunk and returns the number of bytes written. 
+func emitLiterals(dst *tokens, lit []byte) { + for _, v := range lit { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } +} + +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ +} + +// from https://stackoverflow.com/a/28730362 +func mFastLog2(val float32) float32 { + ux := int32(math.Float32bits(val)) + log2 := (float32)(((ux >> 23) & 255) - 128) + ux &= -0x7f800001 + ux += 127 << 23 + uval := math.Float32frombits(uint32(ux)) + log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 + return log2 +} -// Convert a < xlength, xoffset > pair into a match token. -func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength< 0 { + invTotal := 1.0 / float32(total) + for _, v := range t.litHist[:] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + } + } + // Just add 15 for EOB + shannon += 15 + for i, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(lengthExtraBits[i&31]) * int(v) + nMatches += int(v) + } + } + } + if nMatches > 0 { + invTotal := 1.0 / float32(nMatches) + for i, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(offsetExtraBits[i&31]) * int(v) + } + } + } + return int(shannon) + bits } -// Returns the literal of a literal token. -func (t token) literal() uint32 { return uint32(t - literalType) } +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. +func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + oCode := offsetCode(xoffset) + xoffset |= oCode << 16 -// Returns the extra offset of a match token. + t.extraHist[lengthCodes1[uint8(xlength)]]++ + t.offHist[oCode&31]++ + t.tokens[t.n] = token(matchType | xlength< 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + t.extraHist[lengthCodes1[uint8(xl)]]++ + t.offHist[oc&31]++ + t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } +func (t token) length() uint8 { return uint8(t >> lengthShift) } -func lengthCode(len uint32) uint32 { return lengthCodes[len] } +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } -// Returns the offset code corresponding to a specific offset. +// Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { if off < uint32(len(offsetCodes)) { - return offsetCodes[off] - } - if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[off>>7] + 14 + return offsetCodes[uint8(off)] } - return offsetCodes[off>>14] + 28 + return offsetCodes14[uint8(off>>7)] } diff --git a/src/compress/flate/unsafe_disabled.go b/src/compress/flate/unsafe_disabled.go new file mode 100644 index 00000000000000..1444494693468e --- /dev/null +++ b/src/compress/flate/unsafe_disabled.go @@ -0,0 +1,33 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "internal/byteorder" +) + +type indexer interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} + +// loadLE8 will load from b at index i. 
+func loadLE8[I indexer](b []byte, i I) byte { + return b[i] +} + +// loadLE32 will load from b at index i. +func loadLE32[I indexer](b []byte, i I) uint32 { + return byteorder.LEUint32(b[i:]) +} + +// loadLE64 will load from b at index i. +func loadLE64[I indexer](b []byte, i I) uint64 { + return byteorder.LEUint64(b[i:]) +} + +// storeLE64 will store v at start of b. +func storeLE64(b []byte, v uint64) { + byteorder.LEPutUint64(b, v) +} diff --git a/src/compress/flate/writer_test.go b/src/compress/flate/writer_test.go index c413735cd2c9f3..43815b2e4787fd 100644 --- a/src/compress/flate/writer_test.go +++ b/src/compress/flate/writer_test.go @@ -8,6 +8,7 @@ import ( "bytes" "fmt" "io" + "math" "math/rand" "runtime" "testing" @@ -40,6 +41,34 @@ func BenchmarkEncode(b *testing.B) { }) } +func TestWriterMemUsage(t *testing.T) { + testMem := func(t *testing.T, fn func()) { + var before, after runtime.MemStats + runtime.GC() + runtime.ReadMemStats(&before) + fn() + runtime.GC() + runtime.ReadMemStats(&after) + t.Logf("%s: Memory Used: %dKB, %d allocs", t.Name(), (after.HeapInuse-before.HeapInuse)/1024, after.HeapObjects-before.HeapObjects) + } + data := make([]byte, 100000) + + for level := HuffmanOnly; level <= BestCompression; level++ { + t.Run(fmt.Sprint("level-", level), func(t *testing.T) { + var zr *Writer + var err error + testMem(t, func() { + zr, err = NewWriter(io.Discard, level) + if err != nil { + t.Fatal(err) + } + zr.Write(data) + }) + zr.Close() + }) + } +} + // errorWriter is a writer that fails after N writes. type errorWriter struct { N int @@ -67,7 +96,7 @@ func TestWriteError(t *testing.T) { in := buf.Bytes() // We create our own buffer to control number of writes. copyBuffer := make([]byte, 128) - for l := 0; l < 10; l++ { + for l := range 10 { for fail := 1; fail <= 256; fail *= 2 { // Fail after 'fail' writes ew := &errorWriter{N: fail} @@ -110,6 +139,75 @@ func TestWriteError(t *testing.T) { } } +// Test if errors from the underlying writer is passed upwards. +func TestWriter_Reset(t *testing.T) { + buf := new(bytes.Buffer) + n := 65536 + if !testing.Short() { + n *= 4 + } + for i := 0; i < n; i++ { + fmt.Fprintf(buf, "asdasfasf%d%dfghfgujyut%dyutyu\n", i, i, i) + } + in := buf.Bytes() + for l := range 10 { + l := l + if testing.Short() && l > 1 { + continue + } + t.Run(fmt.Sprintf("level-%d", l), func(t *testing.T) { + t.Parallel() + offset := 1 + if testing.Short() { + offset = 256 + } + for ; offset <= 256; offset *= 2 { + // Fail after 'fail' writes + w, err := NewWriter(io.Discard, l) + if err != nil { + t.Fatalf("NewWriter: level %d: %v", l, err) + } + if w.d.fast == nil { + t.Skip("Not Fast...") + return + } + for i := 0; i < (bufferReset-len(in)-offset-maxMatchOffset)/maxMatchOffset; i++ { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range 50 { + // skip ahead again... This should wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range (math.MaxUint32 - bufferReset) / maxMatchOffset { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + err = w.Close() + if err != nil { + t.Fatal(err) + } + } + }) + } +} + // Test if two runs produce identical results // even when writing different sizes to the Writer. 
func TestDeterministic(t *testing.T) { @@ -171,6 +269,24 @@ func testDeterministic(i int, t *testing.T) { if !bytes.Equal(b1b, b2b) { t.Errorf("level %d did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b2b)) } + + // Test using io.WriterTo interface. + var b3 bytes.Buffer + br = bytes.NewBuffer(t1) + w, err = NewWriter(&b3, i) + if err != nil { + t.Fatal(err) + } + _, err = br.WriteTo(w) + if err != nil { + t.Fatal(err) + } + w.Close() + + b3b := b3.Bytes() + if !bytes.Equal(b1b, b3b) { + t.Errorf("level %d (io.WriterTo) did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b3b)) + } } // TestDeflateFast_Reset will test that encoding is consistent From 374779b85f6ae7f5ad5f87f7a78811612c74c953 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 27 Sep 2025 16:27:38 +0200 Subject: [PATCH 2/5] [klauspost/deflate-improve-comp] don't use internal/byteorder Change-Id: I0ac5571da9585daba9491b360c9a6b4e0cecbcee --- src/compress/flate/unsafe_disabled.go | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/compress/flate/unsafe_disabled.go b/src/compress/flate/unsafe_disabled.go index 1444494693468e..c4ecd0fd0a9bb1 100644 --- a/src/compress/flate/unsafe_disabled.go +++ b/src/compress/flate/unsafe_disabled.go @@ -4,10 +4,6 @@ package flate -import ( - "internal/byteorder" -) - type indexer interface { int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 } @@ -19,15 +15,26 @@ func loadLE8[I indexer](b []byte, i I) byte { // loadLE32 will load from b at index i. func loadLE32[I indexer](b []byte, i I) uint32 { - return byteorder.LEUint32(b[i:]) + b = b[i : i+4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 } // loadLE64 will load from b at index i. func loadLE64[I indexer](b []byte, i I) uint64 { - return byteorder.LEUint64(b[i:]) + b = b[i : i+8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 } // storeLE64 will store v at start of b. func storeLE64(b []byte, v uint64) { - byteorder.LEPutUint64(b, v) + _ = b[7] // early bounds check to guarantee safety of writes below + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) } From 2c5d12a16952010142017cecdf431180163bc2af Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 27 Sep 2025 16:54:32 +0200 Subject: [PATCH 3/5] [klauspost/deflate-improve-comp] Remove hash7 and use const for long table bytes. 
Change-Id: Ia141c7ec888bf51ceb6351d2a1c3f1501c2c4e12 --- src/compress/flate/deflatefast.go | 7 +------ src/compress/flate/huffman_bit_writer.go | 3 ++- src/compress/flate/level4.go | 16 ++++++++-------- src/compress/flate/level5.go | 14 +++++++------- src/compress/flate/level6.go | 12 ++++++------ 5 files changed, 24 insertions(+), 28 deletions(-) diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go index e132c55951b5ef..eef1896b6f5c63 100644 --- a/src/compress/flate/deflatefast.go +++ b/src/compress/flate/deflatefast.go @@ -35,6 +35,7 @@ func newFastEnc(level int) fastEnc { const ( tableBits = 15 // Bits used in the table tableSize = 1 << tableBits // Size of the table + hashLongBytes = 7 // Bytes used for long table hash baseMatchOffset = 1 // The smallest match offset baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 maxMatchOffset = 1 << 15 // The largest match offset @@ -93,12 +94,6 @@ type tableEntryPrev struct { Prev tableEntry } -// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash7(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) -} - // hashLen returns a hash of the lowest mls bytes of with length output bits. // mls must be >=3 and <=8. Any other value will return hash for 4 bytes. // length should always be < 32. diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index f5e50925db8802..585a9b4cf19032 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -412,8 +412,9 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { return 0, false } +// writeCode writes 'c' to the stream. +// Inline manually when performance is critical. func (w *huffmanBitWriter) writeCode(c hcode) { - // The function does not get inlined if we "& 63" the shift. w.bits |= c.code64() << (w.nbits & reg8SizeMask64) w.nbits += c.len() if w.nbits >= 48 { diff --git a/src/compress/flate/level4.go b/src/compress/flate/level4.go index f62168b64ed9e3..ceb899793e3148 100644 --- a/src/compress/flate/level4.go +++ b/src/compress/flate/level4.go @@ -82,7 +82,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { var t int32 for { nextHashS := hashLen(cv, tableBits, hashShortBytes) - nextHashL := hash7(cv, tableBits) + nextHashL := hashLen(cv, tableBits, hashLongBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -106,7 +106,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t = sCandidate.offset - e.cur if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { // Found a 4 match... - lCandidate = e.bTable[hash7(next, tableBits)] + lCandidate = e.bTable[hashLen(next, tableBits, hashLongBytes)] // If the next long is a candidate, check if we should use that instead... 
lOff := lCandidate.offset - e.cur @@ -155,7 +155,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { if int(s+8) < len(src) { cv := loadLE64(src, s) e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} - e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -166,8 +166,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - e.bTable[hash7(cv, tableBits)] = t - e.bTable[hash7(cv>>8, tableBits)] = t2 + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 i += 3 @@ -175,8 +175,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - e.bTable[hash7(cv, tableBits)] = t - e.bTable[hash7(cv>>8, tableBits)] = t2 + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } @@ -186,7 +186,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { x := loadLE64(src, s-1) o := e.cur + s - 1 prevHashS := hashLen(x, tableBits, hashShortBytes) - prevHashL := hash7(x, tableBits) + prevHashL := hashLen(x, tableBits, hashLongBytes) e.table[prevHashS] = tableEntry{offset: o} e.bTable[prevHashL] = tableEntry{offset: o} cv = x >> 8 diff --git a/src/compress/flate/level5.go b/src/compress/flate/level5.go index 5ef342eae0e8a2..29f1df27413b82 100644 --- a/src/compress/flate/level5.go +++ b/src/compress/flate/level5.go @@ -92,7 +92,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { var t int32 for { nextHashS := hashLen(cv, tableBits, hashShortBytes) - nextHashL := hash7(cv, tableBits) + nextHashL := hashLen(cv, tableBits, hashLongBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -109,7 +109,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { eLong.Cur, eLong.Prev = entry, eLong.Cur nextHashS = hashLen(next, tableBits, hashShortBytes) - nextHashL = hash7(next, tableBits) + nextHashL = hashLen(next, tableBits, hashLongBytes) t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { @@ -196,7 +196,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // The skipped bytes are tested in Extend backwards, // and still picked up as part of the match if they do. 
const skipBeginning = 2 - eLong := e.bTable[hash7(loadLE64(src, sAt), tableBits)].Cur.offset + eLong := e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)].Cur.offset t2 := eLong - e.cur - l + skipBeginning s2 := s + skipBeginning off := s2 - t2 @@ -241,13 +241,13 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} e.table[hashLen(cv, tableBits, hashShortBytes)] = t - eLong := &e.bTable[hash7(cv, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = t, eLong.Cur // Do an long at i+1 cv >>= 8 t = tableEntry{offset: t.offset + 1} - eLong = &e.bTable[hash7(cv, tableBits)] + eLong = &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = t, eLong.Cur // We only have enough bits for a short entry at i+2 @@ -261,7 +261,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - eLong := &e.bTable[hash7(cv, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = t, eLong.Cur e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } @@ -272,7 +272,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { x := loadLE64(src, s-1) o := e.cur + s - 1 prevHashS := hashLen(x, tableBits, hashShortBytes) - prevHashL := hash7(x, tableBits) + prevHashL := hashLen(x, tableBits, hashLongBytes) e.table[prevHashS] = tableEntry{offset: o} eLong := &e.bTable[prevHashL] eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur diff --git a/src/compress/flate/level6.go b/src/compress/flate/level6.go index 851a7155853eec..d709f31e21fc42 100644 --- a/src/compress/flate/level6.go +++ b/src/compress/flate/level6.go @@ -92,7 +92,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { var t int32 for { nextHashS := hashLen(cv, tableBits, hashShortBytes) - nextHashL := hash7(cv, tableBits) + nextHashL := hashLen(cv, tableBits, hashLongBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -109,7 +109,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Calculate hashes of 'next' nextHashS = hashLen(next, tableBits, hashShortBytes) - nextHashL = hash7(next, tableBits) + nextHashL = hashLen(next, tableBits, hashLongBytes) t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { @@ -216,7 +216,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // The skipped bytes are tested in extend backwards, // and still picked up as part of the match if they do. 
const skipBeginning = 2 - eLong := &e.bTable[hash7(loadLE64(src, sAt), tableBits)] + eLong := &e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)] // Test current t2 := eLong.Cur.offset - e.cur - l + skipBeginning s2 := s + skipBeginning @@ -269,7 +269,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { for i := nextS + 1; i < int32(len(src))-8; i += 2 { cv := loadLE64(src, i) e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} - eLong := &e.bTable[hash7(cv, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur } goto emitRemainder @@ -280,8 +280,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { cv := loadLE64(src, i) t := tableEntry{offset: i + e.cur} t2 := tableEntry{offset: t.offset + 1} - eLong := &e.bTable[hash7(cv, tableBits)] - eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong2 := &e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong.Cur, eLong.Prev = t, eLong.Cur eLong2.Cur, eLong2.Prev = t2, eLong2.Cur From f09b893f4892a4950daa68713908bc1e9f7c91b4 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 27 Sep 2025 17:01:25 +0200 Subject: [PATCH 4/5] [klauspost/deflate-improve-comp] update expected zlib output Change-Id: I1cef87da8cf7a2f2b330115f8eeecb7bf825af76 --- src/compress/zlib/example_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compress/zlib/example_test.go b/src/compress/zlib/example_test.go index 70408895ffd5a0..7052973355eb92 100644 --- a/src/compress/zlib/example_test.go +++ b/src/compress/zlib/example_test.go @@ -19,7 +19,7 @@ func ExampleNewWriter() { w.Write([]byte("hello, world\n")) w.Close() fmt.Println(b.Bytes()) - // Output: [120 156 202 72 205 201 201 215 81 40 207 47 202 73 225 2 4 0 0 255 255 33 231 4 147] + // Output: [120 156 0 13 0 242 255 104 101 108 108 111 44 32 119 111 114 108 100 10 3 0 33 231 4 147] } func ExampleNewReader() { From f5d855e43f730c5b44760059fbf00fd153b1ff3e Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 28 Sep 2025 17:19:14 +0200 Subject: [PATCH 5/5] [klauspost/deflate-improve-comp] Use pre-compressed bytes for test. 
Change-Id: Ie3630fc4b51f30108909a3d5930ffe17851f4a94 --- src/debug/elf/file_test.go | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/debug/elf/file_test.go b/src/debug/elf/file_test.go index 0c1a7cf18aeb6e..733daae57772c6 100644 --- a/src/debug/elf/file_test.go +++ b/src/debug/elf/file_test.go @@ -7,7 +7,6 @@ package elf import ( "bytes" "compress/gzip" - "compress/zlib" "debug/dwarf" "encoding/binary" "errors" @@ -1560,18 +1559,9 @@ func TestIssue59208(t *testing.T) { zoffset := sec.Offset + uint64(sec.compressionOffset) copy(dn, data[:zoffset]) - ozd, err := sec.Data() - if err != nil { - t.Fatal(err) - } - buf := bytes.NewBuffer(nil) - wr := zlib.NewWriter(buf) // corrupt origin data same as COMPRESS_ZLIB - copy(ozd, []byte{1, 0, 0, 0}) - wr.Write(ozd) - wr.Close() - - copy(dn[zoffset:], buf.Bytes()) + // Insert zlib compressed sec.Data() block with `[]byte{1, 0, 0, 0}` as the first 4 bytes + copy(dn[zoffset:], []byte{0x78, 0x9c, 0x5c, 0x4d, 0xb9, 0xd, 0x80, 0x30, 0xc, 0x3c, 0x7, 0x27, 0xdc, 0xe, 0xc, 0x46, 0x4b, 0x8b, 0x14, 0x51, 0x20, 0x16, 0xa1, 0x67, 0x8b, 0x2c, 0x88, 0xec, 0x44, 0xc2, 0xe2, 0x8a, 0xdc, 0x1b, 0x59, 0x0, 0x28, 0xc, 0x34, 0x9, 0x7f, 0x22, 0x96, 0xa0, 0x13, 0x67, 0x27, 0xa1, 0x53, 0xea, 0x4e, 0x47, 0x58, 0x7a, 0x98, 0x8d, 0x26, 0xcd, 0xfb, 0x71, 0x21, 0x31, 0x87, 0x7f, 0xca, 0xf3, 0x1b, 0x7a, 0x21, 0xfa, 0x3f, 0x23, 0x4f, 0x3, 0x50, 0x7a, 0xb9, 0xda, 0xfc, 0xae, 0xc3, 0x35, 0x77, 0x1b, 0x94, 0xd5, 0x82, 0x37, 0x0, 0x0, 0xff, 0xff, 0x65, 0xfb, 0x7, 0x6e}) copy(dn[sec.Offset+sec.FileSize:], data[sec.Offset+sec.FileSize:]) nf, err := NewFile(bytes.NewReader(dn))