Skip to content

Commit

Permalink
internal/lz4block: Speed up noasm decoder
Browse files Browse the repository at this point in the history
When the compiler is told exactly how many bytes a copy call should
copy, and that number is at most 16, it will inline the call. Also, the
old code only took the short-match shortcut when the short-literal
shortcut was also taken, even though long literals followed by short
matches are common. The two shortcuts are now checked independently.

Benchmark results on older Intel:

    goos: linux
    goarch: amd64
    pkg: github.com/pierrec/lz4/v4
    cpu: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
                       │     old      │                 new                  │
                       │     B/s      │     B/s       vs base                │
    UncompressPg1661-8   327.9Mi ± 1%   549.7Mi ± 0%  +67.61% (p=0.000 n=10)
    UncompressDigits-8   1.111Gi ± 1%   1.499Gi ± 1%  +34.94% (p=0.000 n=10)
    UncompressTwain-8    348.3Mi ± 0%   579.4Mi ± 0%  +66.32% (p=0.000 n=10)
    UncompressRand-8     3.296Gi ± 0%   3.309Gi ± 1%        ~ (p=0.739 n=10)
    geomean              813.8Mi        1.108Gi       +39.40%

On newer AMD:

    goos: linux
    goarch: amd64
    pkg: github.com/pierrec/lz4/v4
    cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics
                        │     old      │                  new                  │
                        │     B/s      │      B/s       vs base                │
    UncompressPg1661-16   643.6Mi ± 2%   1076.9Mi ± 1%  +67.33% (p=0.000 n=10)
    UncompressDigits-16   2.808Gi ± 1%    3.786Gi ± 0%  +34.82% (p=0.000 n=10)
    UncompressTwain-16    702.8Mi ± 1%   1309.5Mi ± 7%  +86.32% (p=0.000 n=10)
    UncompressRand-16     6.878Gi ± 0%    6.850Gi ± 1%   -0.42% (p=0.009 n=10)
    geomean               1.699Gi         2.430Gi       +43.04%
  • Loading branch information
greatroar committed Jan 11, 2025
1 parent 0f7173a commit a3839dc
Showing 1 changed file with 23 additions and 34 deletions.
57 changes: 23 additions & 34 deletions internal/lz4block/decode_other.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,33 +32,7 @@ func decodeBlock(dst, src, dict []byte) (ret int) {

// Literals.
if lLen := b >> 4; lLen > 0 {
switch {
case lLen < 0xF && si+16 < uint(len(src)):
// Shortcut 1
// if we have enough room in src and dst, and the literals length
// is small enough (0..14) then copy all 16 bytes, even if not all
// are part of the literals.
copy(dst[di:], src[si:si+16])
si += lLen
di += lLen
if mLen := b & 0xF; mLen < 0xF {
// Shortcut 2
// if the match length (4..18) fits within the literals, then copy
// all 18 bytes, even if not all are part of the literals.
mLen += 4
if offset := u16(src[si:]); mLen <= offset && offset < di {
i := di - offset
// The remaining buffer may not hold 18 bytes.
// See https://github.com/pierrec/lz4/issues/51.
if end := i + 18; end <= uint(len(dst)) {
copy(dst[di:], dst[i:end])
si += 2
di += mLen
continue
}
}
}
case lLen == 0xF:
if lLen == 0xF {
for {
x := uint(src[si])
if lLen += x; int(lLen) < 0 {
Expand All @@ -69,30 +43,45 @@ func decodeBlock(dst, src, dict []byte) (ret int) {
break
}
}
fallthrough
default:
}
if lLen <= 16 && si+16 < uint(len(src)) {
// Shortcut 1: if we have enough room in src and dst, and the
// literal length is at most 16, then copy 16 bytes, even if not
// all are part of the literal. The compiler inlines this copy.
copy(dst[di:di+16], src[si:si+16])
} else {
copy(dst[di:di+lLen], src[si:si+lLen])
si += lLen
di += lLen
}
si += lLen
di += lLen
}

// Match.
mLen := b & 0xF
if si == uint(len(src)) && mLen == 0 {
break
} else if si >= uint(len(src)) {
return hasError
}
mLen += minMatch

offset := u16(src[si:])
if offset == 0 {
return hasError
}
si += 2

// Match.
mLen += minMatch
if mLen == minMatch+0xF {
if mLen <= 16 {
// Shortcut 2: if the match length is at most 16 and we're far
// enough from the end of dst, copy 16 bytes unconditionally
// so that the compiler can inline the copy.
if mLen <= offset && offset < di && di+16 <= uint(len(dst)) {
i := di - offset
copy(dst[di:di+16], dst[i:i+16])
di += mLen
continue
}
} else if mLen >= 15+minMatch {
for {
x := uint(src[si])
if mLen += x; int(mLen) < 0 {
Expand Down

0 comments on commit a3839dc

Please sign in to comment.