internal/lz4block: Speed up noasm decoder

When the compiler is told exactly how many bytes a copy call should copy, and that number is at most 16, it will inline the call.

Also, the old code only took the short match shortcut when the short literal shortcut was also taken. But long literals with short matches are common.

Benchmark results on older Intel:

goos: linux
goarch: amd64
pkg: github.com/pierrec/lz4/v4
cpu: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
                     │     old      │                 new                  │
                     │     B/s      │     B/s        vs base               │
UncompressPg1661-8     327.9Mi ± 1%   549.7Mi ± 0%   +67.61% (p=0.000 n=10)
UncompressDigits-8     1.111Gi ± 1%   1.499Gi ± 1%   +34.94% (p=0.000 n=10)
UncompressTwain-8      348.3Mi ± 0%   579.4Mi ± 0%   +66.32% (p=0.000 n=10)
UncompressRand-8       3.296Gi ± 0%   3.309Gi ± 1%         ~ (p=0.739 n=10)
geomean                813.8Mi        1.108Gi        +39.40%

On newer AMD:

goos: linux
goarch: amd64
pkg: github.com/pierrec/lz4/v4
cpu: AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics
                     │     old      │                 new                  │
                     │     B/s      │     B/s        vs base               │
UncompressPg1661-16    643.6Mi ± 2%   1076.9Mi ± 1%  +67.33% (p=0.000 n=10)
UncompressDigits-16    2.808Gi ± 1%   3.786Gi ± 0%   +34.82% (p=0.000 n=10)
UncompressTwain-16     702.8Mi ± 1%   1309.5Mi ± 7%  +86.32% (p=0.000 n=10)
UncompressRand-16      6.878Gi ± 0%   6.850Gi ± 1%    -0.42% (p=0.009 n=10)
geomean                1.699Gi        2.430Gi        +43.04%
pierrec · Jan 11, 2025 · a3839dc · a3839dc
1 parent 0f7173a
commit a3839dc
Showing 1 changed file with 23 additions and 34 deletions.
diff --git a/internal/lz4block/decode_other.go b/internal/lz4block/decode_other.go
@@ -32,33 +32,7 @@ func decodeBlock(dst, src, dict []byte) (ret int) {
 
 		// Literals.
 		if lLen := b >> 4; lLen > 0 {
-			switch {
-			case lLen < 0xF && si+16 < uint(len(src)):
-				// Shortcut 1
-				// if we have enough room in src and dst, and the literals length
-				// is small enough (0..14) then copy all 16 bytes, even if not all
-				// are part of the literals.
-				copy(dst[di:], src[si:si+16])
-				si += lLen
-				di += lLen
-				if mLen := b & 0xF; mLen < 0xF {
-					// Shortcut 2
-					// if the match length (4..18) fits within the literals, then copy
-					// all 18 bytes, even if not all are part of the literals.
-					mLen += 4
-					if offset := u16(src[si:]); mLen <= offset && offset < di {
-						i := di - offset
-						// The remaining buffer may not hold 18 bytes.
-						// See https://github.com/pierrec/lz4/issues/51.
-						if end := i + 18; end <= uint(len(dst)) {
-							copy(dst[di:], dst[i:end])
-							si += 2
-							di += mLen
-							continue
-						}
-					}
-				}
-			case lLen == 0xF:
+			if lLen == 0xF {
 				for {
 					x := uint(src[si])
 					if lLen += x; int(lLen) < 0 {
@@ -69,30 +43,45 @@ func decodeBlock(dst, src, dict []byte) (ret int) {
 						break
 					}
 				}
-				fallthrough
-			default:
+			}
+			if lLen <= 16 && si+16 < uint(len(src)) {
+				// Shortcut 1: if we have enough room in src and dst, and the
+				// literal length is at most 16, then copy 16 bytes, even if not
+				// all are part of the literal. The compiler inlines this copy.
+				copy(dst[di:di+16], src[si:si+16])
+			} else {
 				copy(dst[di:di+lLen], src[si:si+lLen])
-				si += lLen
-				di += lLen
 			}
+			si += lLen
+			di += lLen
 		}
 
+		// Match.
 		mLen := b & 0xF
 		if si == uint(len(src)) && mLen == 0 {
 			break
 		} else if si >= uint(len(src)) {
 			return hasError
 		}
+		mLen += minMatch
 
 		offset := u16(src[si:])
 		if offset == 0 {
 			return hasError
 		}
 		si += 2
 
-		// Match.
-		mLen += minMatch
-		if mLen == minMatch+0xF {
+		if mLen <= 16 {
+			// Shortcut 2: if the match length is at most 16 and we're far
+			// enough from the end of dst, copy 16 bytes unconditionally
+			// so that the compiler can inline the copy.
+			if mLen <= offset && offset < di && di+16 <= uint(len(dst)) {
+				i := di - offset
+				copy(dst[di:di+16], dst[i:i+16])
+				di += mLen
+				continue
+			}
+		} else if mLen >= 15+minMatch {
 			for {
 				x := uint(src[si])
 				if mLen += x; int(mLen) < 0 {