implement linked-block decompression

pierrec · Apr 3, 2021 · 8fa51c0 · 8fa51c0
1 parent 284f056
commit 8fa51c0
Show file tree

Hide file tree

Showing 19 changed files with 206 additions and 41 deletions.
diff --git a/bench_test.go b/bench_test.go
@@ -63,7 +63,7 @@ func BenchmarkUncompress(b *testing.B) {
 	b.ResetTimer()
 
 	for i := 0; i < b.N; i++ {
-		_, _ = lz4block.UncompressBlock(pg1661LZ4, buf)
+		_, _ = lz4block.UncompressBlock(pg1661LZ4, buf, nil)
 	}
 }
 

diff --git a/fuzz/lz4.go b/fuzz/lz4.go
@@ -81,7 +81,7 @@ func FuzzUncompressBlock(data []byte) int {
 	}
 	decomp = decomp[:len(data)]
 
-	n, err := lz4.UncompressBlock(data, decomp)
+	n, err := lz4.UncompressBlock(data, decomp, nil)
 	if n > len(decomp) {
 		panic("uncompressed length greater than buffer")
 	}

diff --git a/internal/lz4block/block.go b/internal/lz4block/block.go
@@ -41,11 +41,11 @@ func CompressBlockBound(n int) int {
 	return n + n/255 + 16
 }
 
-func UncompressBlock(src, dst []byte) (int, error) {
+func UncompressBlock(src, dst, dict []byte) (int, error) {
 	if len(src) == 0 {
 		return 0, nil
 	}
-	if di := decodeBlock(dst, src); di >= 0 {
+	if di := decodeBlock(dst, src, dict); di >= 0 {
 		return di, nil
 	}
 	return 0, lz4errors.ErrInvalidSourceShortBuffer

diff --git a/internal/lz4block/block_test.go b/internal/lz4block/block_test.go
@@ -59,7 +59,7 @@ func TestCompressUncompressBlock(t *testing.T) {
 
 		// Uncompress the data.
 		buf := make([]byte, len(src))
-		n, err = lz4block.UncompressBlock(zbuf, buf)
+		n, err = lz4block.UncompressBlock(zbuf, buf, nil)
 		if err != nil {
 			t.Fatal(err)
 		} else if n < 0 || n > len(buf) {

diff --git a/internal/lz4block/decode_amd64.s b/internal/lz4block/decode_amd64.s
@@ -16,9 +16,11 @@
 // R11 &dst
 // R12 short output end
 // R13 short input end
+// R14 &dict
+// R15 &dict + len(dict)
 
-// func decodeBlock(dst, src []byte) int
-TEXT ·decodeBlock(SB), NOSPLIT, $48-56
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOSPLIT, $48-80
 	MOVQ dst_base+0(FP), DI
 	MOVQ DI, R11
 	MOVQ dst_len+8(FP), R8
@@ -30,6 +32,10 @@ TEXT ·decodeBlock(SB), NOSPLIT, $48-56
 	JE   err_corrupt
 	ADDQ SI, R9
 
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	ADDQ R14, R15
+
 	// shortcut ends
 	// short output end
 	MOVQ R8, R12
@@ -96,6 +102,8 @@ loop:
 	// match length, we already have the offset.
 	CMPQ CX, $0xF
 	JEQ match_len_loop_pre
+	CMPQ DX, R11
+	JLT match_len_loop_pre
 	CMPQ DX, $8
 	JLT match_len_loop_pre
 	CMPQ AX, R11
@@ -280,7 +288,7 @@ copy_match:
 	// check BX is within dst
 	// if BX < &dst
 	CMPQ BX, R11
-	JLT err_short_buf
+	JLT copy_match_from_dict
 
 	// if offset + match_len < di
 	LEAQ (BX)(CX*1), AX
@@ -327,6 +335,81 @@ copy_interior_match:
 	ADDQ CX, DI
 	JMP loop
 
+copy_match_from_dict:
+	// CX = match_len
+	// BX = &dst + (di - offset)
+
+	// AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
+	MOVQ R11, AX
+	SUBQ BX, AX
+
+	// BX = &dict_end - dict_bytes_available
+	MOVQ R15, BX
+	SUBQ AX, BX
+
+	// check BX is within dict
+	// if BX < &dict
+	CMPQ BX, R14
+	JLT err_short_dict
+
+	// if match_len < dict_bytes_available, the match fits entirely within the external dictionary: just copy
+	CMPQ CX, AX
+	JLT memmove_match
+
+	// The match stretches over the dictionary and our block
+	// 1) copy what comes from the dictionary
+	// AX = dict_bytes_available = copy_size
+	// BX = &dict_end - copy_size
+	// CX = match_len
+
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ AX, 16(SP)
+	// store extra stuff we want to recover
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 16(SP), AX // copy_size
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX // match_len
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	ADDQ R14, R15
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+	// di+=copy_size
+	ADDQ AX, DI
+
+	// 2) copy the rest from the current block
+	// CX = match_len - copy_size = rest_size
+	SUBQ AX, CX
+	MOVQ R11, BX
+
+	// check if we have a copy overlap
+	// AX = &dst + rest_size
+	MOVQ CX, AX
+	ADDQ BX, AX
+	// if &dst + rest_size > di, copy byte by byte
+	CMPQ AX, DI
+
+	JGT copy_match_loop
+
 memmove_match:
 	// memmove(to, from, len)
 	MOVQ DI, 0(SP)
@@ -354,18 +437,25 @@ memmove_match:
 	SUBQ $32, R12
 	MOVQ R9, R13
 	SUBQ $16, R13
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	ADDQ R14, R15
 
 	JMP loop
 
 err_corrupt:
-	MOVQ $-1, ret+48(FP)
+	MOVQ $-1, ret+72(FP)
 	RET
 
 err_short_buf:
-	MOVQ $-2, ret+48(FP)
+	MOVQ $-2, ret+72(FP)
+	RET
+
+err_short_dict:
+	MOVQ $-3, ret+72(FP)
 	RET
 
 end:
 	SUBQ R11, DI
-	MOVQ DI, ret+48(FP)
+	MOVQ DI, ret+72(FP)
 	RET
diff --git a/internal/lz4block/decode_arm.s b/internal/lz4block/decode_arm.s
@@ -19,12 +19,12 @@
 
 #define minMatch	$4
 
-// func decodeBlock(dst, src []byte) int
-TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-28
-	MOVW dst_base +0(FP), dst
-	MOVW dst_len  +4(FP), dstend
-	MOVW src_base+12(FP), src
-	MOVW src_len +16(FP), srcend
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-40
+	MOVW dst_base  +0(FP), dst
+	MOVW dst_len   +4(FP), dstend
+	MOVW src_base +12(FP), src
+	MOVW src_len  +16(FP), srcend
 
 	CMP $0, srcend
 	BEQ shortSrc
@@ -183,7 +183,7 @@ copyMatchDone:
 
 end:
 	SUB  dstorig, dst, tmp1
-	MOVW tmp1, ret+24(FP)
+	MOVW tmp1, ret+36(FP)
 	RET
 
 	// The three error cases have distinct labels so we can put different
@@ -193,5 +193,5 @@ shortDst:
 shortSrc:
 corrupt:
 	MOVW $-1, tmp1
-	MOVW tmp1, ret+24(FP)
+	MOVW tmp1, ret+36(FP)
 	RET
diff --git a/internal/lz4block/decode_asm.go b/internal/lz4block/decode_asm.go
@@ -6,4 +6,4 @@
 package lz4block
 
 //go:noescape
-func decodeBlock(dst, src []byte) int
+func decodeBlock(dst, src, dict []byte) int
diff --git a/internal/lz4block/decode_other.go b/internal/lz4block/decode_other.go
@@ -2,12 +2,15 @@
 
 package lz4block
 
-import "encoding/binary"
+import (
+	"encoding/binary"
+)
 
-func decodeBlock(dst, src []byte) (ret int) {
+func decodeBlock(dst, src, dict []byte) (ret int) {
 	// Restrict capacities so we don't read or write out of bounds.
 	dst = dst[:len(dst):len(dst)]
 	src = src[:len(src):len(src)]
+	dictLen := uint(len(dict))
 
 	const hasError = -2
 	defer func() {
@@ -38,7 +41,7 @@ func decodeBlock(dst, src []byte) (ret int) {
 					// if the match length (4..18) fits within the literals, then copy
 					// all 18 bytes, even if not all are part of the literals.
 					mLen += 4
-					if offset := u16(src[si:]); mLen <= offset {
+					if offset := u16(src[si:]); mLen <= offset && offset < di {
 						i := di - offset
 						end := i + 18
 						if end > uint(len(dst)) {
@@ -91,6 +94,38 @@ func decodeBlock(dst, src []byte) (ret int) {
 		mLen += minMatch
 
 		// Copy the match.
+		if di < offset {
+			// The match starts before the beginning of our block, i.e. in the dictionary.
+			if offset-di > mLen {
+				// The match is entirely contained in the dictionary. Just copy!
+				copy(dst[di:di+mLen], dict[dictLen+di-offset:dictLen+di-offset+mLen])
+				di = di + mLen
+			} else {
+				// The match stretches over the dictionary and our block
+				copySize := offset - di
+				restSize := mLen - copySize
+
+				copy(dst[di:di+copySize], dict[dictLen-copySize:])
+				di = di + copySize
+
+				if di < restSize {
+					// Overlap: the rest of the match is longer than the bytes already
+					// written to dst, so copy byte by byte.
+					copyFrom := 0
+					endOfMatch := di + restSize
+					for di < endOfMatch {
+						dst[di] = dst[copyFrom]
+						di = di + 1
+						copyFrom = copyFrom + 1
+					}
+				} else {
+					copy(dst[di:di+restSize], dst[0:restSize])
+					di = di + restSize
+				}
+			}
+			continue
+		}
+
 		expanded := dst[di-offset:]
 		if mLen > offset {
 			// Efficiently copy the match dst[di-offset:di] into the dst slice.

diff --git a/internal/lz4block/decode_test.go b/internal/lz4block/decode_test.go
@@ -125,7 +125,7 @@ func TestBlockDecode(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			buf := make([]byte, len(test.exp))
-			n := decodeBlock(buf, test.src)
+			n := decodeBlock(buf, test.src, nil)
 			if n < 0 {
 				t.Log(n)
 			}
@@ -167,7 +167,7 @@ func TestDecodeBlockInvalid(t *testing.T) {
 			}
 			dst = dst[:test.size]
 
-			r := decodeBlock(dst, []byte(test.src))
+			r := decodeBlock(dst, []byte(test.src), nil)
 			if r >= 0 {
 				t.Errorf("no error for %s", test.name)
 			}

diff --git a/internal/lz4stream/block.go b/internal/lz4stream/block.go
@@ -127,7 +127,7 @@ func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) {
 			blocks <- c
 			go func() {
 				defer block.Close(f)
-				data, err := block.Uncompress(f, size.Get(), false)
+				data, err := block.Uncompress(f, size.Get(), nil, false)
 				if err != nil {
 					b.closeR(err)
 				} else {
@@ -303,12 +303,12 @@ func (b *FrameDataBlock) Read(f *Frame, src io.Reader, cum uint32) (uint32, erro
 	return x, nil
 }
 
-func (b *FrameDataBlock) Uncompress(f *Frame, dst []byte, sum bool) ([]byte, error) {
+func (b *FrameDataBlock) Uncompress(f *Frame, dst, dict []byte, sum bool) ([]byte, error) {
 	if b.Size.Uncompressed() {
 		n := copy(dst, b.data)
 		dst = dst[:n]
 	} else {
-		n, err := lz4block.UncompressBlock(b.data, dst)
+		n, err := lz4block.UncompressBlock(b.data, dst, dict)
 		if err != nil {
 			return nil, err
 		}

diff --git a/internal/lz4stream/frame.go b/internal/lz4stream/frame.go
@@ -77,36 +77,40 @@ func (f *Frame) isLegacy() bool {
 	return f.Magic == frameMagicLegacy
 }
 
-func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
+func (f *Frame) ParseHeaders(src io.Reader) error {
 	if f.Magic > 0 {
 		// Header already read.
-		return nil, nil
+		return nil
 	}
 
 newFrame:
 	var err error
 	if f.Magic, err = f.readUint32(src); err != nil {
-		return nil, err
+		return err
 	}
 	switch m := f.Magic; {
 	case m == frameMagic || m == frameMagicLegacy:
 	// All 16 values of frameSkipMagic are valid.
 	case m>>8 == frameSkipMagic>>8:
 		skip, err := f.readUint32(src)
 		if err != nil {
-			return nil, err
+			return err
 		}
 		if _, err := io.CopyN(ioutil.Discard, src, int64(skip)); err != nil {
-			return nil, err
+			return err
 		}
 		goto newFrame
 	default:
-		return nil, lz4errors.ErrInvalidFrame
+		return lz4errors.ErrInvalidFrame
 	}
 	if err := f.Descriptor.initR(f, src); err != nil {
-		return nil, err
+		return err
 	}
 	f.checksum.Reset()
+	return nil
+}
+
+func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
 	return f.Blocks.initR(f, num, src)
 }