diff --git a/bench_test.go b/bench_test.go
index 6111baf9..eadfcdf3 100644
--- a/bench_test.go
+++ b/bench_test.go
@@ -63,7 +63,7 @@ func BenchmarkUncompress(b *testing.B) {
 	b.ResetTimer()
 
 	for i := 0; i < b.N; i++ {
-		_, _ = lz4block.UncompressBlock(pg1661LZ4, buf)
+		_, _ = lz4block.UncompressBlock(pg1661LZ4, buf, nil)
 	}
 }
 
diff --git a/fuzz/lz4.go b/fuzz/lz4.go
index 13a107ba..f199b1e5 100644
--- a/fuzz/lz4.go
+++ b/fuzz/lz4.go
@@ -81,7 +81,7 @@ func FuzzUncompressBlock(data []byte) int {
 	}
 	decomp = decomp[:len(data)]
 
-	n, err := lz4.UncompressBlock(data, decomp)
+	n, err := lz4.UncompressBlock(data, decomp, nil)
 	if n > len(decomp) {
 		panic("uncompressed length greater than buffer")
 	}
diff --git a/internal/lz4block/block.go b/internal/lz4block/block.go
index 8b971da5..88319105 100644
--- a/internal/lz4block/block.go
+++ b/internal/lz4block/block.go
@@ -41,11 +41,11 @@ func CompressBlockBound(n int) int {
 	return n + n/255 + 16
 }
 
-func UncompressBlock(src, dst []byte) (int, error) {
+func UncompressBlock(src, dst, dict []byte) (int, error) {
 	if len(src) == 0 {
 		return 0, nil
 	}
-	if di := decodeBlock(dst, src); di >= 0 {
+	if di := decodeBlock(dst, src, dict); di >= 0 {
 		return di, nil
 	}
 	return 0, lz4errors.ErrInvalidSourceShortBuffer
diff --git a/internal/lz4block/block_test.go b/internal/lz4block/block_test.go
index bd068651..9a9b5d8b 100644
--- a/internal/lz4block/block_test.go
+++ b/internal/lz4block/block_test.go
@@ -59,7 +59,7 @@ func TestCompressUncompressBlock(t *testing.T) {
 
 		// Uncompress the data.
 		buf := make([]byte, len(src))
-		n, err = lz4block.UncompressBlock(zbuf, buf)
+		n, err = lz4block.UncompressBlock(zbuf, buf, nil)
 		if err != nil {
 			t.Fatal(err)
 		} else if n < 0 || n > len(buf) {
diff --git a/internal/lz4block/decode_amd64.s b/internal/lz4block/decode_amd64.s
index dd323300..dfcca572 100644
--- a/internal/lz4block/decode_amd64.s
+++ b/internal/lz4block/decode_amd64.s
@@ -16,9 +16,11 @@
 // R11 &dst
 // R12 short output end
 // R13 short input end
+// R14 &dict
+// R15 &dict + len(dict)
 
-// func decodeBlock(dst, src []byte) int
-TEXT ·decodeBlock(SB), NOSPLIT, $48-56
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOSPLIT, $48-80
 	MOVQ dst_base+0(FP), DI
 	MOVQ DI, R11
 	MOVQ dst_len+8(FP), R8
@@ -30,6 +32,10 @@ TEXT ·decodeBlock(SB), NOSPLIT, $48-56
 	JE   err_corrupt
 	ADDQ SI, R9
 
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	ADDQ R14, R15
+
 	// shortcut ends
 	// short output end
 	MOVQ R8, R12
@@ -96,6 +102,8 @@ loop:
 	// match length, we already have the offset.
 	CMPQ CX, $0xF
 	JEQ match_len_loop_pre
+	CMPQ DX, R11
+	JLT match_len_loop_pre
 	CMPQ DX, $8
 	JLT match_len_loop_pre
 	CMPQ AX, R11
@@ -280,7 +288,7 @@ copy_match:
 	// check BX is within dst
 	// if BX < &dst
 	CMPQ BX, R11
-	JLT err_short_buf
+	JLT copy_match_from_dict
 
 	// if offset + match_len < di
 	LEAQ (BX)(CX*1), AX
@@ -327,6 +335,81 @@ copy_interior_match:
 	ADDQ CX, DI
 	JMP loop
 
+copy_match_from_dict:
+	// CX = match_len
+	// BX = &dst + (di - offset)
+
+	// AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
+	MOVQ R11, AX
+	SUBQ BX, AX
+
+	// BX = &dict_end - dict_bytes_available
+	MOVQ R15, BX
+	SUBQ AX, BX
+
+	// check BX is within dict
+	// if BX < &dict
+	CMPQ BX, R14
+	JLT err_short_dict
+
+	// if match_len < dict_bytes_available, the match fits entirely within the external dictionary: just copy
+	CMPQ CX, AX
+	JLT memmove_match
+
+	// The match stretches over the dictionary and our block
+	// 1) copy what comes from the dictionary
+	// AX = dict_bytes_available = copy_size
+	// BX = &dict_end - copy_size
+	// CX = match_len
+
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ AX, 16(SP)
+	// store extra stuff we want to recover
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 16(SP), AX // copy_size
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX // match_len
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	ADDQ R14, R15
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+	// di+=copy_size
+	ADDQ AX, DI
+
+	// 2) copy the rest from the current block
+	// CX = match_len - copy_size = rest_size
+	SUBQ AX, CX
+	MOVQ R11, BX
+
+	// check if we have a copy overlap
+	// AX = &dst + rest_size
+	MOVQ CX, AX
+	ADDQ BX, AX
+	// if &dst + rest_size > di, copy byte by byte
+	CMPQ AX, DI
+
+	JGT copy_match_loop
+
 memmove_match:
 	// memmove(to, from, len)
 	MOVQ DI, 0(SP)
@@ -354,18 +437,25 @@ memmove_match:
 	SUBQ $32, R12
 	MOVQ R9, R13
 	SUBQ $16, R13
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	ADDQ R14, R15
 
 	JMP loop
 
 err_corrupt:
-	MOVQ $-1, ret+48(FP)
+	MOVQ $-1, ret+72(FP)
 	RET
 
 err_short_buf:
-	MOVQ $-2, ret+48(FP)
+	MOVQ $-2, ret+72(FP)
+	RET
+
+err_short_dict:
+	MOVQ $-3, ret+72(FP)
 	RET
 
 end:
 	SUBQ R11, DI
-	MOVQ DI, ret+48(FP)
+	MOVQ DI, ret+72(FP)
 	RET
diff --git a/internal/lz4block/decode_arm.s b/internal/lz4block/decode_arm.s
index 64be9adc..defb00c7 100644
--- a/internal/lz4block/decode_arm.s
+++ b/internal/lz4block/decode_arm.s
@@ -19,12 +19,12 @@
 
 #define minMatch	$4
 
-// func decodeBlock(dst, src []byte) int
-TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-28
-	MOVW dst_base +0(FP), dst
-	MOVW dst_len  +4(FP), dstend
-	MOVW src_base+12(FP), src
-	MOVW src_len +16(FP), srcend
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-40
+	MOVW dst_base  +0(FP), dst
+	MOVW dst_len   +4(FP), dstend
+	MOVW src_base +12(FP), src
+	MOVW src_len  +16(FP), srcend
 
 	CMP $0, srcend
 	BEQ shortSrc
@@ -183,7 +183,7 @@ copyMatchDone:
 
 end:
 	SUB  dstorig, dst, tmp1
-	MOVW tmp1, ret+24(FP)
+	MOVW tmp1, ret+36(FP)
 	RET
 
 	// The three error cases have distinct labels so we can put different
@@ -193,5 +193,5 @@ shortDst:
 shortSrc:
 corrupt:
 	MOVW $-1, tmp1
-	MOVW tmp1, ret+24(FP)
+	MOVW tmp1, ret+36(FP)
 	RET
diff --git a/internal/lz4block/decode_asm.go b/internal/lz4block/decode_asm.go
index e26f8cd6..ca4b11aa 100644
--- a/internal/lz4block/decode_asm.go
+++ b/internal/lz4block/decode_asm.go
@@ -6,4 +6,4 @@
 package lz4block
 
 //go:noescape
-func decodeBlock(dst, src []byte) int
+func decodeBlock(dst, src, dict []byte) int
diff --git a/internal/lz4block/decode_other.go b/internal/lz4block/decode_other.go
index 52df2f2b..7ab0cd6e 100644
--- a/internal/lz4block/decode_other.go
+++ b/internal/lz4block/decode_other.go
@@ -2,12 +2,15 @@
 
 package lz4block
 
-import "encoding/binary"
+import (
+	"encoding/binary"
+)
 
-func decodeBlock(dst, src []byte) (ret int) {
+func decodeBlock(dst, src, dict []byte) (ret int) {
 	// Restrict capacities so we don't read or write out of bounds.
 	dst = dst[:len(dst):len(dst)]
 	src = src[:len(src):len(src)]
+	dictLen := uint(len(dict))
 
 	const hasError = -2
 	defer func() {
@@ -38,7 +41,7 @@ func decodeBlock(dst, src []byte) (ret int) {
 					// if the match length (4..18) fits within the literals, then copy
 					// all 18 bytes, even if not all are part of the literals.
 					mLen += 4
-					if offset := u16(src[si:]); mLen <= offset {
+					if offset := u16(src[si:]); mLen <= offset && offset < di {
 						i := di - offset
 						end := i + 18
 						if end > uint(len(dst)) {
@@ -91,6 +94,38 @@ func decodeBlock(dst, src []byte) (ret int) {
 		mLen += minMatch
 
 		// Copy the match.
+		if di < offset {
+			// The match starts before the beginning of our block, i.e. in the dictionary.
+			if offset-di > mLen {
+				// The match is entirely contained in the dictionary. Just copy!
+				copy(dst[di:di+mLen], dict[dictLen+di-offset:dictLen+di-offset+mLen])
+				di = di + mLen
+			} else {
+				// The match stretches over the dictionary and our block
+				copySize := offset - di
+				restSize := mLen - copySize
+
+				copy(dst[di:di+copySize], dict[dictLen-copySize:])
+				di = di + copySize
+
+				if di < restSize {
+					// Overlap: the rest of the match is longer than the data already in dst,
+					// so the source and destination ranges overlap; copy byte by byte.
+					copyFrom := 0
+					endOfMatch := di + restSize
+					for di < endOfMatch {
+						dst[di] = dst[copyFrom]
+						di = di + 1
+						copyFrom = copyFrom + 1
+					}
+				} else {
+					copy(dst[di:di+restSize], dst[0:restSize])
+					di = di + restSize
+				}
+			}
+			continue
+		}
+
 		expanded := dst[di-offset:]
 		if mLen > offset {
 			// Efficiently copy the match dst[di-offset:di] into the dst slice.
diff --git a/internal/lz4block/decode_test.go b/internal/lz4block/decode_test.go
index 017ee03f..6ba9ee6f 100644
--- a/internal/lz4block/decode_test.go
+++ b/internal/lz4block/decode_test.go
@@ -125,7 +125,7 @@ func TestBlockDecode(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			buf := make([]byte, len(test.exp))
-			n := decodeBlock(buf, test.src)
+			n := decodeBlock(buf, test.src, nil)
 			if n < 0 {
 				t.Log(n)
 			}
@@ -167,7 +167,7 @@ func TestDecodeBlockInvalid(t *testing.T) {
 			}
 			dst = dst[:test.size]
 
-			r := decodeBlock(dst, []byte(test.src))
+			r := decodeBlock(dst, []byte(test.src), nil)
 			if r >= 0 {
 				t.Errorf("no error for %s", test.name)
 			}
diff --git a/internal/lz4stream/block.go b/internal/lz4stream/block.go
index c7b929fd..446c74a4 100644
--- a/internal/lz4stream/block.go
+++ b/internal/lz4stream/block.go
@@ -127,7 +127,7 @@ func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) {
 			blocks <- c
 			go func() {
 				defer block.Close(f)
-				data, err := block.Uncompress(f, size.Get(), false)
+				data, err := block.Uncompress(f, size.Get(), nil, false)
 				if err != nil {
 					b.closeR(err)
 				} else {
@@ -303,12 +303,12 @@ func (b *FrameDataBlock) Read(f *Frame, src io.Reader, cum uint32) (uint32, erro
 	return x, nil
 }
 
-func (b *FrameDataBlock) Uncompress(f *Frame, dst []byte, sum bool) ([]byte, error) {
+func (b *FrameDataBlock) Uncompress(f *Frame, dst, dict []byte, sum bool) ([]byte, error) {
 	if b.Size.Uncompressed() {
 		n := copy(dst, b.data)
 		dst = dst[:n]
 	} else {
-		n, err := lz4block.UncompressBlock(b.data, dst)
+		n, err := lz4block.UncompressBlock(b.data, dst, dict)
 		if err != nil {
 			return nil, err
 		}
diff --git a/internal/lz4stream/frame.go b/internal/lz4stream/frame.go
index cfbd5674..18192a94 100644
--- a/internal/lz4stream/frame.go
+++ b/internal/lz4stream/frame.go
@@ -77,16 +77,16 @@ func (f *Frame) isLegacy() bool {
 	return f.Magic == frameMagicLegacy
 }
 
-func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
+func (f *Frame) ParseHeaders(src io.Reader) error {
 	if f.Magic > 0 {
 		// Header already read.
-		return nil, nil
+		return nil
 	}
 
 newFrame:
 	var err error
 	if f.Magic, err = f.readUint32(src); err != nil {
-		return nil, err
+		return err
 	}
 	switch m := f.Magic; {
 	case m == frameMagic || m == frameMagicLegacy:
@@ -94,19 +94,23 @@ newFrame:
 	case m>>8 == frameSkipMagic>>8:
 		skip, err := f.readUint32(src)
 		if err != nil {
-			return nil, err
+			return err
 		}
 		if _, err := io.CopyN(ioutil.Discard, src, int64(skip)); err != nil {
-			return nil, err
+			return err
 		}
 		goto newFrame
 	default:
-		return nil, lz4errors.ErrInvalidFrame
+		return lz4errors.ErrInvalidFrame
 	}
 	if err := f.Descriptor.initR(f, src); err != nil {
-		return nil, err
+		return err
 	}
 	f.checksum.Reset()
+	return nil
+}
+
+func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
 	return f.Blocks.initR(f, num, src)
 }
 
diff --git a/internal/lz4stream/frame_test.go b/internal/lz4stream/frame_test.go
index 907a8071..1f850c07 100644
--- a/internal/lz4stream/frame_test.go
+++ b/internal/lz4stream/frame_test.go
@@ -99,7 +99,7 @@ func TestFrameDataBlock(t *testing.T) {
 				t.Fatal(err)
 			}
 			buf := make([]byte, size)
-			buf, err := block.Uncompress(f, buf, false)
+			buf, err := block.Uncompress(f, buf, nil, false)
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/lz4.go b/lz4.go
index c585d406..a62022e0 100644
--- a/lz4.go
+++ b/lz4.go
@@ -35,7 +35,17 @@ func CompressBlockBound(n int) int {
 //
 // An error is returned if the source data is invalid or the destination buffer is too small.
 func UncompressBlock(src, dst []byte) (int, error) {
-	return lz4block.UncompressBlock(src, dst)
+	return lz4block.UncompressBlock(src, dst, nil)
+}
+
+// UncompressBlockWithDict uncompresses the source buffer into the destination one using a
+// dictionary, and returns the uncompressed size.
+//
+// The destination buffer must be sized appropriately.
+//
+// An error is returned if the source data is invalid or the destination buffer is too small.
+func UncompressBlockWithDict(src, dst, dict []byte) (int, error) {
+	return lz4block.UncompressBlock(src, dst, dict)
 }
 
 // A Compressor compresses data into the LZ4 block format.
diff --git a/options.go b/options.go
index 4e1b6703..46a87380 100644
--- a/options.go
+++ b/options.go
@@ -2,10 +2,11 @@ package lz4
 
 import (
 	"fmt"
-	"github.com/pierrec/lz4/v4/internal/lz4block"
-	"github.com/pierrec/lz4/v4/internal/lz4errors"
 	"reflect"
 	"runtime"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
 )
 
 //go:generate go run golang.org/x/tools/cmd/stringer -type=BlockSize,CompressionLevel -output options_gen.go
diff --git a/reader.go b/reader.go
index 403aaf69..f8458807 100644
--- a/reader.go
+++ b/reader.go
@@ -40,6 +40,7 @@ type Reader struct {
 	idx     int              // size of pending data
 	handler func(int)
 	cum     uint32
+	dict    []byte
 }
 
 func (*Reader) private() {}
@@ -77,6 +78,15 @@ func (r *Reader) isNotConcurrent() bool {
 }
 
 func (r *Reader) init() error {
+	err := r.frame.ParseHeaders(r.src)
+	if err != nil {
+		return err
+	}
+	if !r.frame.Descriptor.Flags.BlockIndependence() {
+		// We can't decompress dependent blocks concurrently.
+		// Instead of throwing an error to the user, silently drop concurrency
+		r.num = 1
+	}
 	data, err := r.frame.InitR(r.src, r.num)
 	if err != nil {
 		return err
@@ -162,10 +172,20 @@ func (r *Reader) read(buf []byte) (int, error) {
 		direct = true
 		dst = buf
 	}
-	dst, err = block.Uncompress(r.frame, dst, true)
+	dst, err = block.Uncompress(r.frame, dst, r.dict, true)
 	if err != nil {
 		return 0, err
 	}
+	if !r.frame.Descriptor.Flags.BlockIndependence() {
+		if len(r.dict)+len(dst) > 128*1024 {
+			preserveSize := 64*1024 - len(dst)
+			if preserveSize < 0 {
+				preserveSize = 0
+			}
+			r.dict = r.dict[len(r.dict)-preserveSize:]
+		}
+		r.dict = append(r.dict, dst...)
+	}
 	r.cum += uint32(len(dst))
 	if direct {
 		return len(dst), nil
diff --git a/reader_test.go b/reader_test.go
index 7c5beebc..ab3c2bc1 100644
--- a/reader_test.go
+++ b/reader_test.go
@@ -40,6 +40,10 @@ func TestReader(t *testing.T) {
 			name:   "testdata/Mark.Twain-Tom.Sawyer_long.txt.lz4",
 			isText: true,
 		},
+		{
+			name:   "testdata/Mark.Twain-Tom.Sawyer_linked.txt.lz4",
+			isText: true,
+		},
 		{
 			name:   "testdata/pg1661.txt.lz4",
 			isText: false,
diff --git a/testdata/Mark.Twain-Tom.Sawyer_linked.txt b/testdata/Mark.Twain-Tom.Sawyer_linked.txt
new file mode 120000
index 00000000..40907708
--- /dev/null
+++ b/testdata/Mark.Twain-Tom.Sawyer_linked.txt
@@ -0,0 +1 @@
+Mark.Twain-Tom.Sawyer_long.txt
\ No newline at end of file
diff --git a/testdata/Mark.Twain-Tom.Sawyer_linked.txt.lz4 b/testdata/Mark.Twain-Tom.Sawyer_linked.txt.lz4
new file mode 100644
index 00000000..f78f77dd
Binary files /dev/null and b/testdata/Mark.Twain-Tom.Sawyer_linked.txt.lz4 differ
diff --git a/writer_test.go b/writer_test.go
index e43b0fcb..a945f84a 100644
--- a/writer_test.go
+++ b/writer_test.go
@@ -175,7 +175,7 @@ func TestIssue51(t *testing.T) {
 	zbuf = zbuf[:n]
 
 	buf := make([]byte, 8192)
-	n, err = lz4block.UncompressBlock(zbuf, buf)
+	n, err = lz4block.UncompressBlock(zbuf, buf, nil)
 	if err != nil {
 		t.Fatal(err)
 	}