generated from maragudk/template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchunk_test.go
157 lines (149 loc) · 4.71 KB
/
chunk_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
package gai_test
import (
"context"
"testing"
"maragu.dev/gai"
"maragu.dev/is"
)
// TestFixedSizeChunker checks the output of the fixed-size chunker from the
// gai package when it is configured with a NaiveWordTokenizer.
//
// Every table entry supplies a size, an overlap fraction and an input text,
// together with the exact chunks the test expects back. In the expected
// values, full chunks hold "size" space-separated tokens and punctuation
// appears as a token of its own.
func TestFixedSizeChunker(t *testing.T) {
	t.Run("Chunk", func(t *testing.T) {
		// chunkCase describes one chunker configuration, its input and the
		// chunks it is expected to produce.
		type chunkCase struct {
			name     string
			size     int
			overlap  float64
			text     string
			expected []string
		}

		cases := []chunkCase{
			{
				name:     "empty text",
				size:     5,
				overlap:  0,
				text:     "",
				expected: nil,
			},
			{
				name:     "text shorter than chunk size",
				size:     10,
				overlap:  0,
				text:     "This is a short text.",
				expected: []string{"This is a short text ."},
			},
			{
				name:    "simple chunks no overlap",
				size:    5,
				overlap: 0,
				text:    "The quick brown fox jumps over the lazy dog. The fox was very quick indeed.",
				expected: []string{
					"The quick brown fox jumps",
					"over the lazy dog .",
					"The fox was very quick",
					"indeed .",
				},
			},
			{
				name:    "chunks with 0.2 overlap",
				size:    5,
				overlap: 0.2,
				text:    "The quick brown fox jumps over the lazy dog. The fox was very quick indeed.",
				expected: []string{
					"The quick brown fox jumps",
					"jumps over the lazy dog",
					"dog . The fox was",
					"was very quick indeed .",
				},
			},
			{
				name:    "chunks with 0.5 overlap",
				size:    6,
				overlap: 0.5,
				text:    "Machine learning models can process natural language to perform various tasks such as translation sentiment analysis and text generation.",
				expected: []string{
					"Machine learning models can process natural",
					"can process natural language to perform",
					"language to perform various tasks such",
					"various tasks such as translation sentiment",
					"as translation sentiment analysis and text",
					"analysis and text generation .",
				},
			},
			{
				name:    "longer text realistic case",
				size:    10,
				overlap: 0.3,
				text:    "Effective chunking strategies are crucial for processing large documents in natural language processing applications. When working with language models that have token limits, proper text segmentation ensures that context is preserved across segments. Overlapping chunks can help maintain coherence between segments, avoiding information loss at chunk boundaries. The ideal chunk size and overlap ratio depend on the specific use case and the characteristics of the text being processed.",
				expected: []string{
					"Effective chunking strategies are crucial for processing large documents in",
					"large documents in natural language processing applications . When working",
					". When working with language models that have token limits",
					"have token limits , proper text segmentation ensures that context",
					"ensures that context is preserved across segments . Overlapping chunks",
					". Overlapping chunks can help maintain coherence between segments ,",
					"between segments , avoiding information loss at chunk boundaries .",
					"chunk boundaries . The ideal chunk size and overlap ratio",
					"and overlap ratio depend on the specific use case and",
					"use case and the characteristics of the text being processed",
					"text being processed .",
				},
			},
			{
				name:    "high overlap (0.9)",
				size:    5,
				overlap: 0.9,
				text:    "One two three four five six seven eight nine ten eleven twelve.",
				expected: []string{
					"One two three four five",
					"two three four five six",
					"three four five six seven",
					"four five six seven eight",
					"five six seven eight nine",
					"six seven eight nine ten",
					"seven eight nine ten eleven",
					"eight nine ten eleven twelve",
					"nine ten eleven twelve .",
				},
			},
			{
				name:    "full overlap (1.0)",
				size:    3,
				overlap: 1.0,
				text:    "Alpha beta gamma delta epsilon zeta eta.",
				expected: []string{
					"Alpha beta gamma",
					"beta gamma delta",
					"gamma delta epsilon",
					"delta epsilon zeta",
					"epsilon zeta eta",
					"zeta eta .",
				},
			},
		}

		// One subtest per table entry, named after the entry.
		for _, tc := range cases {
			t.Run(tc.name, func(t *testing.T) {
				// Build the chunker from this entry's size and overlap.
				opts := gai.NewFixedSizeChunkerOptions{
					Tokenizer: &gai.NaiveWordTokenizer{},
					Size:      tc.size,
					Overlap:   tc.overlap,
				}
				chunker := gai.NewFixedSizeChunker(opts)

				actual := chunker.Chunk(context.Background(), tc.text)

				// Log both sides in full so a failing run shows every chunk,
				// not just the first mismatch.
				t.Logf("Got %d chunks:", len(actual))
				for n, c := range actual {
					t.Logf(" %d: %q", n, c)
				}
				t.Logf("Expected %d chunks:", len(tc.expected))
				for n, c := range tc.expected {
					t.Logf(" %d: %q", n, c)
				}

				// Compare the chunk count first, then the chunks pairwise.
				is.Equal(t, len(tc.expected), len(actual))
				for n, c := range actual {
					if n >= len(tc.expected) {
						// No expected chunk left to compare against.
						break
					}
					is.Equal(t, tc.expected[n], c)
				}
			})
		}
	})
}