@@ -26,7 +26,7 @@ public interface IdGenerator {

/**
* Generate a unique ID for the given content. Note: some generators, such as the
- * random generator might not dependant on or use the content parameters.
+ * random generator, might not depend on or use the content parameters.
* @param contents the content to generate an ID for.
* @return the generated ID.
*/
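For context, a minimal sketch of the kind of content-independent generator the javadoc describes. It assumes the interface's single method is String generateId(Object... contents) (the signature is truncated in this hunk), and the class name is hypothetical:

import java.util.UUID;

// Hypothetical sketch, not part of this PR: a generator that, as the javadoc
// notes, neither depends on nor uses the content parameters.
public class RandomUuidIdGenerator implements IdGenerator {

    @Override
    public String generateId(Object... contents) {
        // The contents are deliberately ignored; every call yields a fresh ID.
        return UUID.randomUUID().toString();
    }

}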
@@ -63,44 +63,61 @@ private List<Document> doSplitDocuments(List<Document> documents) {
List<String> texts = new ArrayList<>();
List<Map<String, Object>> metadataList = new ArrayList<>();
List<ContentFormatter> formatters = new ArrayList<>();
+List<Double> scores = new ArrayList<>();
+List<String> originalIds = new ArrayList<>();

for (Document doc : documents) {
texts.add(doc.getText());
metadataList.add(doc.getMetadata());
formatters.add(doc.getContentFormatter());
+scores.add(doc.getScore());
+originalIds.add(doc.getId());
}

-return createDocuments(texts, formatters, metadataList);
+return createDocuments(texts, formatters, metadataList, scores, originalIds);
}

private List<Document> createDocuments(List<String> texts, List<ContentFormatter> formatters,
-List<Map<String, Object>> metadataList) {
+List<Map<String, Object>> metadataList, List<Double> scores, List<String> originalIds) {

// Process the data in a column-oriented way and recreate the Documents
List<Document> documents = new ArrayList<>();

for (int i = 0; i < texts.size(); i++) {
String text = texts.get(i);
Map<String, Object> metadata = metadataList.get(i);
+Double originalScore = scores.get(i);
+String originalId = originalIds.get(i);

List<String> chunks = splitText(text);
if (chunks.size() > 1) {
logger.info("Splitting up document into " + chunks.size() + " chunks.");
}
-for (String chunk : chunks) {
-// only primitive values are in here -
-Map<String, Object> metadataCopy = metadata.entrySet()
+for (int chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) {
+String chunk = chunks.get(chunkIndex);
+Map<String, Object> enhancedMetadata = metadata.entrySet()
.stream()
.filter(e -> e.getKey() != null && e.getValue() != null)
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
-Document newDoc = new Document(chunk, metadataCopy);

enhancedMetadata.put("parent_document_id", originalId);
enhancedMetadata.put("chunk_index", chunkIndex);
enhancedMetadata.put("total_chunks", chunks.size());

+Document newDoc = Document.builder()
+    .text(chunk)
+    .metadata(enhancedMetadata)
+    .score(originalScore)
+    .build();

if (this.copyContentFormatter) {
// Transfer the content-formatter of the parent to the chunked
-// documents it was slit into.
+// documents it was split into.
newDoc.setContentFormatter(formatters.get(i));
}

// TODO copy over other properties.
documents.add(newDoc);
}
}
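Taken together, these changes mean every chunk now carries its parent's score plus three new metadata keys. A minimal usage sketch, assuming TokenTextSplitter as the concrete TextSplitter and the Document builder API exercised in the tests below (package names follow Spring AI's usual layout):

import java.util.List;
import java.util.Map;

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;

public class SplitterMetadataDemo {

    public static void main(String[] args) {
        Document parent = Document.builder()
            .text("A long passage that the splitter will break into several chunks ...")
            .metadata(Map.of("source", "example.txt"))
            .score(0.9)
            .build();

        List<Document> chunks = new TokenTextSplitter().apply(List.of(parent));

        for (Document chunk : chunks) {
            Map<String, Object> md = chunk.getMetadata();
            // Each chunk keeps the parent's score and gains the three new keys.
            System.out.printf("parent=%s chunk=%s/%s score=%s%n", md.get("parent_document_id"),
                    md.get("chunk_index"), md.get("total_chunks"), chunk.getScore());
        }
    }

}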
@@ -82,9 +82,22 @@ public void testSplitText() {
assertThat(chunks.get(3).getText())
.isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");

-// Verify that the same, merged metadata is copied to all chunks.
-assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
+// Verify that the original metadata is copied to all chunks (including
+// chunk-specific fields)
+assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+		"total_chunks");

+// Verify chunk indices are correct
+assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
+assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
+assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
+assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");

@@ -148,7 +161,6 @@ public void pageNoChunkSplit() {
@Test
public void pageWithChunkSplit() {
// given

var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
+ "1The most oppressive thing about the labyrinth is that you are constantly "
+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
@@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() {
assertThat(chunks.get(0).getText()).isEqualTo("In the end, writing arises when man");
assertThat(chunks.get(1).getText()).isEqualTo(" realizes that memory is not enough.");

-// Verify that the same, merged metadata is copied to all chunks.
-assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
+// Verify that the original metadata is copied to all chunks (with chunk-specific
+// fields)
+assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
+		"total_chunks");

+// Verify chunk indices are different
+assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);

// Verify that the content formatters are copied from the parents to the chunks.
assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
}

@Test
public void testScorePreservation() {
// given
Double originalScore = 0.95;
var doc = Document.builder()
.text("This is a test document that will be split into multiple chunks.")
.metadata(Map.of("source", "test.txt"))
.score(originalScore)
.build();

// when
List<Document> chunks = testTextSplitter.apply(List.of(doc));

// then
assertThat(chunks).hasSize(2);
assertThat(chunks.get(0).getScore()).isEqualTo(originalScore);
assertThat(chunks.get(1).getScore()).isEqualTo(originalScore);
}

@Test
public void testParentDocumentTracking() {
// given
var doc1 = new Document("First document content for testing splitting functionality.",
Map.of("source", "doc1.txt"));
var doc2 = new Document("Second document content for testing splitting functionality.",
Map.of("source", "doc2.txt"));

String originalId1 = doc1.getId();
String originalId2 = doc2.getId();

// when
List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));

// then
assertThat(chunks).hasSize(4);

// Verify parent document tracking for doc1 chunks
assertThat(chunks.get(0).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
assertThat(chunks.get(1).getMetadata().get("parent_document_id")).isEqualTo(originalId1);

// Verify parent document tracking for doc2 chunks
assertThat(chunks.get(2).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
assertThat(chunks.get(3).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
}

@Test
public void testChunkMetadataInformation() {
// given
var doc = new Document("This is a longer document that will be split into exactly two chunks for testing.",
Map.of("source", "test.txt"));

// when
List<Document> chunks = testTextSplitter.apply(List.of(doc));

// then
assertThat(chunks).hasSize(2);

// Verify chunk index and total chunks for first chunk
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
assertThat(chunks.get(0).getMetadata().get("total_chunks")).isEqualTo(2);

// Verify chunk index and total chunks for second chunk
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
assertThat(chunks.get(1).getMetadata().get("total_chunks")).isEqualTo(2);

// Verify original metadata is preserved
assertThat(chunks.get(0).getMetadata().get("source")).isEqualTo("test.txt");
assertThat(chunks.get(1).getMetadata().get("source")).isEqualTo("test.txt");
}

@Test
public void testEnhancedMetadataWithMultipleDocuments() {
// given
var doc1 = Document.builder()
.text("First document with score and metadata.")
.metadata(Map.of("type", "article", "priority", "high"))
.score(0.8)
.build();

var doc2 = Document.builder()
.text("Second document with different score.")
.metadata(Map.of("type", "report", "priority", "medium"))
.score(0.6)
.build();

String originalId1 = doc1.getId();
String originalId2 = doc2.getId();

// when
List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));

// then
assertThat(chunks).hasSize(4);

// Verify first document chunks
for (int i = 0; i < 2; i++) {
Document chunk = chunks.get(i);
assertThat(chunk.getScore()).isEqualTo(0.8);
assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId1);
assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i);
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
assertThat(chunk.getMetadata().get("type")).isEqualTo("article");
assertThat(chunk.getMetadata().get("priority")).isEqualTo("high");
}

// Verify second document chunks
for (int i = 2; i < 4; i++) {
Document chunk = chunks.get(i);
assertThat(chunk.getScore()).isEqualTo(0.6);
assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId2);
assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i - 2);
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
assertThat(chunk.getMetadata().get("type")).isEqualTo("report");
assertThat(chunk.getMetadata().get("priority")).isEqualTo("medium");
}
}

}
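One way downstream code can use the new keys, sketched as a hypothetical helper that is not part of this PR: group chunks by parent_document_id and restore their original order via chunk_index before rejoining the text.

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.springframework.ai.document.Document;

// Hypothetical helper, not part of this PR: reassembles chunk text per parent.
public final class ChunkRegrouper {

    public static Map<Object, String> reassemble(List<Document> chunks) {
        return chunks.stream()
            .collect(Collectors.groupingBy(chunk -> chunk.getMetadata().get("parent_document_id"),
                    Collectors.collectingAndThen(Collectors.toList(),
                            group -> group.stream()
                                // chunk_index is stored as an int, so the cast is safe.
                                .sorted(Comparator.comparingInt(
                                        (Document d) -> (int) d.getMetadata().get("chunk_index")))
                                .map(Document::getText)
                                .collect(Collectors.joining()))));
    }

}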
@@ -104,9 +104,22 @@ public void testTokenTextSplitterBuilderWithAllFields() {
assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but");
assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");

-// Verify that the same, merged metadata is copied to all chunks.
-assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
+// Verify that the original metadata is copied to all chunks (including
+// chunk-specific fields)
+assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+		"total_chunks");
+assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+		"total_chunks");

+// Verify chunk indices are correct
+assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
+assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);

assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
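For reference, the fully configured splitter this test name alludes to is built roughly as follows; the with* method names reflect TokenTextSplitter's builder, and the values are illustrative:

// Illustrative values; verify the builder method names against the Spring AI
// version in use.
TokenTextSplitter splitter = TokenTextSplitter.builder()
    .withChunkSize(100)
    .withMinChunkSizeChars(50)
    .withMinChunkLengthToEmbed(5)
    .withMaxNumChunks(50)
    .withKeepSeparator(true)
    .build();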