Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions app/web_ui/src/lib/api_schema.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5084,6 +5084,11 @@ export interface components {
* @description The score of the chunk, which depends on the similarity metric used.
*/
similarity: number | null;
/**
* Page Number
* @description The page number (0-indexed) this chunk belongs to. Only set when the extraction has page_offsets.
*/
page_number?: number | null;
};
/** SearchToolApiDescription */
SearchToolApiDescription: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
chunk_idx: number
chunk_text: string
similarity: number | null
page_number?: number | null
}> = []

onMount(async () => {
Expand Down Expand Up @@ -366,7 +367,8 @@
>
Document: {result.document_id}
</a>
(Chunk #{result.chunk_idx})
(Chunk #{result.chunk_idx} | Page: {result.page_number ??
"N/A"})
</div>
<div>
Score: {result.similarity !== null
Expand Down
54 changes: 48 additions & 6 deletions libs/core/kiln_ai/adapters/chunkers/base_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

class TextChunk(BaseModel):
    """A single chunk of text produced by a chunker, with optional page provenance."""

    text: str = Field(description="The text of the chunk.")
    # Stays None unless the caller passed page_offsets to BaseChunker.chunk()
    # and the chunk text could be located in the input.
    page_number: int | None = Field(
        default=None,
        description="The page number (0-indexed) this chunk belongs to. Only set when page_offsets are provided.",
    )


class ChunkingResult(BaseModel):
Expand All @@ -27,15 +31,53 @@ class BaseChunker(ABC):
def __init__(self, chunker_config: ChunkerConfig):
self.chunker_config = chunker_config

async def chunk(
    self, text: str, page_offsets: list[int] | None = None
) -> ChunkingResult:
    """Chunk text and, when page_offsets are given, tag each chunk with its page.

    Args:
        text: The raw text to chunk.
        page_offsets: Optional character offsets marking the start of each page;
            the value at index i is the offset where page i (0-indexed) begins.

    Returns:
        A ChunkingResult. Each chunk's page_number is set when page_offsets
        were provided and the chunk text could be located in the input;
        otherwise it stays None.
    """
    # Bail out early on empty or whitespace-only input; clean_up_text is only
    # used here to detect "effectively empty" — the raw text is what gets
    # chunked and searched below.
    if not text or not clean_up_text(text):
        return ChunkingResult(chunks=[])

    chunking_result = await self._chunk(text)

    if page_offsets:
        # Move the search start forward after each match so that repeated
        # chunk texts resolve to successive occurrences instead of all
        # mapping to the first one.
        search_start = 0
        for chunk in chunking_result.chunks:
            chunk_start_offset = text.find(chunk.text, search_start)
            if chunk_start_offset == -1:
                # Lazy %-style args keep the hot path cheap when the logger
                # is disabled; include a prefix of the chunk to aid debugging.
                logger.warning(
                    "Chunk text not found in text starting from offset %d. "
                    "This may indicate an issue with the chunker implementation. "
                    "Chunk text prefix: %r",
                    search_start,
                    chunk.text[:50],
                )
                chunk.page_number = None
            else:
                chunk.page_number = self._find_page_number(
                    chunk_start_offset, page_offsets
                )
                search_start = chunk_start_offset + 1

    return chunking_result

def _find_page_number(
    self, chunk_offset: int, page_offsets: list[int]
) -> int | None:
    """Find the page number for a chunk at the given offset.

    Args:
        chunk_offset: Character offset of the chunk start within the text.
        page_offsets: Ascending character offsets at which each page begins;
            page_offsets[i] is where page i (0-indexed) starts.

    Returns:
        The 0-indexed page containing the offset. Offsets before the first
        recorded page start clamp to page 0; offsets past the last start map
        to the last page. Returns None when page_offsets is empty.
    """
    from bisect import bisect_right  # stdlib; O(log n) vs. a linear scan

    if not page_offsets:
        return None

    # bisect_right yields the insertion point; the containing page is the one
    # starting at or before the offset. max() clamps pre-first-page offsets
    # to page 0, matching the previous linear-scan behavior.
    return max(0, bisect_right(page_offsets, chunk_offset) - 1)

@abstractmethod
async def _chunk(self, text: str) -> ChunkingResult:
Expand Down
123 changes: 115 additions & 8 deletions libs/core/kiln_ai/adapters/chunkers/test_base_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ async def test_base_chunker_concrete_chunker(chunker: WhitespaceChunker):
assert len(output.chunks) == 2


async def test_base_chunker_calls_clean_up_text(chunker: WhitespaceChunker):
async def test_base_chunker_checks_clean_up_text_for_empty(chunker: WhitespaceChunker):
"""Test that clean_up_text is still called to check if text becomes empty."""
with patch(
"kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
) as mock_clean_up_text:
Expand All @@ -58,10 +59,116 @@ async def test_base_chunker_empty_text(chunker: WhitespaceChunker):


async def test_base_chunker_empty_text_after_clean_up(chunker: WhitespaceChunker):
with patch(
"kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
) as mock_clean_up_text:
mock_clean_up_text.side_effect = clean_up_text
chunks = await chunker.chunk("\n\n ")
mock_clean_up_text.assert_called_once_with("\n\n ")
assert chunks == ChunkingResult(chunks=[])
chunks = await chunker.chunk("\n\n ")
assert chunks == ChunkingResult(chunks=[])


async def test_base_chunker_page_number_without_offsets(chunker: WhitespaceChunker):
    """Without page_offsets, no chunk should carry a page number."""
    result = await chunker.chunk("Hello world test")
    assert len(result.chunks) == 3
    assert all(c.page_number is None for c in result.chunks)


async def test_base_chunker_page_number_with_offsets(chunker: WhitespaceChunker):
    """With page_offsets, each chunk is tagged with the page it starts on."""
    text = "Page0 content here Page1 content here Page2 content here"
    result = await chunker.chunk(text, page_offsets=[0, 20, 40])

    assert len(result.chunks) == 9

    # The first three whitespace-delimited chunks all start before offset 20,
    # so they belong to page 0.
    expected_first_page = [("Page0", 0), ("content", 0), ("here", 0)]
    for chunk, (expected_text, expected_page) in zip(
        result.chunks, expected_first_page
    ):
        assert chunk.text == expected_text
        assert chunk.page_number == expected_page


async def test_base_chunker_page_number_multiple_pages(chunker: WhitespaceChunker):
    """Test page number assignment across multiple pages."""
    # NOTE(review): each repeated sentence is 17 characters, so the three
    # "pages" are 170 chars each, while the offsets below use 160/320 —
    # boundaries are approximate; confirm that mismatch is intentional.
    text = (
        "Start of page 0. " * 10 + "Start of page 1. " * 10 + "Start of page 2. " * 10
    )
    page_offsets = [0, 160, 320]

    output = await chunker.chunk(text, page_offsets=page_offsets)

    assert len(output.chunks) > 0

    # Re-derive each chunk's offset the same way the chunker does: find the
    # next occurrence of its text at or after the previous match.
    search_start = 0
    for chunk in output.chunks:
        chunk_offset = text.find(chunk.text, search_start)
        if chunk_offset == -1:
            # Chunk text not found, skip validation
            continue
        search_start = chunk_offset + 1

        if chunk_offset < 160:
            assert chunk.page_number == 0, (
                f"Chunk '{chunk.text}' at offset {chunk_offset} should be page 0"
            )
        elif chunk_offset < 320:
            assert chunk.page_number == 1, (
                f"Chunk '{chunk.text}' at offset {chunk_offset} should be page 1"
            )
        else:
            assert chunk.page_number == 2, (
                f"Chunk '{chunk.text}' at offset {chunk_offset} should be page 2"
            )


async def test_base_chunker_page_number_edge_cases(chunker: WhitespaceChunker):
    """Edge cases: empty offsets, a single page, and an exact page boundary."""
    simple_text = "Test content"

    # An empty offsets list means no page information at all.
    result = await chunker.chunk(simple_text, page_offsets=[])
    assert all(c.page_number is None for c in result.chunks)

    # With a single page, everything lands on page 0.
    result = await chunker.chunk(simple_text, page_offsets=[0])
    assert all(c.page_number == 0 for c in result.chunks)

    # A chunk starting exactly at a page offset belongs to that page.
    boundary_text = "Page0" + " " * 15 + "Page1"
    result = await chunker.chunk(boundary_text, page_offsets=[0, 20])
    for c in result.chunks:
        start = boundary_text.find(c.text)
        expected = 0 if start < 20 else 1
        assert c.page_number == expected


async def test_find_page_number_method(chunker: WhitespaceChunker):
    """Exercise the _find_page_number helper directly, table-driven."""
    offsets = [0, 100, 200]

    cases = [
        (0, 0), (50, 0), (99, 0),
        (100, 1), (150, 1), (199, 1),
        (200, 2), (250, 2),
        # Before the first page clamps to 0; past the end maps to the last page.
        (-10, 0), (1000, 2),
    ]
    for offset, expected in cases:
        assert chunker._find_page_number(offset, offsets) == expected

    # No offsets means no page information.
    assert chunker._find_page_number(50, []) is None

    # A single page absorbs every offset.
    assert chunker._find_page_number(0, [0]) == 0
    assert chunker._find_page_number(100, [0]) == 0
84 changes: 84 additions & 0 deletions libs/core/kiln_ai/adapters/chunkers/test_fixed_window_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,90 @@ async def test_fixed_window_chunker_removes_consecutive_whitespace(
assert len(output.chunks) > 1


async def test_fixed_window_chunker_page_numbers_single_page(
    mock_fixed_window_chunker_factory,
):
    """A document with one page offset maps every chunk to page 0."""
    chunker = mock_fixed_window_chunker_factory(100, 10)
    result = await chunker.chunk(
        "This is a test sentence. This is another test sentence.",
        page_offsets=[0],
    )

    assert result.chunks
    assert all(c.page_number == 0 for c in result.chunks)


async def test_fixed_window_chunker_page_numbers_multiple_pages(
    mock_fixed_window_chunker_factory,
):
    """Test page number assignment across multiple pages."""
    chunker = mock_fixed_window_chunker_factory(50, 5)
    # Create text that spans multiple pages
    page0_text = "Page 0 content. " * 10
    page1_text = "Page 1 content. " * 10
    page2_text = "Page 2 content. " * 10
    text = page0_text + page1_text + page2_text

    # Page i starts where pages 0..i-1 end, so offsets are cumulative lengths.
    page_offsets = [0, len(page0_text), len(page0_text) + len(page1_text)]

    output = await chunker.chunk(text, page_offsets=page_offsets)

    assert len(output.chunks) > 0

    # NOTE(review): text.find without a moving start offset returns the FIRST
    # occurrence of the chunk text; with heavily repeated sentences this can
    # under-report the offset of later chunks — confirm this cannot cause a
    # spurious pass/fail here.
    for chunk in output.chunks:
        chunk_offset = text.find(chunk.text)
        if chunk_offset < page_offsets[1]:
            assert chunk.page_number == 0, (
                f"Chunk at offset {chunk_offset} should be page 0"
            )
        elif chunk_offset < page_offsets[2]:
            assert chunk.page_number == 1, (
                f"Chunk at offset {chunk_offset} should be page 1"
            )
        else:
            assert chunk.page_number == 2, (
                f"Chunk at offset {chunk_offset} should be page 2"
            )


async def test_fixed_window_chunker_page_numbers_without_offsets(
    mock_fixed_window_chunker_factory,
):
    """Omitting page_offsets leaves page_number unset on every chunk."""
    chunker = mock_fixed_window_chunker_factory(100, 10)
    result = await chunker.chunk(
        "This is a test sentence. This is another test sentence."
    )

    assert result.chunks
    assert all(c.page_number is None for c in result.chunks)


async def test_fixed_window_chunker_page_numbers_at_boundaries(
    mock_fixed_window_chunker_factory,
):
    """Chunks that start exactly on a page boundary get that page's number."""
    chunker = mock_fixed_window_chunker_factory(20, 0)
    text = "A" * 20 + "B" * 20 + "C" * 20
    result = await chunker.chunk(text, page_offsets=[0, 20, 40])

    assert result.chunks
    for c in result.chunks:
        start = text.find(c.text)
        if start < 20:
            expected = 0
        elif start < 40:
            expected = 1
        else:
            expected = 2
        assert c.page_number == expected


@pytest.mark.parametrize(
"whitespace_length",
[100_000, 1_000_000, 5_000_000, 10_000_000],
Expand Down
38 changes: 38 additions & 0 deletions libs/core/kiln_ai/adapters/chunkers/test_semantic_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ def create_chunker(config: ChunkerConfig) -> SemanticChunker:

# Create mock semantic splitter
mock_splitter = MagicMock()
mock_node1 = MagicMock()
mock_node1.get_content.return_value = "First semantic chunk."
mock_node2 = MagicMock()
mock_node2.get_content.return_value = "Second semantic chunk."
mock_splitter.abuild_semantic_nodes_from_documents = AsyncMock(
return_value=[mock_node1, mock_node2]
)
mock_splitter_class.return_value = mock_splitter

return SemanticChunker(config)
Expand Down Expand Up @@ -292,6 +299,8 @@ async def test_semantic_chunker_real_integration(tmp_path):
# Basic sanity assertions
assert len(result.chunks) >= 1
assert all(isinstance(c.text, str) and len(c.text) > 0 for c in result.chunks)
# Page numbers should be None when not provided
assert all(c.page_number is None for c in result.chunks)

assert (
"The history of spices is, in many ways, the history of globalization itself."
Expand All @@ -300,3 +309,32 @@ async def test_semantic_chunker_real_integration(tmp_path):

# flaky assertion, may fail sometimes
assert len(result.chunks) >= 3


async def test_semantic_chunker_page_numbers_without_offsets(
    semantic_chunker_factory, semantic_chunker_config
):
    """Without page_offsets the semantic chunker leaves page_number unset."""
    chunker = semantic_chunker_factory(semantic_chunker_config)
    result = await chunker.chunk("Test content here.")

    assert result.chunks
    assert all(c.page_number is None for c in result.chunks)


async def test_semantic_chunker_page_numbers_with_offsets(
    semantic_chunker_factory, semantic_chunker_config
):
    """Every chunk receives a valid page number when page_offsets are given."""
    chunker = semantic_chunker_factory(semantic_chunker_config)
    result = await chunker.chunk(
        "First semantic chunk. Second semantic chunk.",
        page_offsets=[0, 25],
    )

    assert result.chunks
    # Membership in (0, 1) also rules out None.
    for c in result.chunks:
        assert c.page_number in (0, 1)
4 changes: 4 additions & 0 deletions libs/core/kiln_ai/adapters/extractors/base_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ class ExtractionOutput(BaseModel):
description="The format of the extracted data."
)
content: str = Field(description="The extracted data.")
page_offsets: list[int] | None = Field(
default=None,
description="Character offsets marking the start of each page. The int at index i is the char offset marking the start of page i (0-indexed).",
)


class BaseExtractor(ABC):
Expand Down
Loading
Loading