1 change: 1 addition & 0 deletions .gitignore
@@ -44,3 +44,4 @@ benchmark_index/

# Test coverage
.coverage
benchmark_data
3 changes: 3 additions & 0 deletions Makefile
@@ -4,3 +4,6 @@ test:
# run linting, typechecking, and tests
check:
pre-commit run --all-files

typecheck:
pre-commit run mypy --all-files
2 changes: 1 addition & 1 deletion examples/basic/search.py
@@ -26,7 +26,7 @@ def main():
indexer.index_directory(docs_dir, glob_pattern="**/*.md")

# Search
documents, distances = indexer.search(query, n_results=3)
documents, distances, _ = indexer.search(query, n_results=3)

# Display results
console.print(f"\nResults for: [cyan]{query}[/cyan]\n")
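Note: this change, and the matching ones in the other examples and the benchmark below, all reflect the same API update — Indexer.search now returns three values (documents, distances, and optional scoring explanations) instead of two. A minimal sketch of an updated call site, assuming only what this diff shows about the Indexer API:

from pathlib import Path

from gptme_rag.indexing.indexer import Indexer

# Build a small index and query it; the third return value (explanations) is
# ignored here, matching the "documents, distances, _ = ..." pattern above.
indexer = Indexer(persist_directory=Path("index"), enable_persist=True)
indexer.index_directory(Path("docs"), glob_pattern="**/*.md")

documents, distances, _ = indexer.search("how do I configure logging?", n_results=3)
for doc, distance in zip(documents, distances):
    # 1 - distance is the relevance heuristic the CLI uses further down.
    print(doc.metadata.get("source", "unknown"), f"relevance: {1 - distance:.2f}")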
2 changes: 1 addition & 1 deletion examples/code-search/search_docs.py
@@ -80,7 +80,7 @@ def main():
continue

# Search with chunk grouping
documents, distances = indexer.search(
documents, distances, _ = indexer.search(
query,
n_results=5,
group_chunks=True,
2 changes: 1 addition & 1 deletion examples/knowledge-base/search_kb.py
@@ -84,7 +84,7 @@ def main(query: str | None, index_dir: Path, interactive: bool, show_content: bo
def do_search(search_query: str):
"""Perform search and display results."""
# Search with chunk grouping
documents, distances = indexer.search(
documents, distances, _ = indexer.search(
search_query,
n_results=5,
group_chunks=True,
2 changes: 1 addition & 1 deletion gptme_rag/benchmark.py
@@ -169,7 +169,7 @@ def search_operation():
)
total_results = 0
for query in queries:
results, _ = indexer.search(query, n_results=n_results)
results, _, _ = indexer.search(query, n_results=n_results)
total_results += len(results)
return {
"items_processed": len(queries),
133 changes: 114 additions & 19 deletions gptme_rag/cli.py
@@ -1,3 +1,4 @@
import json
import logging
import os
import signal
@@ -9,13 +10,15 @@
from rich.console import Console
from rich.logging import RichHandler
from rich.syntax import Syntax
from tqdm import tqdm

from .benchmark import RagBenchmark
from .indexing.indexer import Indexer
from .indexing.watcher import FileWatcher
from .query.context_assembler import ContextAssembler

console = Console()
logger = logging.getLogger(__name__)

# TODO: change this to a more appropriate location
default_persist_dir = Path.home() / ".cache" / "gptme" / "rag"
@@ -35,9 +38,7 @@ def cli(verbose: bool):


@cli.command()
@click.argument(
"directory", type=click.Path(exists=True, file_okay=False, path_type=Path)
)
@click.argument("paths", nargs=-1, type=click.Path(exists=True, path_type=Path))
@click.option(
"--pattern", "-p", default="**/*.*", help="Glob pattern for files to index"
)
@@ -47,18 +48,53 @@ def cli(verbose: bool):
default=default_persist_dir,
help="Directory to persist the index",
)
def index(directory: Path, pattern: str, persist_dir: Path):
"""Index documents in a directory."""
def index(paths: list[Path], pattern: str, persist_dir: Path):
"""Index documents in one or more directories."""
if not paths:
console.print("❌ No paths provided", style="red")
return

try:
indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
console.print(f"Indexing files in {directory} with pattern {pattern}")

# Index the files
n_indexed = indexer.index_directory(directory, pattern)
# First, collect all documents
all_documents = []
with console.status("Collecting documents...") as status:
for path in paths:
if path.is_file():
status.update(f"Processing file: {path}")
else:
status.update(f"Processing directory: {path}")
documents = indexer.collect_documents(path)
all_documents.extend(documents)

if not all_documents:
console.print("No documents found to index", style="yellow")
return

# Then process them with a progress bar
n_files = len(set(doc.metadata.get("source", "") for doc in all_documents))
n_chunks = len(all_documents)

logger.info(f"Found {n_files} files to index ({n_chunks} chunks)")

with tqdm(
total=n_chunks,
desc="Indexing documents",
unit="chunk",
disable=not sys.stdout.isatty(),
) as pbar:
for progress in indexer.add_documents_progress(all_documents):
pbar.update(progress)

console.print(f"✅ Successfully indexed {n_indexed} files", style="green")
console.print(
f"✅ Successfully indexed {n_files} files ({n_chunks} chunks)",
style="green",
)
except Exception as e:
console.print(f"❌ Error indexing directory: {e}", style="red")
if logger.isEnabledFor(logging.DEBUG):
console.print_exception()


@cli.command()
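Note: the reworked index command above now accepts multiple paths and splits indexing into a collection phase and an insertion phase with a tqdm progress bar. A hedged sketch of driving the same flow from Python — collect_documents and add_documents_progress are taken from the diff, but their exact signatures (what collect_documents accepts, and that add_documents_progress yields the number of chunks just added) are assumptions based on how the CLI uses them:

from pathlib import Path

from tqdm import tqdm

from gptme_rag.indexing.indexer import Indexer

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",
    enable_persist=True,
)

# Phase 1: collect chunked documents from files and/or directories.
all_documents = []
for path in [Path("docs"), Path("README.md")]:
    all_documents.extend(indexer.collect_documents(path))

# Phase 2: add the chunks to the index, updating the bar as batches complete.
# Assumes add_documents_progress yields a count of chunks processed per step.
with tqdm(total=len(all_documents), desc="Indexing documents", unit="chunk") as pbar:
    for progress in indexer.add_documents_progress(all_documents):
        pbar.update(progress)

print(f"Indexed {len(all_documents)} chunks")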
@@ -74,6 +110,12 @@ def index(directory: Path, pattern: str, persist_dir: Path):
@click.option("--max-tokens", default=4000, help="Maximum tokens in context window")
@click.option("--show-context", is_flag=True, help="Show the full context content")
@click.option("--raw", is_flag=True, help="Skip syntax highlighting")
@click.option("--explain", is_flag=True, help="Show scoring explanations")
@click.option(
"--weights",
type=click.STRING,
help="Custom scoring weights as JSON string, e.g. '{\"recency_boost\": 0.3}'",
)
def search(
query: str,
paths: list[Path],
@@ -82,21 +124,44 @@ def search(
max_tokens: int,
show_context: bool,
raw: bool,
explain: bool,
weights: str | None,
):
"""Search the index and assemble context."""
paths = [path.resolve() for path in paths]

# Hide ChromaDB output during initialization and search
with console.status("Initializing..."):
# Parse custom weights if provided
scoring_weights = None
if weights:
try:
scoring_weights = json.loads(weights)
except json.JSONDecodeError as e:
console.print(f"❌ Invalid weights JSON: {e}", style="red")
return
except Exception as e:
console.print(f"❌ Error parsing weights: {e}", style="red")
return

# Temporarily redirect stdout to suppress ChromaDB output
stdout = sys.stdout
sys.stdout = open(os.devnull, "w")
try:
indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
assembler = ContextAssembler(max_tokens=max_tokens)
documents, distances = indexer.search(
query, n_results=n_results, paths=paths
indexer = Indexer(
persist_directory=persist_dir,
enable_persist=True,
scoring_weights=scoring_weights,
)
assembler = ContextAssembler(max_tokens=max_tokens)
if explain:
documents, distances, explanations = indexer.search(
query, n_results=n_results, paths=paths, explain=True
)
else:
documents, distances, _ = indexer.search(
query, n_results=n_results, paths=paths
)
finally:
sys.stdout.close()
sys.stdout = stdout
@@ -128,20 +193,50 @@ def search(
for i, doc in enumerate(documents):
source = doc.metadata.get("source", "unknown")
distance = distances[i]
relevance = 1 - distance # Convert distance to similarity score

# Show document header with relevance score
console.print(
f"\n[cyan]{i+1}. {source}[/cyan] [yellow](relevance: {relevance:.2f})[/yellow]"
)
# Show document header
console.print(f"\n[cyan]{i+1}. {source}[/cyan]")

# Show scoring explanation if requested
if explain and explanations: # Make sure explanations is not None
explanation = explanations[i]
console.print("\n[bold]Scoring Breakdown:[/bold]")

# Show individual score components
scores = explanation.get("scores", {})
for factor, score in scores.items():
# Color code the scores
if score > 0:
score_color = "green"
sign = "+"
elif score < 0:
score_color = "red"
sign = ""
else:
score_color = "yellow"
sign = " "

# Print score and explanation
console.print(
f" {factor:15} [{score_color}]{sign}{score:>6.3f}[/{score_color}] | {explanation['explanations'][factor]}"
)

# Show total score
total = explanation["total_score"]
console.print(f"\n {'Total':15} [bold blue]{total:>7.3f}[/bold blue]")
else:
# Just show the base relevance score
relevance = 1 - distance
console.print(f"[yellow](relevance: {relevance:.2f})[/yellow]")

# Use file extension as lexer (strip the dot)
lexer = doc.metadata.get("extension", "").lstrip(".") or "text"

# Extract preview content (first ~200 chars)
preview = doc.content[:200] + ("..." if len(doc.content) > 200 else "")

# Display with syntax highlighting
# Display preview with syntax highlighting
console.print("\n[bold]Preview:[/bold]")
syntax = Syntax(
preview,
lexer,
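Note: taken together, the search-command changes above add an --explain flag and a --weights option (a JSON string of custom scoring weights, e.g. the recency_boost example from the option's own help text) and thread both into Indexer. A minimal sketch of the equivalent library calls, assuming the explanation dictionary layout the CLI renders (scores, explanations, total_score):

import json
from pathlib import Path

from gptme_rag.indexing.indexer import Indexer

# Weights arrive on the CLI as a JSON string; parse them the same way.
scoring_weights = json.loads('{"recency_boost": 0.3}')

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",
    enable_persist=True,
    scoring_weights=scoring_weights,
)

documents, distances, explanations = indexer.search(
    "error handling", n_results=5, paths=[Path("src").resolve()], explain=True
)

# Mirror the CLI's scoring breakdown: per-factor scores plus a total.
for doc, explanation in zip(documents, explanations):
    print(doc.metadata.get("source", "unknown"), explanation["total_score"])
    for factor, score in explanation.get("scores", {}).items():
        print(f"  {factor:15} {score:+.3f} | {explanation['explanations'][factor]}")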
39 changes: 21 additions & 18 deletions gptme_rag/indexing/document_processor.py
@@ -72,42 +72,45 @@ def process_text(
}
return

# Process in chunks
# Process text in chunks based on tokens
chunk_start = 0
chunk_count = 0

while chunk_start < len(tokens):
# Calculate chunk end
chunk_end = min(chunk_start + self.chunk_size, len(tokens))

# Decode chunk
# Get chunk tokens and decode
chunk_tokens = tokens[chunk_start:chunk_end]
chunk_text = self.encoding.decode(chunk_tokens)

# Create chunk metadata
chunk_metadata = {
**(metadata or {}),
"chunk_index": chunk_count,
"token_count": len(chunk_tokens),
"total_chunks": total_chunks,
"chunk_start": chunk_start,
"chunk_end": chunk_end,
}

yield {
"text": chunk_text,
"metadata": chunk_metadata,
"metadata": {
**(metadata or {}),
"chunk_index": chunk_count,
"token_count": len(chunk_tokens),
"total_chunks": total_chunks,
"chunk_start": chunk_start,
"chunk_end": chunk_end,
"is_chunk": True,
},
}

# Move to next chunk
chunk_start = chunk_end - self.chunk_overlap
# Calculate next chunk start
if chunk_end == len(tokens):
# If we've reached the end, we're done
break

# Move forward by at least one token, considering overlap
next_start = chunk_start + max(1, self.chunk_size - self.chunk_overlap)
chunk_start = min(next_start, len(tokens) - 1)
chunk_count += 1

# Check stopping conditions
# Check max chunks limit
if self.max_chunks and chunk_count >= self.max_chunks:
return
if len(tokens) - chunk_start <= self.chunk_overlap:
return
break

except Exception as e:
logger.error(f"Error processing text: {e}")
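Note: the rewritten chunking loop above advances each window by chunk_size - chunk_overlap tokens (at least one) from the start of the previous chunk and stops once a chunk reaches the end of the token stream, instead of stepping backwards from the chunk end — guaranteeing forward progress even when the overlap is close to the chunk size. A small self-contained sketch of the window arithmetic, with toy values chosen only for illustration:

# Illustrative only: how chunk windows advance over 10 tokens with
# chunk_size=4 and chunk_overlap=2 (prints [0, 4), [2, 6), [4, 8), [6, 10)).
n_tokens, chunk_size, chunk_overlap = 10, 4, 2

chunk_start = 0
while chunk_start < n_tokens:
    chunk_end = min(chunk_start + chunk_size, n_tokens)
    print(f"chunk covers tokens [{chunk_start}, {chunk_end})")
    if chunk_end == n_tokens:
        break
    # Move forward by at least one token, keeping chunk_overlap tokens of overlap.
    chunk_start = min(chunk_start + max(1, chunk_size - chunk_overlap), n_tokens - 1)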