1 change: 1 addition & 0 deletions .gitignore
@@ -44,3 +44,4 @@ benchmark_index/

# Test coverage
.coverage
benchmark_data
3 changes: 3 additions & 0 deletions Makefile
@@ -4,3 +4,6 @@ test:
# run linting, typechecking, and tests
check:
pre-commit run --all-files

typecheck:
pre-commit run mypy --all-files
2 changes: 1 addition & 1 deletion examples/basic/search.py
@@ -26,7 +26,7 @@ def main():
indexer.index_directory(docs_dir, glob_pattern="**/*.md")

# Search
documents, distances = indexer.search(query, n_results=3)
documents, distances, _ = indexer.search(query, n_results=3)

# Display results
console.print(f"\nResults for: [cyan]{query}[/cyan]\n")
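Note: this change, and the matching ones in the other examples and the benchmark below, all reflect the same API update — Indexer.search now returns three values (documents, distances, and optional scoring explanations) instead of two. A minimal sketch of an updated call site, assuming only what this diff shows about the Indexer API:

from pathlib import Path

from gptme_rag.indexing.indexer import Indexer

# Build a small index and query it; the third return value (explanations) is
# ignored here, matching the "documents, distances, _ = ..." pattern above.
indexer = Indexer(persist_directory=Path("index"), enable_persist=True)
indexer.index_directory(Path("docs"), glob_pattern="**/*.md")

documents, distances, _ = indexer.search("how do I configure logging?", n_results=3)
for doc, distance in zip(documents, distances):
    # 1 - distance is the relevance heuristic the CLI uses further down.
    print(doc.metadata.get("source", "unknown"), f"relevance: {1 - distance:.2f}")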
2 changes: 1 addition & 1 deletion examples/code-search/search_docs.py
@@ -80,7 +80,7 @@ def main():
continue

# Search with chunk grouping
documents, distances = indexer.search(
documents, distances, _ = indexer.search(
query,
n_results=5,
group_chunks=True,
2 changes: 1 addition & 1 deletion examples/knowledge-base/search_kb.py
@@ -84,7 +84,7 @@ def main(query: str | None, index_dir: Path, interactive: bool, show_content: bo
def do_search(search_query: str):
"""Perform search and display results."""
# Search with chunk grouping
documents, distances = indexer.search(
documents, distances, _ = indexer.search(
search_query,
n_results=5,
group_chunks=True,
2 changes: 1 addition & 1 deletion gptme_rag/benchmark.py
@@ -169,7 +169,7 @@ def search_operation():
)
total_results = 0
for query in queries:
results, _ = indexer.search(query, n_results=n_results)
results, _, _ = indexer.search(query, n_results=n_results)
total_results += len(results)
return {
"items_processed": len(queries),
133 changes: 114 additions & 19 deletions gptme_rag/cli.py
@@ -1,3 +1,4 @@
import json
import logging
import os
import signal
@@ -9,13 +10,15 @@
from rich.console import Console
from rich.logging import RichHandler
from rich.syntax import Syntax
from tqdm import tqdm

from .benchmark import RagBenchmark
from .indexing.indexer import Indexer
from .indexing.watcher import FileWatcher
from .query.context_assembler import ContextAssembler

console = Console()
logger = logging.getLogger(__name__)

# TODO: change this to a more appropriate location
default_persist_dir = Path.home() / ".cache" / "gptme" / "rag"
@@ -35,9 +38,7 @@ def cli(verbose: bool):


@cli.command()
@click.argument(
"directory", type=click.Path(exists=True, file_okay=False, path_type=Path)
)
@click.argument("paths", nargs=-1, type=click.Path(exists=True, path_type=Path))
@click.option(
"--pattern", "-p", default="**/*.*", help="Glob pattern for files to index"
)
@@ -47,18 +48,53 @@ def cli(verbose: bool):
default=default_persist_dir,
help="Directory to persist the index",
)
def index(directory: Path, pattern: str, persist_dir: Path):
"""Index documents in a directory."""
def index(paths: list[Path], pattern: str, persist_dir: Path):
"""Index documents in one or more directories."""
if not paths:
console.print("❌ No paths provided", style="red")
return

try:
indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
console.print(f"Indexing files in {directory} with pattern {pattern}")

# Index the files
n_indexed = indexer.index_directory(directory, pattern)
# First, collect all documents
all_documents = []
with console.status("Collecting documents...") as status:
for path in paths:
if path.is_file():
status.update(f"Processing file: {path}")
else:
status.update(f"Processing directory: {path}")
documents = indexer.collect_documents(path)
all_documents.extend(documents)

if not all_documents:
console.print("No documents found to index", style="yellow")
return

# Then process them with a progress bar
n_files = len(set(doc.metadata.get("source", "") for doc in all_documents))
n_chunks = len(all_documents)

logger.info(f"Found {n_files} files to index ({n_chunks} chunks)")

with tqdm(
total=n_chunks,
desc="Indexing documents",
unit="chunk",
disable=not sys.stdout.isatty(),
) as pbar:
for progress in indexer.add_documents_progress(all_documents):
pbar.update(progress)

console.print(f"✅ Successfully indexed {n_indexed} files", style="green")
console.print(
f"✅ Successfully indexed {n_files} files ({n_chunks} chunks)",
style="green",
)
except Exception as e:
console.print(f"❌ Error indexing directory: {e}", style="red")
if logger.isEnabledFor(logging.DEBUG):
console.print_exception()


@cli.command()
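Note: the reworked index command above now accepts multiple paths and splits indexing into a collection phase and an insertion phase with a tqdm progress bar. A hedged sketch of driving the same flow from Python — collect_documents and add_documents_progress are taken from the diff, but their exact signatures (what collect_documents accepts, and that add_documents_progress yields the number of chunks just added) are assumptions based on how the CLI uses them:

from pathlib import Path

from tqdm import tqdm

from gptme_rag.indexing.indexer import Indexer

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",
    enable_persist=True,
)

# Phase 1: collect chunked documents from files and/or directories.
all_documents = []
for path in [Path("docs"), Path("README.md")]:
    all_documents.extend(indexer.collect_documents(path))

# Phase 2: add the chunks to the index, updating the bar as batches complete.
# Assumes add_documents_progress yields a count of chunks processed per step.
with tqdm(total=len(all_documents), desc="Indexing documents", unit="chunk") as pbar:
    for progress in indexer.add_documents_progress(all_documents):
        pbar.update(progress)

print(f"Indexed {len(all_documents)} chunks")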
@@ -74,6 +110,12 @@ def index(directory: Path, pattern: str, persist_dir: Path):
@click.option("--max-tokens", default=4000, help="Maximum tokens in context window")
@click.option("--show-context", is_flag=True, help="Show the full context content")
@click.option("--raw", is_flag=True, help="Skip syntax highlighting")
@click.option("--explain", is_flag=True, help="Show scoring explanations")
@click.option(
"--weights",
type=click.STRING,
help="Custom scoring weights as JSON string, e.g. '{\"recency_boost\": 0.3}'",
)
def search(
query: str,
paths: list[Path],
@@ -82,21 +124,44 @@ def search(
max_tokens: int,
show_context: bool,
raw: bool,
explain: bool,
weights: str | None,
):
"""Search the index and assemble context."""
paths = [path.resolve() for path in paths]

# Hide ChromaDB output during initialization and search
with console.status("Initializing..."):
# Parse custom weights if provided
scoring_weights = None
if weights:
try:
scoring_weights = json.loads(weights)
except json.JSONDecodeError as e:
console.print(f"❌ Invalid weights JSON: {e}", style="red")
return
except Exception as e:
console.print(f"❌ Error parsing weights: {e}", style="red")
return

# Temporarily redirect stdout to suppress ChromaDB output
stdout = sys.stdout
sys.stdout = open(os.devnull, "w")
try:
indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
assembler = ContextAssembler(max_tokens=max_tokens)
documents, distances = indexer.search(
query, n_results=n_results, paths=paths
indexer = Indexer(
persist_directory=persist_dir,
enable_persist=True,
scoring_weights=scoring_weights,
)
assembler = ContextAssembler(max_tokens=max_tokens)
if explain:
documents, distances, explanations = indexer.search(
query, n_results=n_results, paths=paths, explain=True
)
else:
documents, distances, _ = indexer.search(
query, n_results=n_results, paths=paths
)
finally:
sys.stdout.close()
sys.stdout = stdout
@@ -128,20 +193,50 @@ def search(
for i, doc in enumerate(documents):
source = doc.metadata.get("source", "unknown")
distance = distances[i]
relevance = 1 - distance # Convert distance to similarity score

# Show document header with relevance score
console.print(
f"\n[cyan]{i+1}. {source}[/cyan] [yellow](relevance: {relevance:.2f})[/yellow]"
)
# Show document header
console.print(f"\n[cyan]{i+1}. {source}[/cyan]")

# Show scoring explanation if requested
if explain and explanations: # Make sure explanations is not None
explanation = explanations[i]
console.print("\n[bold]Scoring Breakdown:[/bold]")

# Show individual score components
scores = explanation.get("scores", {})
for factor, score in scores.items():
# Color code the scores
if score > 0:
score_color = "green"
sign = "+"
elif score < 0:
score_color = "red"
sign = ""
else:
score_color = "yellow"
sign = " "

# Print score and explanation
console.print(
f" {factor:15} [{score_color}]{sign}{score:>6.3f}[/{score_color}] | {explanation['explanations'][factor]}"
)

# Show total score
total = explanation["total_score"]
console.print(f"\n {'Total':15} [bold blue]{total:>7.3f}[/bold blue]")
else:
# Just show the base relevance score
relevance = 1 - distance
console.print(f"[yellow](relevance: {relevance:.2f})[/yellow]")

# Use file extension as lexer (strip the dot)
lexer = doc.metadata.get("extension", "").lstrip(".") or "text"

# Extract preview content (first ~200 chars)
preview = doc.content[:200] + ("..." if len(doc.content) > 200 else "")

# Display with syntax highlighting
# Display preview with syntax highlighting
console.print("\n[bold]Preview:[/bold]")
syntax = Syntax(
preview,
lexer,
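Note: taken together, the search-command changes above add an --explain flag and a --weights option (a JSON string of custom scoring weights, e.g. the recency_boost example from the option's own help text) and thread both into Indexer. A minimal sketch of the equivalent library calls, assuming the explanation dictionary layout the CLI renders (scores, explanations, total_score):

import json
from pathlib import Path

from gptme_rag.indexing.indexer import Indexer

# Weights arrive on the CLI as a JSON string; parse them the same way.
scoring_weights = json.loads('{"recency_boost": 0.3}')

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",
    enable_persist=True,
    scoring_weights=scoring_weights,
)

documents, distances, explanations = indexer.search(
    "error handling", n_results=5, paths=[Path("src").resolve()], explain=True
)

# Mirror the CLI's scoring breakdown: per-factor scores plus a total.
for doc, explanation in zip(documents, explanations):
    print(doc.metadata.get("source", "unknown"), explanation["total_score"])
    for factor, score in explanation.get("scores", {}).items():
        print(f"  {factor:15} {score:+.3f} | {explanation['explanations'][factor]}")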
39 changes: 21 additions & 18 deletions gptme_rag/indexing/document_processor.py
@@ -72,42 +72,45 @@ def process_text(
}
return

# Process in chunks
# Process text in chunks based on tokens
chunk_start = 0
chunk_count = 0

while chunk_start < len(tokens):
# Calculate chunk end
chunk_end = min(chunk_start + self.chunk_size, len(tokens))

# Decode chunk
# Get chunk tokens and decode
chunk_tokens = tokens[chunk_start:chunk_end]
chunk_text = self.encoding.decode(chunk_tokens)

# Create chunk metadata
chunk_metadata = {
**(metadata or {}),
"chunk_index": chunk_count,
"token_count": len(chunk_tokens),
"total_chunks": total_chunks,
"chunk_start": chunk_start,
"chunk_end": chunk_end,
}

yield {
"text": chunk_text,
"metadata": chunk_metadata,
"metadata": {
**(metadata or {}),
"chunk_index": chunk_count,
"token_count": len(chunk_tokens),
"total_chunks": total_chunks,
"chunk_start": chunk_start,
"chunk_end": chunk_end,
"is_chunk": True,
},
}

# Move to next chunk
chunk_start = chunk_end - self.chunk_overlap
# Calculate next chunk start
if chunk_end == len(tokens):
# If we've reached the end, we're done
break

# Move forward by at least one token, considering overlap
next_start = chunk_start + max(1, self.chunk_size - self.chunk_overlap)
chunk_start = min(next_start, len(tokens) - 1)
chunk_count += 1

# Check stopping conditions
# Check max chunks limit
if self.max_chunks and chunk_count >= self.max_chunks:
return
if len(tokens) - chunk_start <= self.chunk_overlap:
return
break

except Exception as e:
logger.error(f"Error processing text: {e}")
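Note: the rewritten chunking loop above advances each window by chunk_size - chunk_overlap tokens (at least one) from the start of the previous chunk and stops once a chunk reaches the end of the token stream, instead of stepping backwards from the chunk end — guaranteeing forward progress even when the overlap is close to the chunk size. A small self-contained sketch of the window arithmetic, with toy values chosen only for illustration:

# Illustrative only: how chunk windows advance over 10 tokens with
# chunk_size=4 and chunk_overlap=2 (prints [0, 4), [2, 6), [4, 8), [6, 10)).
n_tokens, chunk_size, chunk_overlap = 10, 4, 2

chunk_start = 0
while chunk_start < n_tokens:
    chunk_end = min(chunk_start + chunk_size, n_tokens)
    print(f"chunk covers tokens [{chunk_start}, {chunk_end})")
    if chunk_end == n_tokens:
        break
    # Move forward by at least one token, keeping chunk_overlap tokens of overlap.
    chunk_start = min(chunk_start + max(1, chunk_size - chunk_overlap), n_tokens - 1)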