From 76f465f261735b7a4b566d6ecd0168d8b2dcfa04 Mon Sep 17 00:00:00 2001 From: Prashant Jaiswal Date: Fri, 19 Dec 2025 02:23:01 -0800 Subject: [PATCH] Update Qdrant client so that it works --- packages/providers/qdrant/vectordb/README.md | 24 ++++++++- .../nlweb_qdrant_vectordb/__init__.py | 4 +- .../nlweb_qdrant_vectordb/qdrant_client.py | 54 ++++++++++--------- 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/packages/providers/qdrant/vectordb/README.md b/packages/providers/qdrant/vectordb/README.md index 64de675..ee455c2 100644 --- a/packages/providers/qdrant/vectordb/README.md +++ b/packages/providers/qdrant/vectordb/README.md @@ -30,7 +30,7 @@ Create `config.yaml`: retrieval: provider: qdrant import_path: nlweb_qdrant_vectordb.qdrant_client - class_name: QdrantClient + class_name: QdrantVectorClient api_endpoint_env: QDRANT_URL # Optional for remote Qdrant api_key_env: QDRANT_API_KEY # Optional for remote Qdrant database_path_env: QDRANT_PATH # Optional for local Qdrant @@ -70,14 +70,34 @@ results = await retriever.search( ## Features -- Vector similarity search with Qdrant +- Vector similarity search with Qdrant using 1536-dimensional embeddings - Support for both remote and local Qdrant instances - HNSW-based efficient similarity search - Configurable collection names - API key authentication for remote instances - Local file-based storage option +- Automatic query embedding using NLWeb's embedding providers - Compatible with NLWeb Protocol v0.5 +## Data Format + +The Qdrant provider expects documents with the following payload structure: +- `url`: Document URL (string) +- `content`: Full document content as JSON string (string) +- `type`: Document type (string) +- `site`: Site identifier for filtering (string) +- `embedding`: 1536-dimensional vector (stored separately in Qdrant) + +Example payload: +```json +{ + "url": "https://example.com/page", + "content": "{\"@type\": \"Article\", \"name\": \"Title\", \"description\": \"...\"}", + "type": "Article", + "site": "example.com" +} +``` + ## Creating Your Own Provider Package Use this package as a template: diff --git a/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/__init__.py b/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/__init__.py index 59e4485..60662ea 100644 --- a/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/__init__.py +++ b/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/__init__.py @@ -5,7 +5,7 @@ NLWeb Qdrant Vector Database Provider """ -from nlweb_qdrant_vectordb.qdrant_client import QdrantClient +from nlweb_qdrant_vectordb.qdrant_client import QdrantVectorClient from nlweb_qdrant_vectordb.qdrant_writer import QdrantWriter -__all__ = ["QdrantClient", "QdrantWriter"] +__all__ = ["QdrantVectorClient", "QdrantWriter"] diff --git a/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/qdrant_client.py b/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/qdrant_client.py index 661d7c8..66a9907 100644 --- a/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/qdrant_client.py +++ b/packages/providers/qdrant/vectordb/nlweb_qdrant_vectordb/qdrant_client.py @@ -19,7 +19,7 @@ from nlweb_core.retriever import VectorDBClientInterface -class QdrantClient(VectorDBClientInterface): +class QdrantVectorClient(VectorDBClientInterface): """ Client for Qdrant vector database operations, providing a unified interface for indexing, storing, and retrieving vector-based search results. @@ -131,11 +131,11 @@ async def _get_qdrant_client(self) -> AsyncQdrantClient: try: params = self._create_client_params() - # Create client with the determined parameters + # Create async client client = AsyncQdrantClient(**params) # Test connection by getting collections - collections = await client.get_collections() + await client.get_collections() # Store in cache with lock with self._client_lock: @@ -258,7 +258,7 @@ def _create_site_filter(self, site: Union[str, List[str]]): def _format_results(self, search_result: List[models.ScoredPoint]) -> List[List[str]]: """ - Format Qdrant search results to match expected API: [url, text_json, name, site]. + Format Qdrant search results to match expected API: [url, content_json, type, site]. Args: search_result: Qdrant search results @@ -270,11 +270,12 @@ def _format_results(self, search_result: List[models.ScoredPoint]) -> List[List[ for item in search_result: payload = item.payload url = payload.get("url", "") - schema = payload.get("schema_json", "") - name = payload.get("name", "") + # Content is stored as JSON string in Qdrant payload + content = payload.get("content", "") + type_name = payload.get("type", "") site_name = payload.get("site", "") - results.append([url, schema, name, site_name]) + results.append([url, content, type_name, site_name]) return results @@ -298,11 +299,12 @@ async def search( query_params: Additional query parameters Returns: - List[List[str]]: List of search results in format [url, text_json, name, site] + List[List[str]]: List of search results in format [url, content_json, type, site] """ collection_name = collection_name or self.default_collection_name try: + # Embed the query using NLWeb's embedding function start_embed = time.time() embedding = await get_embedding(query, query_params=query_params) embed_time = time.time() - start_embed @@ -314,24 +316,24 @@ async def search( filter_condition = self._create_site_filter(site) # Ensure collection exists before searching - collection_created = await self.ensure_collection_exists( - collection_name, len(embedding) + if not await client.collection_exists(collection_name): + # Collection doesn't exist, return empty results + return [] + + # Perform the search using async client query_points method + search_response = await client.query_points( + collection_name=collection_name, + query=embedding, + limit=num_results, + query_filter=filter_condition, + with_payload=True, # Critical: fetches the content back ) - if collection_created: - # Collection was just created, return empty results - results = [] - else: - # Perform the search - search_result = await client.search( - collection_name=collection_name, - query_vector=embedding, - limit=num_results, - query_filter=filter_condition, - with_payload=True, - ) + + # Extract points from response + search_result = search_response.points - # Format the results - results = self._format_results(search_result) + # Format the results + results = self._format_results(search_result) retrieve_time = time.time() - start_retrieve @@ -380,8 +382,8 @@ async def search_by_url( payload = item.payload formatted_result = [ payload.get("url", ""), - payload.get("schema_json", ""), - payload.get("name", ""), + payload.get("content", ""), # Content as JSON string + payload.get("type", ""), payload.get("site", ""), ]