Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions packages/providers/qdrant/vectordb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Create `config.yaml`:
retrieval:
provider: qdrant
import_path: nlweb_qdrant_vectordb.qdrant_client
class_name: QdrantClient
class_name: QdrantVectorClient
api_endpoint_env: QDRANT_URL # Optional for remote Qdrant
api_key_env: QDRANT_API_KEY # Optional for remote Qdrant
database_path_env: QDRANT_PATH # Optional for local Qdrant
Expand Down Expand Up @@ -70,14 +70,34 @@ results = await retriever.search(

## Features

- Vector similarity search with Qdrant
- Vector similarity search with Qdrant using 1536-dimensional embeddings
- Support for both remote and local Qdrant instances
- HNSW-based efficient similarity search
- Configurable collection names
- API key authentication for remote instances
- Local file-based storage option
- Automatic query embedding using NLWeb's embedding providers
- Compatible with NLWeb Protocol v0.5

## Data Format

The Qdrant provider expects documents with the following payload structure:
- `url`: Document URL (string)
- `content`: Full document content as JSON string (string)
- `type`: Document type (string)
- `site`: Site identifier for filtering (string)
- `embedding`: 1536-dimensional vector (stored as the point's vector in Qdrant, not as a payload field)

Example payload:
```json
{
"url": "https://example.com/page",
"content": "{\"@type\": \"Article\", \"name\": \"Title\", \"description\": \"...\"}",
"type": "Article",
"site": "example.com"
}
```

## Creating Your Own Provider Package

Use this package as a template:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
NLWeb Qdrant Vector Database Provider
"""

from nlweb_qdrant_vectordb.qdrant_client import QdrantClient
from nlweb_qdrant_vectordb.qdrant_client import QdrantVectorClient
from nlweb_qdrant_vectordb.qdrant_writer import QdrantWriter

__all__ = ["QdrantClient", "QdrantWriter"]
__all__ = ["QdrantVectorClient", "QdrantWriter"]
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from nlweb_core.retriever import VectorDBClientInterface


class QdrantClient(VectorDBClientInterface):
class QdrantVectorClient(VectorDBClientInterface):
"""
Client for Qdrant vector database operations, providing a unified interface for
indexing, storing, and retrieving vector-based search results.
Expand Down Expand Up @@ -131,11 +131,11 @@ async def _get_qdrant_client(self) -> AsyncQdrantClient:
try:
params = self._create_client_params()

# Create client with the determined parameters
# Create async client
client = AsyncQdrantClient(**params)

# Test connection by getting collections
collections = await client.get_collections()
await client.get_collections()

# Store in cache with lock
with self._client_lock:
Expand Down Expand Up @@ -258,7 +258,7 @@ def _create_site_filter(self, site: Union[str, List[str]]):

def _format_results(self, search_result: List[models.ScoredPoint]) -> List[List[str]]:
"""
Format Qdrant search results to match expected API: [url, text_json, name, site].
Format Qdrant search results to match expected API: [url, content_json, type, site].

Args:
search_result: Qdrant search results
Expand All @@ -270,11 +270,12 @@ def _format_results(self, search_result: List[models.ScoredPoint]) -> List[List[
for item in search_result:
payload = item.payload
url = payload.get("url", "")
schema = payload.get("schema_json", "")
name = payload.get("name", "")
# Content is stored as JSON string in Qdrant payload
content = payload.get("content", "")
type_name = payload.get("type", "")
site_name = payload.get("site", "")

results.append([url, schema, name, site_name])
results.append([url, content, type_name, site_name])

return results

Expand All @@ -298,11 +299,12 @@ async def search(
query_params: Additional query parameters

Returns:
List[List[str]]: List of search results in format [url, text_json, name, site]
List[List[str]]: List of search results in format [url, content_json, type, site]
"""
collection_name = collection_name or self.default_collection_name

try:
# Embed the query using NLWeb's embedding function
start_embed = time.time()
embedding = await get_embedding(query, query_params=query_params)
embed_time = time.time() - start_embed
Expand All @@ -314,24 +316,24 @@ async def search(
filter_condition = self._create_site_filter(site)

# Check whether the collection exists before searching
collection_created = await self.ensure_collection_exists(
collection_name, len(embedding)
if not await client.collection_exists(collection_name):
# Collection doesn't exist, return empty results
return []

# Perform the search using async client query_points method
search_response = await client.query_points(
collection_name=collection_name,
query=embedding,
limit=num_results,
query_filter=filter_condition,
with_payload=True, # Critical: fetches the content back
)
if collection_created:
# Collection was just created, return empty results
results = []
else:
# Perform the search
search_result = await client.search(
collection_name=collection_name,
query_vector=embedding,
limit=num_results,
query_filter=filter_condition,
with_payload=True,
)

# Extract points from response
search_result = search_response.points

# Format the results
results = self._format_results(search_result)
# Format the results
results = self._format_results(search_result)

retrieve_time = time.time() - start_retrieve

Expand Down Expand Up @@ -380,8 +382,8 @@ async def search_by_url(
payload = item.payload
formatted_result = [
payload.get("url", ""),
payload.get("schema_json", ""),
payload.get("name", ""),
payload.get("content", ""), # Content as JSON string
payload.get("type", ""),
payload.get("site", ""),
]

Expand Down
Loading