diff --git a/pyproject.toml b/pyproject.toml index 3bbba8ac0..9e4ee831e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ endee = [ "endee==0.1.10" ] lindorm = [ "opensearch-py" ] seekdb = [ "mysql-connector-python" ] pinot = [ "requests" ] +logosdb = [ "logosdb" ] [project.urls] Repository = "https://github.com/zilliztech/VectorDBBench" diff --git a/vectordb_bench/backend/clients/__init__.py b/vectordb_bench/backend/clients/__init__.py index 4be8d0424..0d39ccd69 100644 --- a/vectordb_bench/backend/clients/__init__.py +++ b/vectordb_bench/backend/clients/__init__.py @@ -63,6 +63,7 @@ class DB(Enum): PolarDB = "PolarDB" Pinot = "Pinot" SeekDB = "SeekDB" + LogosDB = "LogosDB" @property def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915 @@ -269,6 +270,11 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915 return SeekDB + if self == DB.LogosDB: + from .logosdb.logosdb import LogosDB + + return LogosDB + msg = f"Unknown DB: {self.name}" raise ValueError(msg) @@ -477,6 +483,11 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915 return SeekDBConfig + if self == DB.LogosDB: + from .logosdb.config import LogosDBConfig + + return LogosDBConfig + msg = f"Unknown DB: {self.name}" raise ValueError(msg) @@ -667,6 +678,11 @@ def case_config_cls( # noqa: C901, PLR0911, PLR0912, PLR0915 return _seekdb_case_config.get(index_type) + if self == DB.LogosDB: + from .logosdb.config import LogosDBIndexConfig + + return LogosDBIndexConfig + # DB.Pinecone, DB.Redis return EmptyDBCaseConfig diff --git a/vectordb_bench/backend/clients/logosdb/__init__.py b/vectordb_bench/backend/clients/logosdb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vectordb_bench/backend/clients/logosdb/cli.py b/vectordb_bench/backend/clients/logosdb/cli.py new file mode 100644 index 000000000..cfa18b889 --- /dev/null +++ b/vectordb_bench/backend/clients/logosdb/cli.py @@ -0,0 +1,44 @@ 
+from typing import Annotated, Unpack
+
+import click
+
+from vectordb_bench.backend.clients import DB
+from vectordb_bench.cli.cli import (
+    CommonTypedDict,
+    cli,
+    click_parameter_decorators_from_typed_dict,
+    run,
+)
+
+DBTYPE = DB.LogosDB
+
+
+class LogosDBTypedDict(CommonTypedDict):
+    uri: Annotated[
+        str,
+        click.option(
+            "--uri",
+            type=str,
+            help="Path to LogosDB directory (local embedded DB)",
+            required=False,
+            default="/tmp/vectordbbench_logosdb",
+            show_default=True,
+        ),
+    ]
+
+
+@cli.command()
+@click_parameter_decorators_from_typed_dict(LogosDBTypedDict)
+def LogosDB(**parameters: Unpack[LogosDBTypedDict]):
+    from .config import LogosDBConfig, LogosDBIndexConfig
+
+    # LogosDB is documented as single-process; disable concurrent search
+    # until a thread-safe concurrent runner is available.
+    parameters["search_concurrent"] = False
+
+    run(
+        db=DBTYPE,
+        db_config=LogosDBConfig(uri=parameters["uri"]),
+        db_case_config=LogosDBIndexConfig(),
+        **parameters,
+    )
diff --git a/vectordb_bench/backend/clients/logosdb/config.py b/vectordb_bench/backend/clients/logosdb/config.py
new file mode 100644
index 000000000..0d6bc58cc
--- /dev/null
+++ b/vectordb_bench/backend/clients/logosdb/config.py
@@ -0,0 +1,47 @@
+"""Connection and case configuration for the LogosDB benchmark client."""
+
+from pydantic import BaseModel
+
+from ..api import DBCaseConfig, DBConfig, MetricType
+
+
+class LogosDBConfig(DBConfig):
+    """Connection settings for LogosDB: the path of its database directory."""
+
+    uri: str = "/tmp/vectordbbench_logosdb"
+
+    def to_dict(self) -> dict:
+        """Return the settings as a plain dict with the single key ``uri``."""
+        return {"uri": self.uri}
+
+
+class LogosDBIndexConfig(BaseModel, DBCaseConfig):
+    """Case-level settings for LogosDB; only the distance metric is carried."""
+
+    metric_type: MetricType | None = None
+
+    def parse_metric(self) -> int:
+        """Map ``metric_type`` to the matching ``logosdb.DIST_*`` constant.
+
+        L2 and IP map to their own constants; every other value, including
+        ``None``, falls back to cosine.
+        """
+        # Imported at call time so importing this module does not require logosdb.
+        import logosdb
+
+        metric = self.metric_type
+        if metric == MetricType.L2:
+            distance = logosdb.DIST_L2
+        elif metric == MetricType.IP:
+            distance = logosdb.DIST_IP
+        else:
+            distance = logosdb.DIST_COSINE
+        return distance
+
+    def index_param(self) -> dict:
+        """Return the index-build parameters (none for LogosDB)."""
+        return {}
+
+    def search_param(self) -> dict:
+        """Return the search-time parameters (none for LogosDB)."""
+        return {}
diff --git a/vectordb_bench/backend/clients/logosdb/logosdb.py b/vectordb_bench/backend/clients/logosdb/logosdb.py
new file mode
100644
index 000000000..84bfee045
--- /dev/null
+++ b/vectordb_bench/backend/clients/logosdb/logosdb.py
@@ -0,0 +1,125 @@
+"""VectorDBBench client for LogosDB, a local embedded vector database."""
+
+import logging
+import shutil
+from collections.abc import Iterable
+from contextlib import contextmanager
+from pathlib import Path
+
+import numpy as np
+
+from ..api import VectorDB
+from .config import LogosDBIndexConfig
+
+log = logging.getLogger(__name__)
+
+
+class LogosDB(VectorDB):
+    """Benchmark client that stores and searches vectors in a LogosDB directory.
+
+    A database handle is held in ``self.db`` only while ``init()`` is active.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: LogosDBIndexConfig,
+        collection_name: str = "LogosDBCollection",
+        drop_old: bool = False,
+        name: str = "LogosDB",
+        **kwargs,
+    ):
+        """Store the settings and open the database once at ``db_config["uri"]``.
+
+        Args:
+            dim: Dimension of the vectors.
+            db_config: Connection settings; must contain the key ``"uri"``.
+            db_case_config: Case settings that supply the distance metric.
+            collection_name: Accepted for interface compatibility; unused.
+            drop_old: Delete an existing directory at the uri before opening.
+            name: Label used in log messages.
+            **kwargs: Ignored.
+        """
+        self.name = name
+        self.db_config = db_config
+        self.case_config = db_case_config
+        self.dim = dim
+        self.uri = db_config["uri"]
+        self.db = None
+
+        if drop_old and Path(self.uri).exists():
+            log.info(f"{self.name} drop_old: removing {self.uri}")
+            shutil.rmtree(self.uri)
+
+        import logosdb as _logosdb
+
+        # Open and immediately release a handle; the handle used for reads and
+        # writes is opened later by init().
+        distance = self.case_config.parse_metric()
+        db = _logosdb.DB(self.uri, dim=self.dim, distance=distance)
+        log.info(f"{self.name} initialized at {self.uri} dim={dim} distance={distance}")
+        del db
+
+    @contextmanager
+    def init(self):
+        """Open the database for the duration of the ``with`` block.
+
+        ``self.db`` holds the handle inside the block and is reset to ``None``
+        on exit, including when the body raises.
+        """
+        import logosdb as _logosdb
+
+        distance = self.case_config.parse_metric()
+        self.db = _logosdb.DB(self.uri, dim=self.dim, distance=distance)
+        try:
+            yield
+        finally:
+            # Rebinding drops this object's reference to the handle.
+            self.db = None
+
+    def insert_embeddings(
+        self,
+        embeddings: Iterable[list[float]],
+        metadata: list[int],
+        **kwargs,
+    ) -> tuple[int, Exception | None]:
+        """Insert a batch of vectors, storing each id as the record's text.
+
+        Returns:
+            ``(len(metadata), None)`` on success, ``(0, None)`` for an empty
+            batch, or ``(0, error)`` when the insert raises.
+        """
+        assert self.db is not None, "insert_embeddings must be called inside init()"
+        try:
+            embeddings_arr = np.array(list(embeddings), dtype=np.float32)
+            if embeddings_arr.size == 0:
+                # Nothing to write; skip the call into the database.
+                return 0, None
+            texts = [str(m) for m in metadata]
+            self.db.put_batch(embeddings_arr, texts=texts)
+            return len(metadata), None
+        except Exception as e:
+            log.warning(f"{self.name} insert_embeddings error: {e}")
+            return 0, e
+
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
+        timeout: int | None = None,
+    ) -> list[int]:
+        """Return the ids of up to ``k`` hits for ``query``.
+
+        ``filters`` and ``timeout`` are accepted but not used.
+        """
+        assert self.db is not None, "search_embedding must be called inside init()"
+        q = np.array(query, dtype=np.float32)
+        hits = self.db.search(q, top_k=k)
+        # Ids were written as text by insert_embeddings; convert them back.
+        return [int(h.text) for h in hits]
+
+    def optimize(self, data_size: int | None = None):
+        """Log that no explicit optimization step is performed."""
+        log.info(f"{self.name} optimize: HNSW index is built incrementally, no explicit step needed")
diff --git a/vectordb_bench/cli/vectordbbench.py b/vectordb_bench/cli/vectordbbench.py
index eca3dbc52..698a80805 100644
--- a/vectordb_bench/cli/vectordbbench.py
+++ b/vectordb_bench/cli/vectordbbench.py
@@ -15,6 +15,7 @@
 from ..backend.clients.hologres.cli import HologresHGraph
 from ..backend.clients.lancedb.cli import LanceDB
 from ..backend.clients.lindorm.cli import LindormHNSW, LindormIVFBQ, LindormIVFPQ
+from ..backend.clients.logosdb.cli import LogosDB
 from ..backend.clients.mariadb.cli import MariaDBHNSW
 from ..backend.clients.memorydb.cli import MemoryDB
 from ..backend.clients.milvus.cli import MilvusAutoIndex
@@ -97,6 +98,7 @@
 cli.add_command(PolarDBHNSWPQ)
 cli.add_command(PolarDBHNSWSQ)
 cli.add_command(SeekDBHNSW)
+cli.add_command(LogosDB)
 
 
 if __name__ == "__main__":