From 95c1f18221827539caca84a385d442b68de99828 Mon Sep 17 00:00:00 2001 From: Aakash Thatte Date: Wed, 31 Jan 2024 01:07:22 +0530 Subject: [PATCH 1/3] Add marqo vector store --- evadb/catalog/catalog_type.py | 1 + evadb/interfaces/relational/db.py | 2 +- .../parser/lark_visitor/_create_statements.py | 2 + evadb/third_party/vector_stores/marqo.py | 118 ++++++++++++++++++ evadb/third_party/vector_stores/utils.py | 7 ++ evadb/utils/generic_utils.py | 18 +++ setup.py | 3 + 7 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 evadb/third_party/vector_stores/marqo.py diff --git a/evadb/catalog/catalog_type.py b/evadb/catalog/catalog_type.py index 5da568779..6282bb63d 100644 --- a/evadb/catalog/catalog_type.py +++ b/evadb/catalog/catalog_type.py @@ -119,6 +119,7 @@ class VectorStoreType(EvaDBEnum): CHROMADB # noqa: F821 WEAVIATE # noqa: F821 MILVUS # noqa: F821 + MARQO # noqa: F821 class VideoColumnName(EvaDBEnum): diff --git a/evadb/interfaces/relational/db.py b/evadb/interfaces/relational/db.py index 714593a8a..d4835c425 100644 --- a/evadb/interfaces/relational/db.py +++ b/evadb/interfaces/relational/db.py @@ -269,7 +269,7 @@ def create_vector_index( table_name (str): Name of the table. expr (str): Expression used to build the vector index. - using (str): Method used for indexing, can be `FAISS` or `QDRANT` or `PINECONE` or `CHROMADB` or `WEAVIATE` or `MILVUS`. + using (str): Method used for indexing, can be `FAISS` or `QDRANT` or `PINECONE` or `CHROMADB` or `WEAVIATE` or `MILVUS` or `MARQO`. Returns: EvaDBCursor: The EvaDBCursor object. diff --git a/evadb/parser/lark_visitor/_create_statements.py b/evadb/parser/lark_visitor/_create_statements.py index 175f0087e..cf15a76e8 100644 --- a/evadb/parser/lark_visitor/_create_statements.py +++ b/evadb/parser/lark_visitor/_create_statements.py @@ -304,6 +304,8 @@ def vector_store_type(self, tree): vector_store_type = VectorStoreType.WEAVIATE elif str.upper(token) == "MILVUS": vector_store_type = VectorStoreType.MILVUS + elif str.upper(token) == "MARQO": + vector_store_type = VectorStoreType.MARQO return vector_store_type diff --git a/evadb/third_party/vector_stores/marqo.py b/evadb/third_party/vector_stores/marqo.py new file mode 100644 index 000000000..c0c3e18a3 --- /dev/null +++ b/evadb/third_party/vector_stores/marqo.py @@ -0,0 +1,118 @@ +from typing import List + +from evadb.third_party.vector_stores.types import ( + FeaturePayload, + VectorIndexQuery, + VectorIndexQueryResult, + VectorStore, +) +from evadb.utils.generic_utils import try_to_import_marqo_client + +_marqo_client_instance = None + +required_params = ["url", "index_name"] + + +def get_marqo_client(url: str, api_key: str=None): + global _marqo_client_instance + if _marqo_client_instance is None: + try_to_import_marqo_client() + import marqo as mq + _marqo_client_instance = mq.Client(url=url, api_key=api_key) + return _marqo_client_instance + + +class MarqoVectorStore(VectorStore): + def __init__(self, index_name: str, url: str = "http://0.0.0.0:8882", api_key=None) -> None: + self._client = get_marqo_client(url=url) + self._index_name = index_name + + def create(self, vector_dim: int): + + # Delete index if exists already + if self._index_name in [i.index_name for i in self._client.get_indexes()['results']]: + self.delete() + + # create fresh + # Refer here for details - https://docs.marqo.ai/2.0.0/API-Reference/Indexes/create_index/ + self._client.create_index( + index_name=self._index_name, + settings_dict={ + 'index_defaults': { + 'model': 'no_model', + 'model_properties': { + 'dimensions': vector_dim + }, + + 'normalize_embeddings': True, + 'ann_parameters':{ + 'space_type': 'cosinesimil' + } + } + } + ) + + def add(self, payload: List[FeaturePayload]): + + ids = [int(row.id) for row in payload] + embeddings = [row.embedding for row in payload] + + data = [] + for _id, _emb in zip(ids, embeddings): + _id = str(_id) + data.append( + { + '_id': _id, + 'evadb_data':{ + 'vector': _emb + } + } + ) + + # For reference and more information + # check - https://docs.marqo.ai/1.4.0/Guides/Advanced-Usage/document_fields/#custom-vector-object + self._client.index( + index_name=self._index_name + ).add_documents( + documents=data, + mappings={ + 'evadb_data':{ + 'type': 'custom_vector' + } + }, + tensor_fields=['evadb_data'], + auto_refresh=True, + client_batch_size=64 + ) + + + def delete(self) -> None: + self._client.delete_index(index_name=self._index_name) + + def query( + self, + query: VectorIndexQuery, + ) -> VectorIndexQueryResult: + response = self._client.index( + self._index_name).search( + context={ + 'tensor':[ + { + 'vector': list(query.embedding), + 'weight' : 1 + } + ], + }, + limit=query.top_k + ) + + similarities, ids = [], [] + + for result in response['hits']: + ids.append(result['_id']) + + # Because it is similarity score + similarities.append(1-result['_score']) + + return VectorIndexQueryResult(similarities=similarities, ids=ids) + diff --git a/evadb/third_party/vector_stores/utils.py b/evadb/third_party/vector_stores/utils.py index 9c12fc6fb..8478126ca 100644 --- a/evadb/third_party/vector_stores/utils.py +++ b/evadb/third_party/vector_stores/utils.py @@ -15,6 +15,7 @@ from evadb.catalog.catalog_type import VectorStoreType from evadb.third_party.vector_stores.chromadb import ChromaDBVectorStore from evadb.third_party.vector_stores.faiss import FaissVectorStore +from evadb.third_party.vector_stores.marqo import MarqoVectorStore from evadb.third_party.vector_stores.milvus import MilvusVectorStore from evadb.third_party.vector_stores.pinecone import PineconeVectorStore from evadb.third_party.vector_stores.qdrant import QdrantVectorStore @@ -67,5 +68,11 @@ def init_vector_store( validate_kwargs(kwargs, allowed_params, required_params) return MilvusVectorStore(index_name, **kwargs) + elif vector_store_type == VectorStoreType.MARQO: + from evadb.third_party.vector_stores.marqo import required_params + + validate_kwargs(kwargs, required_params, required_params) + return MarqoVectorStore(index_name, **kwargs) + else: raise Exception(f"Vector store {vector_store_type} not supported") diff --git a/evadb/utils/generic_utils.py b/evadb/utils/generic_utils.py index 426719f87..f021129ba 100644 --- a/evadb/utils/generic_utils.py +++ b/evadb/utils/generic_utils.py @@ -593,6 +593,16 @@ def try_to_import_milvus_client(): ) +def try_to_import_marqo_client(): + try: + import marqo # noqa: F401 + except ImportError: + raise ValueError( + """Could not import marqo python package. + Please install it with `pip install marqo`.""" + ) + + def is_qdrant_available() -> bool: try: try_to_import_qdrant_client() @@ -633,6 +643,14 @@ def is_milvus_available() -> bool: return False +def is_marqo_available() -> bool: + try: + try_to_import_marqo_client() + return True + except ValueError: + return False + + ############################## ## UTILS ############################## diff --git a/setup.py b/setup.py index e3d211ece..e0dd5ee70 100644 --- a/setup.py +++ b/setup.py @@ -116,6 +116,8 @@ def read(path, encoding="utf-8"): milvus_libs = ["pymilvus>=2.3.0"] +marqo_libs = ["marqo"] + postgres_libs = [ "psycopg2", @@ -177,6 +179,7 @@ def read(path, encoding="utf-8"): "chromadb": chromadb_libs, "milvus": milvus_libs, "weaviate": weaviate_libs, + "marqo": marqo_libs, "postgres": postgres_libs, "ludwig": ludwig_libs, "sklearn": sklearn_libs, From 5a5bc6f862c455b810b79534b786e81f2e6f8668 Mon Sep 17 00:00:00 2001 From: Aakash Thatte Date: Wed, 31 Jan 2024 22:12:54 +0530 Subject: [PATCH 2/3] Fix formatting --- evadb/third_party/vector_stores/marqo.py | 99 +++++++++++------------- 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/evadb/third_party/vector_stores/marqo.py b/evadb/third_party/vector_stores/marqo.py index c0c3e18a3..204d801a1 100644 --- a/evadb/third_party/vector_stores/marqo.py +++ b/evadb/third_party/vector_stores/marqo.py @@ -1,3 +1,17 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import List from evadb.third_party.vector_stores.types import ( @@ -13,24 +27,29 @@ required_params = ["url", "index_name"] -def get_marqo_client(url: str, api_key: str=None): +def get_marqo_client(url: str, api_key: str = None): global _marqo_client_instance if _marqo_client_instance is None: try_to_import_marqo_client() - import marqo as mq + import marqo as mq + _marqo_client_instance = mq.Client(url=url, api_key=api_key) return _marqo_client_instance class MarqoVectorStore(VectorStore): - def __init__(self, index_name: str, url: str = "http://0.0.0.0:8882", api_key=None) -> None: + def __init__( + self, index_name: str, url: str = "http://0.0.0.0:8882", api_key=None + ) -> None: self._client = get_marqo_client(url=url) self._index_name = index_name def create(self, vector_dim: int): # Delete index if exists already - if self._index_name in [i.index_name for i in self._client.get_indexes()['results']]: + if self._index_name in [ + i.index_name for i in self._client.get_indexes()["results"] + ]: self.delete() # create fresh @@ -38,53 +57,34 @@ def create(self, vector_dim: int): self._client.create_index( index_name=self._index_name, settings_dict={ - 'index_defaults': { - 'model': 'no_model', - 'model_properties': { - 'dimensions': vector_dim - }, - - 'normalize_embeddings': True, - 'ann_parameters':{ - 'space_type': 'cosinesimil' - } + "index_defaults": { + "model": "no_model", + "model_properties": {"dimensions": vector_dim}, + "normalize_embeddings": True, + "ann_parameters": {"space_type": "cosinesimil"}, } - } + }, ) def add(self, payload: List[FeaturePayload]): ids = [int(row.id) for row in payload] embeddings = [row.embedding for row in payload] - + data = [] for _id, _emb in zip(ids, embeddings): _id = str(_id) - data.append( - { - '_id': _id, - 'evadb_data':{ - 'vector': _emb - } - } - ) - + data.append({"_id": _id, "evadb_data": {"vector": _emb}}) + # For reference and more information # check - https://docs.marqo.ai/1.4.0/Guides/Advanced-Usage/document_fields/#custom-vector-object - self._client.index( - index_name=self._index_name - ).add_documents( + self._client.index(index_name=self._index_name).add_documents( documents=data, - mappings={ - 'evadb_data':{ - 'type': 'custom_vector' - } - }, - tensor_fields=['evadb_data'], + mappings={"evadb_data": {"type": "custom_vector"}}, + tensor_fields=["evadb_data"], auto_refresh=True, - client_batch_size=64 - ) - + client_batch_size=64, + ) def delete(self) -> None: self._client.delete_index(index_name=self._index_name) @@ -93,26 +93,19 @@ def query( self, query: VectorIndexQuery, ) -> VectorIndexQueryResult: - response = self._client.index( - self._index_name).search( - context={ - 'tensor':[ - { - 'vector': list(query.embedding), - 'weight' : 1 - } - ], - }, - limit=query.top_k - ) + response = self._client.index(self._index_name).search( + context={ + "tensor": [{"vector": list(query.embedding), "weight": 1}], + }, + limit=query.top_k, + ) similarities, ids = [], [] - for result in response['hits']: - ids.append(result['_id']) + for result in response["hits"]: + ids.append(result["_id"]) # Because it is similarity score - similarities.append(1-result['_score']) + similarities.append(1 - result["_score"]) return VectorIndexQueryResult(similarities=similarities, ids=ids) - From b143ec83c9cb728cd7fe176d4f150abb76b8be24 Mon Sep 17 00:00:00 2001 From: Aakash Thatte Date: Wed, 31 Jan 2024 22:13:20 +0530 Subject: [PATCH 3/3] Add docs for marqo --- docs/_toc.yml | 1 + .../reference/vector_databases/marqo.rst | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 docs/source/reference/vector_databases/marqo.rst diff --git a/docs/_toc.yml b/docs/_toc.yml index 38309dbcf..361cb5a60 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -91,6 +91,7 @@ parts: - file: source/reference/vector_databases/pinecone - file: source/reference/vector_databases/milvus - file: source/reference/vector_databases/weaviate + - file: source/reference/vector_databases/marqo - file: source/reference/ai/index title: AI Engines diff --git a/docs/source/reference/vector_databases/marqo.rst b/docs/source/reference/vector_databases/marqo.rst new file mode 100644 index 000000000..3c9a1ff17 --- /dev/null +++ b/docs/source/reference/vector_databases/marqo.rst @@ -0,0 +1,27 @@ +Marqo +========== + +Marqo is more than a vector database, it's an end-to-end vector search engine for both text and images. +Vector generation, storage and retrieval are handled out of the box through a single API. +Connection is based on the `marqo `_ client library. + +Dependency +---------- + +* marqo==2.1.0 + + +Setup +----- + +To use marqo, you need a URL and API key to your instance. +Here are `instructions to setup local instance `_. +Cloud offering can also be used. + + +Create Index +----------------- + +.. code-block:: sql + + CREATE INDEX index_name ON table_name (data) USING MARQO; \ No newline at end of file