class DataCommonsSearchClient:
    """
    Adapts the Data Commons (DC) NL API to the VectorDBClientInterface.

    Only querying is supported: ``search`` and ``search_all_sites`` issue a
    GET request against the DC NL endpoint and convert each returned chart
    into a four-element row. Upload, deletion and lookup-by-url raise
    ``NotImplementedError``.
    """

    # Endpoint and fixed query-string suffix for the DC NL API. They live on
    # the class so URL construction does not depend on names defined outside
    # this block.
    _BASE_URL = 'https://nl.datacommons.org/nodejs/query'
    _POINT_PARAMS = 'allCharts=1&mode=toolformer_rig&idx=base_uae_mem'

    # Retrieval endpoint configuration; assigned per instance in __init__.
    _cfg = None

    def __init__(self, endpoint_name: Optional[str] = None):
        """
        Look up the retrieval endpoint configuration for ``endpoint_name``.

        Args:
            endpoint_name: Key into ``CONFIG.retrieval_endpoints``. The lookup
                is a plain subscript, so whatever that mapping raises for an
                unknown key (including the default ``None``) propagates.
        """
        self._cfg = CONFIG.retrieval_endpoints[endpoint_name]

    @classmethod
    def _build_url(cls, query: str, api_key: Optional[str]) -> str:
        """
        Return the full request URL for ``query``.

        The query and key are percent-encoded so that characters such as
        ``&``, ``#`` or ``=`` in user input cannot truncate the query or
        inject/override other parameters. Spaces are still sent as ``+``.
        """
        from urllib.parse import urlencode

        encoded = urlencode({"q": query.strip(), "key": api_key or ""})
        return f"{cls._BASE_URL}/?{encoded}&{cls._POINT_PARAMS}"

    @staticmethod
    def _charts_to_rows(payload: Any, num_results: Optional[int] = None) -> List[List[str]]:
        """
        Convert a decoded DC response into result rows.

        Each row is ``[dcUrl, <chart serialized as JSON>, title, ""]``.
        Charts whose ``type`` is ``'HIGHLIGHT'`` are skipped, as are entries
        that are not dicts. At most ``num_results`` rows are returned when a
        limit is given.
        """
        if not isinstance(payload, dict):
            return []

        rows: List[List[str]] = []
        # `or []` also covers an explicit `"charts": null` in the response.
        for chart in payload.get('charts') or []:
            if num_results is not None and len(rows) >= num_results:
                break
            if not isinstance(chart, dict) or chart.get('type') == 'HIGHLIGHT':
                continue
            rows.append([
                chart.get('dcUrl', ''), json.dumps(chart), chart.get('title', ''), "",
            ])
        return rows

    async def delete_documents_by_site(self, site: str, **kwargs) -> int:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Deletion not implemented yet")

    # NOTE(review): the original method name was `deleted_documents_by_site`,
    # presumably a typo of the interface's delete method -- confirm against
    # VectorDBClientInterface. The old name is kept so existing callers work.
    deleted_documents_by_site = delete_documents_by_site

    async def upload_documents(self, documents: List[Dict[str, Any]], **kwargs) -> int:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Incremental updates not implemented yet")

    async def search(self, query: str, site: Union[str, List[str]], num_results: int=50, **kwargs) -> List[List[str]]:
        """
        Query the DC NL API and return one row per non-highlight chart.

        Args:
            query: Natural-language query text.
            site: Accepted for interface compatibility; not used by this client.
            num_results: Maximum number of rows to return.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Rows of ``[dcUrl, chart_json, title, ""]``.

        Raises:
            Exception: Any failure (transport error, non-2xx status, invalid
                JSON) is logged and then re-raised unchanged.
        """
        try:
            url = self._build_url(query, self._cfg.api_key)
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=60)
                response.raise_for_status()
                return self._charts_to_rows(response.json(), num_results)
        except Exception as e:
            # NOTE(review): the API key travels in the query string, so the
            # exception text logged here may contain it -- consider redacting.
            logger.exception("Error in DataCommonsSearchClient.search")
            logger.log_with_context(
                LogLevel.ERROR,
                "Data Commons Search retrieval failed",
                {
                    "error_type": type(e).__name__,
                    "error_message": str(e),
                }
            )
            raise

    async def search_by_url(self, url: str, **kwargs) -> Optional[List[str]]:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Search by url not implemented yet")

    async def search_all_sites(self, query: str, num_results: int = 50, **kwargs) -> List[List[str]]:
        """Run ``search`` with an empty ``site``; this client does not filter by site."""
        return await self.search(query, site="", num_results=num_results, **kwargs)
retrieval.data_commons_client import DataCommonsSearchClient logger = get_configured_logger("retriever") @@ -174,6 +175,8 @@ async def get_client(self) -> VectorDBClientInterface: client = QdrantVectorClient(self.endpoint_name) elif self.db_type == "snowflake_cortex_search": client = SnowflakeCortexSearchClient(self.endpoint_name) + elif self.db_type == 'data_commons': + client = DataCommonsSearchClient(self.endpoint_name) else: error_msg = f"Unsupported database type: {self.db_type}" logger.error(error_msg) diff --git a/static/dropdown-interface.js b/static/dropdown-interface.js index be4b2d84c..691a9ed5a 100644 --- a/static/dropdown-interface.js +++ b/static/dropdown-interface.js @@ -311,7 +311,8 @@ export class DropdownInterface { { id: 'milvus_1', name: 'Milvus' }, { id: 'qdrant_local', name: 'Qdrant Local' }, { id: 'qdrant_url', name: 'Qdrant URL' }, - { id: 'snowflake_cortex_search_1', name: 'Snowflake_Cortex_Search' } + { id: 'snowflake_cortex_search_1', name: 'Snowflake_Cortex_Search' }, + { id: 'data_commons', name: 'DataCommons'} ]; } } \ No newline at end of file