diff --git a/graphgen/bases/__init__.py b/graphgen/bases/__init__.py
index 55048d6c..3d0bc800 100644
--- a/graphgen/bases/__init__.py
+++ b/graphgen/bases/__init__.py
@@ -4,6 +4,7 @@ from .base_llm_wrapper import BaseLLMWrapper
 from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
+from .base_searcher import BaseSearcher
 from .base_splitter import BaseSplitter
 from .base_storage import (
     BaseGraphStorage,
diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py
new file mode 100644
index 00000000..f680ab04
--- /dev/null
+++ b/graphgen/bases/base_searcher.py
@@ -0,0 +1,18 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+
+class BaseSearcher(ABC):
+    """
+    Abstract base class for searching and retrieving data.
+    """
+
+    @abstractmethod
+    async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Search for data based on the given query.
+
+        :param query: The search query.
+        :param kwargs: Additional keyword arguments for the searcher.
+        :return: List of dictionaries containing the search results.
+        """
diff --git a/graphgen/configs/search_config.yaml b/graphgen/configs/search_config.yaml
new file mode 100644
index 00000000..69b3b9c0
--- /dev/null
+++ b/graphgen/configs/search_config.yaml
@@ -0,0 +1,8 @@
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/search_demo.jsonl # input file path; supported formats: json, jsonl, txt, pdf (see resources/input_examples for examples)
+
+  - name: search
+    params:
+      data_sources: [uniprot] # data sources to search; wikipedia, google, and uniprot are recognized, but only uniprot is currently implemented
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
index 088e7b3b..827f57fe 100644
--- a/graphgen/graphgen.py
+++ b/graphgen/graphgen.py
@@ -58,7 +58,6 @@ def __init__(
         self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
             self.working_dir, namespace="_meta"
         )
-
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
         )
@@ -69,9 +68,8 @@ def __init__(
             self.working_dir, namespace="graph"
         )
         self.search_storage: JsonKVStorage = JsonKVStorage(
-            self.working_dir, namespace="search"
+            self.working_dir, namespace="searcher"
        )
-
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )
@@ -181,41 +179,33 @@ async def build_kg(self):
 
         return _add_entities_and_relations
 
-    @op("search", deps=["chunk"])
+    @op("search", deps=["read"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
-        logger.info(
-            "Search is %s", "enabled" if search_config["enabled"] else "disabled"
+        logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
+
+        seeds = await self.meta_storage.get_new_data(self.full_docs_storage)
+        if len(seeds) == 0:
+            logger.warning("All documents have already been searched")
+            return
+        search_results = await search_all(
+            seed_data=seeds,
+            **search_config,
+        )
+
+        _add_search_keys = await self.search_storage.filter_keys(
+            list(search_results.keys())
         )
-        if search_config["enabled"]:
-            logger.info("[Search] %s ...", ", ".join(search_config["search_types"]))
-            all_nodes = await self.graph_storage.get_all_nodes()
-            all_nodes_names = [node[0] for node in all_nodes]
-            new_search_entities = await self.full_docs_storage.filter_keys(
-                all_nodes_names
-            )
-            logger.info(
-                "[Search] Found %d entities to search", len(new_search_entities)
-            )
-            _add_search_data = await search_all(
-                search_types=search_config["search_types"],
-                search_entities=new_search_entities,
-            )
-            if _add_search_data:
-                await self.search_storage.upsert(_add_search_data)
-                logger.info("[Search] %d entities searched", len(_add_search_data))
-
-            # Format search results for inserting
-            search_results = []
-            for _, search_data in _add_search_data.items():
-                search_results.extend(
-                    [
-                        {"content": search_data[key]}
-                        for key in list(search_data.keys())
-                    ]
-                )
-            # TODO: fix insert after search
-            # await self.insert()
+        search_results = {
+            k: v for k, v in search_results.items() if k in _add_search_keys
+        }
+        if len(search_results) == 0:
+            logger.warning("All search results are already in storage")
+            return
+        await self.search_storage.upsert(search_results)
+        await self.search_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.full_docs_storage)
+        await self.meta_storage.index_done_callback()
 
     @op("quiz_and_judge", deps=["build_kg"])
     @async_to_sync_method
diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py
index 4580d537..8565824f 100644
--- a/graphgen/models/__init__.py
+++ b/graphgen/models/__init__.py
@@ -25,10 +25,10 @@
     RDFReader,
     TXTReader,
 )
-from .search.db.uniprot_search import UniProtSearch
-from .search.kg.wiki_search import WikiSearch
-from .search.web.bing_search import BingSearch
-from .search.web.google_search import GoogleSearch
+from .searcher.db.uniprot_searcher import UniProtSearch
+from .searcher.kg.wiki_search import WikiSearch
+from .searcher.web.bing_search import BingSearch
+from .searcher.web.google_search import GoogleSearch
 from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
 from .storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage, NetworkXStorage
 from .tokenizer import Tokenizer
diff --git a/graphgen/models/search/__init__.py b/graphgen/models/searcher/__init__.py
similarity index 100%
rename from graphgen/models/search/__init__.py
rename to graphgen/models/searcher/__init__.py
diff --git a/graphgen/models/search/db/__init__.py b/graphgen/models/searcher/db/__init__.py
similarity index 100%
rename from graphgen/models/search/db/__init__.py
rename to graphgen/models/searcher/db/__init__.py
diff --git a/graphgen/models/search/db/uniprot_search.py b/graphgen/models/searcher/db/uniprot_searcher.py
similarity index 63%
rename from graphgen/models/search/db/uniprot_search.py
rename to graphgen/models/searcher/db/uniprot_searcher.py
index 6e7d2bfb..4856ea90 100644
--- a/graphgen/models/search/db/uniprot_search.py
+++ b/graphgen/models/searcher/db/uniprot_searcher.py
@@ -1,18 +1,27 @@
+import re
 from io import StringIO
 from typing import Dict, Optional
 
 from Bio import ExPASy, SeqIO, SwissProt, UniProt
 from Bio.Blast import NCBIWWW, NCBIXML
-
+from requests.exceptions import RequestException
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from graphgen.bases import BaseSearcher
 from graphgen.utils import logger
 
 
-class UniProtSearch:
+class UniProtSearch(BaseSearcher):
     """
-    UniProt Search client to search with UniProt.
+    Client for searching UniProt.
     1) Get the protein by accession number.
-    2) Search with keywords or protein names (fuzzy search).
-    3) Search with FASTA sequence (BLAST search).
+    2) Search by keyword or protein name (fuzzy search).
+    3) Search by FASTA sequence (BLAST search).
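+    The async search() entry point auto-detects which of these applies to a given query.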
""" def get_by_accession(self, accession: str) -> Optional[dict]: @@ -21,6 +30,8 @@ def get_by_accession(self, accession: str) -> Optional[dict]: record = SwissProt.read(handle) handle.close() return self._swissprot_to_dict(record) + except RequestException: # network-related errors + raise except Exception as exc: # pylint: disable=broad-except logger.error("Accession %s not found: %s", accession, exc) return None @@ -51,7 +62,7 @@ def _swissprot_to_dict(record: SwissProt.Record) -> dict: def get_best_hit(self, keyword: str) -> Optional[Dict]: """ Search UniProt with a keyword and return the best hit. - :param keyword: The search keyword. + :param keyword: The searcher keyword. :return: A dictionary containing the best hit information or None if not found. """ if not keyword.strip(): @@ -64,15 +75,17 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]: return None return self.get_by_accession(hit["primaryAccession"]) + except RequestException: + raise except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) - return None + return None def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: """ Search UniProt with a FASTA sequence and return the best hit. :param fasta_sequence: The FASTA sequence. - :param threshold: E-value threshold for BLAST search. + :param threshold: E-value threshold for BLAST searcher. :return: A dictionary containing the best hit information or None if not found. """ try: @@ -90,6 +103,7 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: # UniProtKB/Swiss-Prot BLAST API try: + logger.debug("Performing BLAST searcher for the given sequence: %s", seq) result_handle = NCBIWWW.qblast( program="blastp", database="swissprot", @@ -98,8 +112,10 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: expect=threshold, ) blast_record = NCBIXML.read(result_handle) + except RequestException: + raise except Exception as e: # pylint: disable=broad-except - logger.error("BLAST search failed: %s", e) + logger.error("BLAST searcher failed: %s", e) return None if not blast_record.alignments: @@ -116,3 +132,44 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: # like sp|P01308.1|INS_HUMAN accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id return self.get_by_accession(accession) + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + async def search( + self, query: str, threshold: float = 0.7, **kwargs + ) -> Optional[Dict]: + """ + Search UniProt with either an accession number, keyword, or FASTA sequence. + :param query: The searcher query (accession number, keyword, or FASTA sequence). + :param threshold: E-value threshold for BLAST searcher. + :return: A dictionary containing the best hit information or None if not found. 
+        """
+
+        # auto detect query type
+        if not query or not isinstance(query, str):
+            logger.error("Empty or non-string input.")
+            return None
+        query = query.strip()
+
+        logger.debug("UniProt search query: %s", query)
+        # check if fasta sequence
+        if query.startswith(">") or re.fullmatch(
+            r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
+        ):
+            result = self.get_by_fasta(query, threshold)
+
+        # check if accession number
+        elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I):
+            result = self.get_by_accession(query)
+
+        else:
+            # otherwise treat as keyword
+            result = self.get_best_hit(query)
+
+        if result:
+            result["_search_query"] = query
+        return result
diff --git a/graphgen/models/search/kg/__init__.py b/graphgen/models/searcher/kg/__init__.py
similarity index 100%
rename from graphgen/models/search/kg/__init__.py
rename to graphgen/models/searcher/kg/__init__.py
diff --git a/graphgen/models/search/kg/wiki_search.py b/graphgen/models/searcher/kg/wiki_search.py
similarity index 100%
rename from graphgen/models/search/kg/wiki_search.py
rename to graphgen/models/searcher/kg/wiki_search.py
diff --git a/graphgen/models/search/web/__init__.py b/graphgen/models/searcher/web/__init__.py
similarity index 100%
rename from graphgen/models/search/web/__init__.py
rename to graphgen/models/searcher/web/__init__.py
diff --git a/graphgen/models/search/web/bing_search.py b/graphgen/models/searcher/web/bing_search.py
similarity index 90%
rename from graphgen/models/search/web/bing_search.py
rename to graphgen/models/searcher/web/bing_search.py
index d52815df..77ae2110 100644
--- a/graphgen/models/search/web/bing_search.py
+++ b/graphgen/models/searcher/web/bing_search.py
@@ -9,7 +9,7 @@
 class BingSearch:
     """
-    Bing Search client to search with Bing.
+    Client for searching with Bing.
     """
 
     def __init__(self, subscription_key: str):
@@ -18,9 +18,9 @@ def __init__(self, subscription_key: str):
     def search(self, query: str, num_results: int = 1):
         """
         Search with Bing and return the contexts.
         :param query: The search query.
         :param num_results: The number of results to return.
-        :return: A list of search results.
+        :return: A list of search results from the Bing Web Search API.
         """
         params = {"q": query, "mkt": BING_MKT, "count": num_results}
         response = requests.get(
diff --git a/graphgen/models/search/web/google_search.py b/graphgen/models/searcher/web/google_search.py
similarity index 88%
rename from graphgen/models/search/web/google_search.py
rename to graphgen/models/searcher/web/google_search.py
index 0b045723..0d598f3a 100644
--- a/graphgen/models/search/web/google_search.py
+++ b/graphgen/models/searcher/web/google_search.py
@@ -9,9 +9,9 @@
 class GoogleSearch:
     def __init__(self, subscription_key: str, cx: str):
         """
-        Initialize the Google Search client with the subscription key and custom search engine ID.
+        Initialize the Google Search client with a subscription key and a custom search engine ID.
         :param subscription_key: Your Google API subscription key.
         :param cx: Your custom search engine ID.
         """
         self.subscription_key = subscription_key
         self.cx = cx
@@ -19,9 +19,9 @@ def __init__(self, subscription_key: str, cx: str):
     def search(self, query: str, num_results: int = 1):
         """
         Search with Google and return the contexts.
         :param query: The search query.
         :param num_results: The number of results to return.
-        :return: A list of search results.
+        :return: A list of search results from the Custom Search API.
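+        Each result is expected to mirror an item from the Custom Search API response (typically with title, link, and snippet fields).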
""" params = { "key": self.subscription_key, diff --git a/graphgen/operators/search/kg/__init__.py b/graphgen/operators/search/kg/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/graphgen/operators/search/kg/search_wikipedia.py b/graphgen/operators/search/kg/search_wikipedia.py deleted file mode 100644 index 05449fe1..00000000 --- a/graphgen/operators/search/kg/search_wikipedia.py +++ /dev/null @@ -1,58 +0,0 @@ -from tqdm.asyncio import tqdm_asyncio as tqdm_async - -from graphgen.models import WikiSearch -from graphgen.utils import logger - - -async def _process_single_entity( - entity_name: str, - wiki_search_client: WikiSearch, -) -> str | None: - """ - Process single entity by searching Wikipedia - :param entity_name - :param wiki_search_client - :return: summary of the entity or None if not found - """ - search_results = await wiki_search_client.search(entity_name) - if not search_results: - return None - - summary = None - try: - summary = await wiki_search_client.summary(search_results[-1]) - logger.info( - "Entity %s search result: %s summary: %s", - entity_name, - str(search_results), - summary, - ) - except Exception as e: # pylint: disable=broad-except - logger.error("Error processing entity %s: %s", entity_name, str(e)) - - return summary - - -async def search_wikipedia( - wiki_search_client: WikiSearch, - entities: set[str], -) -> dict: - """ - Search wikipedia for entities - - :param wiki_search_client: wiki search client - :param entities: list of entities to search - :return: nodes with search results - """ - wiki_data = {} - - async for entity in tqdm_async( - entities, desc="Searching Wikipedia", total=len(entities) - ): - try: - summary = await _process_single_entity(entity, wiki_search_client) - if summary: - wiki_data[entity] = summary - except Exception as e: # pylint: disable=broad-except - logger.error("Error processing entity %s: %s", entity, str(e)) - return wiki_data diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py index d7ecbea1..99c71a79 100644 --- a/graphgen/operators/search/search_all.py +++ b/graphgen/operators/search/search_all.py @@ -1,82 +1,49 @@ """ To use Google Web Search API, follow the instructions [here](https://developers.google.com/custom-search/v1/overview) -to get your Google search api key. +to get your Google searcher api key. To use Bing Web Search API, follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) and obtain your Bing subscription key. """ -import os -from graphgen.utils import logger +from graphgen.utils import logger, run_concurrent async def search_all( - search_types: dict, search_entities: set[str] -) -> dict[str, dict[str, str]]: + seed_data: dict, + data_sources: list[str], +) -> dict: """ - :param search_types - :param search_entities: list of entities to search - :return: nodes with search results + Perform searches across multiple search types and aggregate the results. + :param seed_data: A dictionary containing seed data with entity names. + :param data_sources: A list of search types to perform (e.g., "wikipedia", "google", "bing", "uniprot"). 
+    :return: A dictionary mapping each data source to its search results.
     """
     results = {}
 
-    for search_type in search_types:
-        if search_type == "wikipedia":
-            from graphgen.models import WikiSearch
-            from graphgen.operators.search.kg.search_wikipedia import search_wikipedia
+    for data_source in data_sources:
+        if data_source == "uniprot":
+            from graphgen.models import UniProtSearch
 
-            wiki_search_client = WikiSearch()
+            uniprot_search_client = UniProtSearch()
 
-            wiki_results = await search_wikipedia(wiki_search_client, search_entities)
-            for entity_name, description in wiki_results.items():
-                if description:
-                    results[entity_name] = {"wikipedia": description}
-        elif search_type == "google":
-            from graphgen.models import GoogleSearch
-            from graphgen.operators.search.web.search_google import search_google
-
-            google_search_client = GoogleSearch(
-                subscription_key=os.environ["GOOGLE_SEARCH_API_KEY"],
-                cx=os.environ["GOOGLE_SEARCH_CX"],
-            )
-
-            google_results = await search_google(google_search_client, search_entities)
-            for entity_name, description in google_results.items():
-                if description:
-                    results[entity_name] = results.get(entity_name, {})
-                    results[entity_name]["google"] = description
-        elif search_type == "bing":
-            from graphgen.models import BingSearch
-            from graphgen.operators.search.web.search_bing import search_bing
-
-            bing_search_client = BingSearch(
-                subscription_key=os.environ["BING_SEARCH_API_KEY"]
+            data = list(seed_data.values())
+            data = [d["content"] for d in data if "content" in d]
+            data = list(set(data))  # Remove duplicates
+            uniprot_results = await run_concurrent(
+                uniprot_search_client.search,
+                data,
+                desc="Searching UniProt database",
+                unit="keyword",
             )
-
-            bing_results = await search_bing(bing_search_client, search_entities)
-            for entity_name, description in bing_results.items():
-                if description:
-                    results[entity_name] = results.get(entity_name, {})
-                    results[entity_name]["bing"] = description
-        elif search_type == "uniprot":
-            # from graphgen.models import UniProtSearch
-            # from graphgen.operators.search.db.search_uniprot import search_uniprot
-            #
-            # uniprot_search_client = UniProtSearch()
-            #
-            # uniprot_results = await search_uniprot(
-            #     uniprot_search_client, search_entities
-            # )
-            raise NotImplementedError(
-                "Processing of UniProt search results is not implemented yet."
-            )
         else:
-            logger.error("Search type %s is not supported yet.", search_type)
+            logger.error("Data source %s is not supported.", data_source)
             continue
+        results[data_source] = uniprot_results
+
     return results
diff --git a/graphgen/operators/search/web/__init__.py b/graphgen/operators/search/web/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/graphgen/operators/search/web/search_bing.py b/graphgen/operators/search/web/search_bing.py
deleted file mode 100644
index 69f65f7b..00000000
--- a/graphgen/operators/search/web/search_bing.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import trafilatura
-from tqdm.asyncio import tqdm_asyncio as tqdm_async
-
-from graphgen.models import BingSearch
-from graphgen.utils import logger
-
-
-async def _process_single_entity(
-    entity_name: str, bing_search_client: BingSearch
-) -> str | None:
-    """
-    Process single entity by searching Bing.
-    :param entity_name: The name of the entity to search.
-    :param bing_search_client: The Bing search client.
-    :return: Summary of the entity or None if not found.
-    """
-    search_results = bing_search_client.search(entity_name)
-    if not search_results:
-        return None
-
-    # Get more details from the first search result
-    first_result = search_results[0]
-    content = trafilatura.fetch_url(first_result["url"])
-    summary = trafilatura.extract(content, include_comments=False, include_links=False)
-    summary = summary.strip()
-    logger.info(
-        "Entity %s search result: %s",
-        entity_name,
-        summary,
-    )
-    return summary
-
-
-async def search_bing(
-    bing_search_client: BingSearch,
-    entities: set[str],
-) -> dict[str, str]:
-    """
-    Search with Bing and return the contexts.
-    :return:
-    """
-    bing_data = {}
-
-    async for entity in tqdm_async(
-        entities, desc="Searching Bing", total=len(entities)
-    ):
-        try:
-            summary = await _process_single_entity(entity, bing_search_client)
-            if summary:
-                bing_data[entity] = summary
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error("Error processing entity %s: %s", entity, str(e))
-    return bing_data
diff --git a/graphgen/operators/search/web/search_google.py b/graphgen/operators/search/web/search_google.py
deleted file mode 100644
index 803ce107..00000000
--- a/graphgen/operators/search/web/search_google.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import trafilatura
-from tqdm.asyncio import tqdm_asyncio as tqdm_async
-
-from graphgen.models import GoogleSearch
-from graphgen.utils import logger
-
-
-async def _process_single_entity(
-    entity_name: str, google_search_client: GoogleSearch
-) -> str | None:
-    search_results = google_search_client.search(entity_name)
-    if not search_results:
-        return None
-
-    # Get more details from the first search result
-    first_result = search_results[0]
-    content = trafilatura.fetch_url(first_result["link"])
-    summary = trafilatura.extract(content, include_comments=False, include_links=False)
-    summary = summary.strip()
-    logger.info(
-        "Entity %s search result: %s",
-        entity_name,
-        summary,
-    )
-    return summary
-
-
-async def search_google(
-    google_search_client: GoogleSearch,
-    entities: set[str],
-) -> dict:
-    """
-    Search with Google and return the contexts.
-    :param google_search_client: Google search client
-    :param entities: list of entities to search
-    :return:
-    """
-    google_data = {}
-
-    async for entity in tqdm_async(
-        entities, desc="Searching Google", total=len(entities)
-    ):
-        try:
-            summary = await _process_single_entity(entity, google_search_client)
-            if summary:
-                google_data[entity] = summary
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error("Error processing entity %s: %s", entity, str(e))
-    return google_data
diff --git a/graphgen/templates/search_judgement.py b/graphgen/templates/search_judgement.py
index e85b0097..19b21840 100644
--- a/graphgen/templates/search_judgement.py
+++ b/graphgen/templates/search_judgement.py
@@ -1,16 +1,16 @@
 # pylint: disable=C0301
 
 TEMPLATE: str = """-Goal-
-Please select the most relevant search result for the given entity.
-The name and description of the entity are provided. The search results are provided as a list.
-Please select the most relevant search result from the list. If none of the search results are relevant, please select 'None of the above'.
+Select the search result most relevant to the given entity.
+The entity's name and description are provided, followed by the search results as a list.
+Select the most relevant search result from the list. If none of the search results is relevant, select 'None of the above'.
 
 Steps:
 
 1. Read the name and description of the entity.
 
-2. Read the search results. For each search result, compare it with the entity name and description to determine if it is relevant.
+2. Read the search results. Compare each search result with the entity name and description to determine whether it is relevant.
 
-3. Select the most relevant search result from the list. If none of the search results are relevant, select 'None of the above'.
+3. Select the most relevant search result from the list. If none of the search results is relevant, select 'None of the above'.
 
 4. Output your selection directly, please do not provide any additional information.
 
diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py
index 31e0d23c..ac63f87b 100644
--- a/graphgen/utils/run_concurrent.py
+++ b/graphgen/utils/run_concurrent.py
@@ -10,77 +10,6 @@
 R = TypeVar("R")
 
 
-# async def run_concurrent(
-#     coro_fn: Callable[[T], Awaitable[R]],
-#     items: List[T],
-#     *,
-#     desc: str = "processing",
-#     unit: str = "item",
-#     progress_bar: Optional[gr.Progress] = None,
-# ) -> List[R]:
-#     tasks = [asyncio.create_task(coro_fn(it)) for it in items]
-#
-#     results = []
-#     async for future in tqdm_async(
-#         tasks, desc=desc, unit=unit
-#     ):
-#         try:
-#             result = await future
-#             results.append(result)
-#         except Exception as e:  # pylint: disable=broad-except
-#             logger.exception("Task failed: %s", e)
-#
-#         if progress_bar is not None:
-#             progress_bar((len(results)) / len(items), desc=desc)
-#
-#     if progress_bar is not None:
-#         progress_bar(1.0, desc=desc)
-#     return results
-
-#     results = await tqdm_async.gather(*tasks, desc=desc, unit=unit)
-#
-#     ok_results = []
-#     for idx, res in enumerate(results):
-#         if isinstance(res, Exception):
-#             logger.exception("Task failed: %s", res)
-#             if progress_bar:
-#                 progress_bar((idx + 1) / len(items), desc=desc)
-#             continue
-#         ok_results.append(res)
-#         if progress_bar:
-#             progress_bar((idx + 1) / len(items), desc=desc)
-#
-#     if progress_bar:
-#         progress_bar(1.0, desc=desc)
-#     return ok_results

-# async def run_concurrent(
-#     coro_fn: Callable[[T], Awaitable[R]],
-#     items: List[T],
-#     *,
-#     desc: str = "processing",
-#     unit: str = "item",
-#     progress_bar: Optional[gr.Progress] = None,
-# ) -> List[R]:
-#     tasks = [asyncio.create_task(coro_fn(it)) for it in items]
-#
-#     results = []
-#     # Update the progress bar synchronously to avoid async conflicts
-#     for i, task in enumerate(asyncio.as_completed(tasks)):
-#         try:
-#             result = await task
-#             results.append(result)
-#             # Update the progress bar synchronously
-#             if progress_bar is not None:
-#                 # Update progress in a synchronous context
-#                 progress_bar((i + 1) / len(items), desc=desc)
-#         except Exception as e:
-#             logger.exception("Task failed: %s", e)
-#             results.append(e)
-#
-#     return results
-
-
 async def run_concurrent(
     coro_fn: Callable[[T], Awaitable[R]],
     items: List[T],
diff --git a/resources/input_examples/search_demo.jsonl b/resources/input_examples/search_demo.jsonl
new file mode 100644
index 00000000..6409a805
--- /dev/null
+++ b/resources/input_examples/search_demo.jsonl
@@ -0,0 +1,5 @@
+{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}
+{"type": "text", "content": 
"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"type": "text", "content": "MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKTTVIALREIEEGLINNQILDVRERQEQQEQEAAELQAVTAIAEGRR"} +{"type": "text", "content": "GSHMLCAISGKVPRRPVLSPKSRTIFEKSLLEQYVKDTGNDPITNEPLSIEEIVEIVPSAQ"}