From bb42b7f1de8db192bdbd101d4a35e38b77db790e Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Wed, 26 Nov 2025 11:37:59 +0800 Subject: [PATCH] workflow: add search_uniprot example --- graphgen/configs/search_config.yaml | 2 +- .../models/searcher/db/uniprot_searcher.py | 23 +++++++++++-------- scripts/search/search_uniprot.sh | 3 +++ 3 files changed, 17 insertions(+), 11 deletions(-) create mode 100644 scripts/search/search_uniprot.sh diff --git a/graphgen/configs/search_config.yaml b/graphgen/configs/search_config.yaml index 37e65818..ff110786 100644 --- a/graphgen/configs/search_config.yaml +++ b/graphgen/configs/search_config.yaml @@ -1,7 +1,7 @@ pipeline: - name: read params: - input_file: resources/input_examples/search_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples - name: search params: diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index a74b623e..f5542f8c 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -27,12 +27,16 @@ def _get_pool(): return ThreadPoolExecutor(max_workers=10) +# ensure only one BLAST searcher at a time +_blast_lock = asyncio.Lock() + + class UniProtSearch(BaseSearcher): """ UniProt Search client to searcher with UniProt. 1) Get the protein by accession number. 2) Search with keywords or protein names (fuzzy searcher). - 3) Search with FASTA sequence (BLAST searcher). + 3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async. """ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"): @@ -230,22 +234,21 @@ async def search( if query.startswith(">") or re.fullmatch( r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I ): - coro = loop.run_in_executor( - _get_pool(), self.get_by_fasta, query, threshold - ) + async with _blast_lock: + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) # check if accession number elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I): - coro = loop.run_in_executor(_get_pool(), self.get_by_accession, query) + result = await loop.run_in_executor( + _get_pool(), self.get_by_accession, query + ) else: # otherwise treat as keyword - coro = loop.run_in_executor(_get_pool(), self.get_best_hit, query) + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) - result = await coro if result: result["_search_query"] = query return result - - -# TODO: use local UniProt database for large-scale searchs diff --git a/scripts/search/search_uniprot.sh b/scripts/search/search_uniprot.sh new file mode 100644 index 00000000..642040af --- /dev/null +++ b/scripts/search/search_uniprot.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_config.yaml \ +--output_dir cache/