1+ import re
12from io import StringIO
23from typing import Dict , Optional
34
45from Bio import ExPASy , SeqIO , SwissProt , UniProt
56from Bio .Blast import NCBIWWW , NCBIXML
6-
7+ from requests .exceptions import RequestException
8+ from tenacity import (
9+ retry ,
10+ retry_if_exception_type ,
11+ stop_after_attempt ,
12+ wait_exponential ,
13+ )
14+
15+ from graphgen .bases import BaseSearcher
716from graphgen .utils import logger
817
918
10- class UniProtSearch :
19+ class UniProtSearch ( BaseSearcher ) :
1120 """
12- UniProt Search client to search with UniProt.
21+ UniProt Search client to search with UniProt.
1322 1) Get the protein by accession number.
14- 2) Search with keywords or protein names (fuzzy search ).
15- 3) Search with FASTA sequence (BLAST search ).
23+ 2) Search with keywords or protein names (fuzzy search).
24+ 3) Search with FASTA sequence (BLAST search).
1625 """
1726
1827 def get_by_accession (self , accession : str ) -> Optional [dict ]:
@@ -21,6 +30,8 @@ def get_by_accession(self, accession: str) -> Optional[dict]:
2130 record = SwissProt .read (handle )
2231 handle .close ()
2332 return self ._swissprot_to_dict (record )
33+ except RequestException : # network-related errors
34+ raise
2435 except Exception as exc : # pylint: disable=broad-except
2536 logger .error ("Accession %s not found: %s" , accession , exc )
2637 return None
@@ -51,7 +62,7 @@ def _swissprot_to_dict(record: SwissProt.Record) -> dict:
5162 def get_best_hit (self , keyword : str ) -> Optional [Dict ]:
5263 """
5364 Search UniProt with a keyword and return the best hit.
54- :param keyword: The search keyword.
65+ :param keyword: The search keyword.
5566 :return: A dictionary containing the best hit information or None if not found.
5667 """
5768 if not keyword .strip ():
@@ -64,15 +75,17 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]:
6475 return None
6576 return self .get_by_accession (hit ["primaryAccession" ])
6677
78+ except RequestException :
79+ raise
6780 except Exception as e : # pylint: disable=broad-except
6881 logger .error ("Keyword %s not found: %s" , keyword , e )
69- return None
82+ return None
7083
7184 def get_by_fasta (self , fasta_sequence : str , threshold : float ) -> Optional [Dict ]:
7285 """
7386 Search UniProt with a FASTA sequence and return the best hit.
7487 :param fasta_sequence: The FASTA sequence.
75- :param threshold: E-value threshold for BLAST search .
88+ :param threshold: E-value threshold for BLAST search.
7689 :return: A dictionary containing the best hit information or None if not found.
7790 """
7891 try :
@@ -90,6 +103,7 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
90103
91104 # UniProtKB/Swiss-Prot BLAST API
92105 try :
106+ logger .debug ("Performing BLAST searcher for the given sequence: %s" , seq )
93107 result_handle = NCBIWWW .qblast (
94108 program = "blastp" ,
95109 database = "swissprot" ,
@@ -98,8 +112,10 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
98112 expect = threshold ,
99113 )
100114 blast_record = NCBIXML .read (result_handle )
115+ except RequestException :
116+ raise
101117 except Exception as e : # pylint: disable=broad-except
102- logger .error ("BLAST search failed: %s" , e )
118+ logger .error ("BLAST searcher failed: %s" , e )
103119 return None
104120
105121 if not blast_record .alignments :
@@ -116,3 +132,44 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
116132 # like sp|P01308.1|INS_HUMAN
117133 accession = hit_id .split ("|" )[1 ].split ("." )[0 ] if "|" in hit_id else hit_id
118134 return self .get_by_accession (accession )
135+
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(RequestException),
    reraise=True,
)
async def search(
    self, query: str, threshold: float = 0.7, **kwargs
) -> Optional[Dict]:
    """
    Search UniProt with either an accession number, keyword, or FASTA sequence.

    The query type is auto-detected, in this order:
      1) FASTA / raw protein sequence -> ``get_by_fasta``
      2) UniProt accession number     -> ``get_by_accession``
      3) anything else                -> ``get_best_hit`` (keyword search)

    The call is retried (up to 5 attempts, exponential back-off) whenever a
    ``RequestException`` propagates out of the lookup; the last one is re-raised.

    :param query: The search query (accession number, keyword, or FASTA sequence).
    :param threshold: E-value threshold forwarded to the BLAST search.
    :param kwargs: Accepted for interface compatibility; not used here.
    :return: A dictionary containing the best hit information (with the
        stripped query stored under ``"_search_query"``) or None if not found.
    """
    # Reject empty or non-string input before touching any remote service.
    if not query or not isinstance(query, str):
        logger.error("Empty or non-string input.")
        return None
    query = query.strip()

    logger.debug("UniProt searcher query: %s", query)

    # Official UniProt accession format (see the UniProt help page on
    # accession numbers). The previous pattern, "[A-NR-Z0-9]{6,10}", left out
    # O/P/Q - so accessions such as P01308 were never recognised - and it
    # matched plain 6-10 letter words (e.g. "albumin"), which were then looked
    # up as accessions instead of being searched as keywords.
    accession_pattern = (
        r"[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
    )

    # A FASTA header, or a string made only of the 20 amino-acid letters
    # and whitespace, is treated as a protein sequence.
    # NOTE(review): keywords spelled only with amino-acid letters (e.g.
    # "actin", "kinase") also match this pattern and go to BLAST.
    if query.startswith(">") or re.fullmatch(
        r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
    ):
        result = self.get_by_fasta(query, threshold)

    # Accession numbers always contain digits, so they cannot collide with
    # the sequence pattern above.
    elif re.fullmatch(accession_pattern, query, re.I):
        result = self.get_by_accession(query)

    else:
        # Otherwise treat the query as a keyword.
        result = self.get_best_hit(query)

    if result:
        # Record which query produced this hit.
        result["_search_query"] = query
    return result
0 commit comments