44
55from Bio import ExPASy , SeqIO , SwissProt , UniProt
66from Bio .Blast import NCBIWWW , NCBIXML
7-
7+ from requests .exceptions import RequestException
8+ from tenacity import (
9+ retry ,
10+ retry_if_exception_type ,
11+ stop_after_attempt ,
12+ wait_exponential ,
13+ )
14+
15+ from graphgen .bases import BaseSearcher
816from graphgen .utils import logger
917
1018
11- class UniProtSearch :
19+ class UniProtSearch ( BaseSearcher ) :
1220 """
13- UniProt Search client to search with UniProt.
21+ UniProt Search client to search with UniProt.
1422 1) Get the protein by accession number.
15- 2) Search with keywords or protein names (fuzzy search ).
16- 3) Search with FASTA sequence (BLAST search ).
23+ 2) Search with keywords or protein names (fuzzy search ).
24+ 3) Search with FASTA sequence (BLAST search ).
1725 """
1826
1927 def get_by_accession (self , accession : str ) -> Optional [dict ]:
@@ -22,6 +30,8 @@ def get_by_accession(self, accession: str) -> Optional[dict]:
2230 record = SwissProt .read (handle )
2331 handle .close ()
2432 return self ._swissprot_to_dict (record )
33+ except RequestException : # network-related errors
34+ raise
2535 except Exception as exc : # pylint: disable=broad-except
2636 logger .error ("Accession %s not found: %s" , accession , exc )
2737 return None
@@ -52,7 +62,7 @@ def _swissprot_to_dict(record: SwissProt.Record) -> dict:
5262 def get_best_hit (self , keyword : str ) -> Optional [Dict ]:
5363 """
5464 Search UniProt with a keyword and return the best hit.
55- :param keyword: The search keyword.
65+ :param keyword: The search keyword.
5666 :return: A dictionary containing the best hit information or None if not found.
5767 """
5868 if not keyword .strip ():
@@ -65,15 +75,17 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]:
6575 return None
6676 return self .get_by_accession (hit ["primaryAccession" ])
6777
78+ except RequestException :
79+ raise
6880 except Exception as e : # pylint: disable=broad-except
6981 logger .error ("Keyword %s not found: %s" , keyword , e )
70- return None
82+ return None
7183
7284 def get_by_fasta (self , fasta_sequence : str , threshold : float ) -> Optional [Dict ]:
7385 """
7486 Search UniProt with a FASTA sequence and return the best hit.
7587 :param fasta_sequence: The FASTA sequence.
76- :param threshold: E-value threshold for BLAST search .
88+ :param threshold: E-value threshold for BLAST search .
7789 :return: A dictionary containing the best hit information or None if not found.
7890 """
7991 try :
@@ -91,6 +103,7 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
91103
92104 # UniProtKB/Swiss-Prot BLAST API
93105 try :
106+ logger .debug ("Performing BLAST searcher for the given sequence: %s" , seq )
94107 result_handle = NCBIWWW .qblast (
95108 program = "blastp" ,
96109 database = "swissprot" ,
@@ -99,8 +112,10 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
99112 expect = threshold ,
100113 )
101114 blast_record = NCBIXML .read (result_handle )
115+ except RequestException :
116+ raise
102117 except Exception as e : # pylint: disable=broad-except
103- logger .error ("BLAST search failed: %s" , e )
118+ logger .error ("BLAST searcher failed: %s" , e )
104119 return None
105120
106121 if not blast_record .alignments :
@@ -118,11 +133,19 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
118133 accession = hit_id .split ("|" )[1 ].split ("." )[0 ] if "|" in hit_id else hit_id
119134 return self .get_by_accession (accession )
120135
121- def get_any (self , query : str , threshold : float = 1e-5 ) -> Optional [Dict ]:
136+ @retry (
137+ stop = stop_after_attempt (5 ),
138+ wait = wait_exponential (multiplier = 1 , min = 4 , max = 10 ),
139+ retry = retry_if_exception_type (RequestException ),
140+ reraise = True ,
141+ )
142+ async def search (
143+ self , query : str , threshold : float = 0.7 , ** kwargs
144+ ) -> Optional [Dict ]:
122145 """
123146 Search UniProt with either an accession number, keyword, or FASTA sequence.
124- :param query: The search query (accession number, keyword, or FASTA sequence).
125- :param threshold: E-value threshold for BLAST search .
147+ :param query: The search query (accession number, keyword, or FASTA sequence).
148+ :param threshold: E-value threshold for BLAST search .
126149 :return: A dictionary containing the best hit information or None if not found.
127150 """
128151
@@ -132,15 +155,21 @@ def get_any(self, query: str, threshold: float = 1e-5) -> Optional[Dict]:
132155 return None
133156 query = query .strip ()
134157
158+ logger .debug ("UniProt searcher query: %s" , query )
135159 # check if fasta sequence
136160 if query .startswith (">" ) or re .fullmatch (
137161 r"[ACDEFGHIKLMNPQRSTVWY\s]+" , query , re .I
138162 ):
139- return self .get_by_fasta (query , threshold )
163+ result = self .get_by_fasta (query , threshold )
140164
141165 # check if accession number
142- if re .fullmatch (r"[A-NR-Z0-9]{6,10}" , query , re .I ):
143- return self .get_by_accession (query )
166+ elif re .fullmatch (r"[A-NR-Z0-9]{6,10}" , query , re .I ):
167+ result = self .get_by_accession (query )
168+
169+ else :
170+ # otherwise treat as keyword
171+ result = self .get_best_hit (query )
144172
145- # otherwise treat as keyword
146- return self .get_best_hit (query )
173+ if result :
174+ result ["_search_query" ] = query
175+ return result
0 commit comments