@@ -41,6 +41,38 @@ def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"):
4141 Entrez .tool = tool
4242 Entrez .timeout = 60 # 60 seconds timeout
4343
@staticmethod
def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
    """
    Convert an Entrez gene record to a dictionary.

    Only the first element of ``gene_record`` is inspected. Any nested
    field that is missing, or whose parent is not a mapping, is reported
    as "N/A" instead of raising.

    :param gene_record: The Entrez gene record (list from Entrez.read).
    :param gene_id: The gene ID.
    :return: A dictionary containing gene information.
    :raises ValueError: If ``gene_record`` is empty.
    """
    if not gene_record:
        raise ValueError("Empty gene record")

    def _child(node, key) -> dict:
        """Return ``node[key]`` when it is a mapping, otherwise an empty dict."""
        value = node.get(key) if isinstance(node, dict) else None
        # A key that exists but holds None (or any non-mapping) must not
        # break the lookup chain; ``.get(key, {})`` alone only covers the
        # "key absent" case.
        return value if isinstance(value, dict) else {}

    gene_data = gene_record[0]
    gene_ref = _child(_child(gene_data, "Entrezgene_gene"), "Gene-ref")

    # Walk Entrezgene_source -> BioSource -> BioSource_org -> Org-ref.
    org_ref = gene_data
    for key in ("Entrezgene_source", "BioSource", "BioSource_org", "Org-ref"):
        org_ref = _child(org_ref, key)
    organism = org_ref.get("Org-ref_taxname", "N/A")

    return {
        "molecule_type": "DNA",
        "database": "NCBI",
        "id": gene_id,
        "gene_name": gene_ref.get("Gene-ref_locus", "N/A"),
        "gene_description": gene_ref.get("Gene-ref_desc", "N/A"),
        "organism": organism,
        # No padding around the placeholder: the URL must not end in a space.
        "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
    }
75+
4476 def get_by_gene_id (self , gene_id : str ) -> Optional [dict ]:
4577 """
4678 Get gene information by Gene ID.
@@ -54,26 +86,7 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]:
5486 gene_record = Entrez .read (handle )
5587 if not gene_record :
5688 return None
57-
58- gene_data = gene_record [0 ]
59- gene_ref = gene_data .get ("Entrezgene_gene" , {}).get ("Gene-ref" , {})
60-
61- organism = (
62- gene_data .get ("Entrezgene_source" , {})
63- .get ("BioSource" , {})
64- .get ("BioSource_org" , {})
65- .get ("Org-ref" , {})
66- .get ("Org-ref_taxname" , "N/A" )
67- )
68- return {
69- "molecule_type" : "DNA" ,
70- "database" : "NCBI" ,
71- "id" : gene_id ,
72- "gene_name" : gene_ref .get ("Gene-ref_locus" , "N/A" ),
73- "gene_description" : gene_ref .get ("Gene-ref_desc" , "N/A" ),
74- "organism" : organism ,
75- "url" : f"https://www.ncbi.nlm.nih.gov/gene/{ gene_id } " ,
76- }
89+ return self ._gene_record_to_dict (gene_record , gene_id )
7790 finally :
7891 handle .close ()
7992 except RequestException :
@@ -82,14 +95,36 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]:
8295 logger .error ("Gene ID %s not found: %s" , gene_id , exc )
8396 return None
8497
@staticmethod
def _accession_to_dict(accession: str, sequence: str, header: str, title: str, organism: str) -> dict:
    """
    Convert accession information to a dictionary.

    :param accession: NCBI accession number.
    :param sequence: DNA sequence.
    :param header: FASTA header. Accepted for interface compatibility;
        it is not included in the returned dictionary.
    :param title: Sequence title.
    :param organism: Organism name.
    :return: A dictionary containing sequence information, including the
        sequence length and the nuccore URL for the accession.
    """
    return {
        "molecule_type": "DNA",
        "database": "NCBI",
        "id": accession,
        "title": title,
        "organism": organism,
        "sequence": sequence,
        "sequence_length": len(sequence),
        # No padding around the placeholder: the URL must not end in a space.
        "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}",
    }
119+
85120 def get_by_accession (self , accession : str ) -> Optional [dict ]:
86121 """
87122 Get sequence information by accession number.
88123 :param accession: NCBI accession number (e.g., NM_000546).
89124 :return: A dictionary containing sequence information or None if not found.
90125 """
91126 try :
92- time .sleep (0.35 ) # 遵守速率限制
127+ time .sleep (0.35 ) # Comply with rate limit
93128 handle = Entrez .efetch (
94129 db = "nuccore" ,
95130 id = accession ,
@@ -120,16 +155,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]:
120155 finally :
121156 summary_handle .close ()
122157
123- return {
124- "molecule_type" : "DNA" ,
125- "database" : "NCBI" ,
126- "id" : accession ,
127- "title" : title ,
128- "organism" : organism ,
129- "sequence" : sequence ,
130- "sequence_length" : len (sequence ),
131- "url" : f"https://www.ncbi.nlm.nih.gov/nuccore/{ accession } " ,
132- }
158+ return self ._accession_to_dict (accession , sequence , header , title , organism )
133159 finally :
134160 handle .close ()
135161 except RequestException :
@@ -138,7 +164,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]:
138164 logger .error ("Accession %s not found: %s" , accession , exc )
139165 return None
140166
141- def search_by_keyword (self , keyword : str ) -> Optional [dict ]:
167+ def get_best_hit (self , keyword : str ) -> Optional [dict ]:
142168 """
143169 Search NCBI Gene database with a keyword and return the best hit.
144170 :param keyword: The search keyword (e.g., gene name).
@@ -148,7 +174,7 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]:
148174 return None
149175
150176 try :
151- time .sleep (0.35 ) # 遵守速率限制
177+ time .sleep (0.35 ) # Comply with rate limit
152178 # Search gene database
153179 search_handle = Entrez .esearch (
154180 db = "gene" ,
@@ -181,11 +207,12 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]:
181207 logger .error ("Keyword %s not found: %s" , keyword , e )
182208 return None
183209
184- def search_by_sequence (self , sequence : str ) -> Optional [dict ]:
210+ def search_by_sequence (self , sequence : str , threshold : float = 0.01 ) -> Optional [dict ]:
185211 """
186212 Search NCBI with a DNA sequence using BLAST.
187213 Note: This is a simplified version. For production, consider using local BLAST.
188214 :param sequence: DNA sequence (FASTA format or raw sequence).
215+ :param threshold: E-value threshold for BLAST search.
189216 :return: A dictionary containing the best hit information or None if not found.
190217 """
191218 try :
@@ -215,7 +242,7 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]:
215242 database = "nr" ,
216243 sequence = seq ,
217244 hitlist_size = 1 ,
218- expect = 0.001 ,
245+ expect = threshold ,
219246 )
220247 blast_record = NCBIXML .read (result_handle )
221248
@@ -225,6 +252,9 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]:
225252
226253 best_alignment = blast_record .alignments [0 ]
227254 best_hsp = best_alignment .hsps [0 ]
255+ if best_hsp .expect > threshold :
256+ logger .info ("No BLAST hits below the threshold E-value." )
257+ return None
228258 hit_id = best_alignment .hit_id
229259
230260 # Extract accession number
@@ -257,11 +287,12 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]:
257287 reraise = True ,
258288 )
259289 async def search (
260- self , query : str , ** kwargs
290+ self , query : str , threshold : float = 0.01 , ** kwargs
261291 ) -> Optional [Dict ]:
262292 """
263293 Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.
264294 :param query: The search query (gene ID, accession, keyword, or DNA sequence).
295+ :param threshold: E-value threshold for BLAST search.
265296 :param kwargs: Additional keyword arguments (not used currently).
266297 :return: A dictionary containing the search results or None if not found.
267298 """
@@ -278,7 +309,7 @@ async def search(
278309 # check if DNA sequence (ATCG characters)
279310 if query .startswith (">" ) or re .fullmatch (r"[ATCGN\s]+" , query , re .I ):
280311 result = await loop .run_in_executor (
281- _get_pool (), self .search_by_sequence , query
312+ _get_pool (), self .search_by_sequence , query , threshold
282313 )
283314 # check if gene ID (numeric)
284315 elif re .fullmatch (r"^\d+$" , query ):
@@ -293,7 +324,7 @@ async def search(
293324 else :
294325 # otherwise treat as keyword
295326 result = await loop .run_in_executor (
296- _get_pool (), self .search_by_keyword , query
327+ _get_pool (), self .get_best_hit , query
297328 )
298329
299330 if result :
0 commit comments