Skip to content

Commit 0da7ce1

Browse files
committed
feat: Add DNA and RNA search functionality
- Add NCBISearch searcher for DNA/GenBank/Entrez database search - Add RNACentralSearch searcher for RNA database search - Update search_all.py to support ncbi and rnacentral data sources - Add search configs for DNA, RNA, and protein (renamed from search_config) - Add search scripts for DNA and RNA - Add demo input files for DNA and RNA search - Update search_uniprot.sh to use search_protein_config.yaml This PR extends the search functionality to support biological data types: - Protein search (existing, now explicitly named) - DNA search via NCBI - RNA search via RNAcentral
1 parent 48d44c5 commit 0da7ce1

File tree

12 files changed

+568
-7
lines changed

12 files changed

+568
-7
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
pipeline:
2+
- name: read_step
3+
op_key: read
4+
params:
5+
input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6+
7+
- name: search_step
8+
op_key: search
9+
deps: [read_step] # search_step depends on read_step
10+
params:
11+
data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12+
ncbi_params:
13+
email: [email protected] # NCBI requires an email address
14+
tool: GraphGen # tool name for NCBI API
15+

graphgen/configs/search_config.yaml renamed to graphgen/configs/search_protein_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ pipeline:
22
- name: read_step
33
op_key: read
44
params:
5-
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
66

77
- name: search_step
88
op_key: search
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
pipeline:
2+
- name: read_step
3+
op_key: read
4+
params:
5+
input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6+
7+
- name: search_step
8+
op_key: search
9+
deps: [read_step] # search_step depends on read_step
10+
params:
11+
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12+
rnacentral_params:
13+
{} # RNAcentral doesn't require additional parameters currently
14+

graphgen/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
RDFReader,
2727
TXTReader,
2828
)
29+
from .searcher.db.ncbi_searcher import NCBISearch
30+
from .searcher.db.rnacentral_searcher import RNACentralSearch
2931
from .searcher.db.uniprot_searcher import UniProtSearch
3032
from .searcher.kg.wiki_search import WikiSearch
3133
from .searcher.web.bing_search import BingSearch
Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
import asyncio
2+
import re
3+
import time
4+
from concurrent.futures import ThreadPoolExecutor
5+
from functools import lru_cache
6+
from http.client import IncompleteRead
7+
from typing import Dict, Optional
8+
9+
from Bio import Entrez
10+
from requests.exceptions import RequestException
11+
from tenacity import (
12+
retry,
13+
retry_if_exception_type,
14+
stop_after_attempt,
15+
wait_exponential,
16+
)
17+
18+
from graphgen.bases import BaseSearcher
19+
from graphgen.utils import logger
20+
21+
22+
@lru_cache(maxsize=None)
23+
def _get_pool():
24+
return ThreadPoolExecutor(max_workers=10)
25+
26+
27+
class NCBISearch(BaseSearcher):
28+
"""
29+
NCBI Search client to search DNA/GenBank/Entrez databases.
30+
1) Get the gene/DNA by accession number or gene ID.
31+
2) Search with keywords or gene names (fuzzy search).
32+
3) Search with FASTA sequence (BLAST search for DNA sequences).
33+
34+
API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/
35+
Note: NCBI has rate limits (max 3 requests per second), delays are required between requests.
36+
"""
37+
38+
def __init__(self, email: str = "[email protected]", tool: str = "GraphGen"):
39+
super().__init__()
40+
Entrez.email = email
41+
Entrez.tool = tool
42+
Entrez.timeout = 60 # 60 seconds timeout
43+
44+
def get_by_gene_id(self, gene_id: str) -> Optional[dict]:
45+
"""
46+
Get gene information by Gene ID.
47+
:param gene_id: NCBI Gene ID.
48+
:return: A dictionary containing gene information or None if not found.
49+
"""
50+
try:
51+
time.sleep(0.35) # Comply with rate limit (max 3 requests per second)
52+
handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
53+
try:
54+
gene_record = Entrez.read(handle)
55+
if not gene_record:
56+
return None
57+
58+
gene_data = gene_record[0]
59+
gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {})
60+
61+
return {
62+
"molecule_type": "DNA",
63+
"database": "NCBI",
64+
"id": gene_id,
65+
"gene_name": gene_ref.get("Gene-ref_locus", "N/A"),
66+
"gene_description": gene_ref.get("Gene-ref_desc", "N/A"),
67+
"organism": gene_data.get("Entrezgene_source", {}).get("BioSource", {}).get("BioSource_org", {}).get("Org-ref", {}).get("Org-ref_taxname", "N/A"),
68+
"url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
69+
}
70+
finally:
71+
handle.close()
72+
except RequestException:
73+
raise
74+
except Exception as exc: # pylint: disable=broad-except
75+
logger.error("Gene ID %s not found: %s", gene_id, exc)
76+
return None
77+
78+
def get_by_accession(self, accession: str) -> Optional[dict]:
79+
"""
80+
Get sequence information by accession number.
81+
:param accession: NCBI accession number (e.g., NM_000546).
82+
:return: A dictionary containing sequence information or None if not found.
83+
"""
84+
try:
85+
time.sleep(0.35) # 遵守速率限制
86+
handle = Entrez.efetch(
87+
db="nuccore",
88+
id=accession,
89+
rettype="fasta",
90+
retmode="text",
91+
)
92+
try:
93+
sequence_data = handle.read()
94+
if not sequence_data:
95+
return None
96+
97+
seq_lines = sequence_data.strip().split("\n")
98+
header = seq_lines[0] if seq_lines else ""
99+
sequence = "".join(seq_lines[1:])
100+
101+
# Try to get more information
102+
time.sleep(0.35)
103+
summary_handle = Entrez.esummary(db="nuccore", id=accession)
104+
try:
105+
summary = Entrez.read(summary_handle)
106+
if summary:
107+
summary_data = summary[0]
108+
title = summary_data.get("Title", header)
109+
organism = summary_data.get("Organism", "N/A")
110+
else:
111+
title = header
112+
organism = "N/A"
113+
finally:
114+
summary_handle.close()
115+
116+
return {
117+
"molecule_type": "DNA",
118+
"database": "NCBI",
119+
"id": accession,
120+
"title": title,
121+
"organism": organism,
122+
"sequence": sequence,
123+
"sequence_length": len(sequence),
124+
"url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}",
125+
}
126+
finally:
127+
handle.close()
128+
except RequestException:
129+
raise
130+
except Exception as exc: # pylint: disable=broad-except
131+
logger.error("Accession %s not found: %s", accession, exc)
132+
return None
133+
134+
def search_by_keyword(self, keyword: str) -> Optional[dict]:
135+
"""
136+
Search NCBI Gene database with a keyword and return the best hit.
137+
:param keyword: The search keyword (e.g., gene name).
138+
:return: A dictionary containing the best hit information or None if not found.
139+
"""
140+
if not keyword.strip():
141+
return None
142+
143+
try:
144+
time.sleep(0.35) # 遵守速率限制
145+
# Search gene database
146+
search_handle = Entrez.esearch(
147+
db="gene",
148+
term=f"{keyword}[Gene Name] OR {keyword}[All Fields]",
149+
retmax=1,
150+
)
151+
try:
152+
search_results = Entrez.read(search_handle)
153+
if not search_results.get("IdList"):
154+
# If not found, try a broader search
155+
time.sleep(0.35)
156+
search_handle2 = Entrez.esearch(
157+
db="gene",
158+
term=keyword,
159+
retmax=1,
160+
)
161+
try:
162+
search_results = Entrez.read(search_handle2)
163+
finally:
164+
search_handle2.close()
165+
166+
if search_results.get("IdList"):
167+
gene_id = search_results["IdList"][0]
168+
return self.get_by_gene_id(gene_id)
169+
finally:
170+
search_handle.close()
171+
except RequestException:
172+
raise
173+
except Exception as e: # pylint: disable=broad-except
174+
logger.error("Keyword %s not found: %s", keyword, e)
175+
return None
176+
177+
def search_by_sequence(self, sequence: str) -> Optional[dict]:
178+
"""
179+
Search NCBI with a DNA sequence using BLAST.
180+
Note: This is a simplified version. For production, consider using local BLAST.
181+
:param sequence: DNA sequence (FASTA format or raw sequence).
182+
:return: A dictionary containing the best hit information or None if not found.
183+
"""
184+
try:
185+
# Extract sequence (if in FASTA format)
186+
if sequence.startswith(">"):
187+
seq_lines = sequence.strip().split("\n")
188+
seq = "".join(seq_lines[1:])
189+
else:
190+
seq = sequence.strip().replace(" ", "").replace("\n", "")
191+
192+
# Validate if it's a DNA sequence
193+
if not re.fullmatch(r"[ATCGN\s]+", seq, re.I):
194+
logger.error("Invalid DNA sequence provided.")
195+
return None
196+
197+
if not seq:
198+
logger.error("Empty DNA sequence provided.")
199+
return None
200+
201+
# Use BLAST search (Note: requires network connection, may be slow)
202+
logger.debug("Performing BLAST search for DNA sequence...")
203+
time.sleep(0.35)
204+
from Bio.Blast import NCBIWWW, NCBIXML
205+
206+
result_handle = NCBIWWW.qblast(
207+
program="blastn",
208+
database="nr",
209+
sequence=seq,
210+
hitlist_size=1,
211+
expect=0.001,
212+
)
213+
blast_record = NCBIXML.read(result_handle)
214+
215+
if not blast_record.alignments:
216+
logger.info("No BLAST hits found for the given sequence.")
217+
return None
218+
219+
best_alignment = blast_record.alignments[0]
220+
best_hsp = best_alignment.hsps[0]
221+
hit_id = best_alignment.hit_id
222+
223+
# Extract accession number
224+
# Format may be: gi|123456|ref|NM_000546.5|
225+
accession_match = re.search(r"ref\|([^|]+)", hit_id)
226+
if accession_match:
227+
accession = accession_match.group(1).split(".")[0]
228+
return self.get_by_accession(accession)
229+
else:
230+
# If unable to extract accession, return basic information
231+
return {
232+
"molecule_type": "DNA",
233+
"database": "NCBI",
234+
"id": hit_id,
235+
"title": best_alignment.title,
236+
"sequence_length": len(seq),
237+
"e_value": best_hsp.expect,
238+
"identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0,
239+
"url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}",
240+
}
241+
except RequestException:
242+
raise
243+
except Exception as e: # pylint: disable=broad-except
244+
logger.error("BLAST search failed: %s", e)
245+
return None
246+
247+
@retry(
248+
stop=stop_after_attempt(5),
249+
wait=wait_exponential(multiplier=1, min=4, max=10),
250+
retry=retry_if_exception_type((RequestException, IncompleteRead)),
251+
reraise=True,
252+
)
253+
async def search(
254+
self, query: str, **kwargs
255+
) -> Optional[Dict]:
256+
"""
257+
Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.
258+
:param query: The search query (gene ID, accession, keyword, or DNA sequence).
259+
:param kwargs: Additional keyword arguments (not used currently).
260+
:return: A dictionary containing the search results or None if not found.
261+
"""
262+
# auto detect query type
263+
if not query or not isinstance(query, str):
264+
logger.error("Empty or non-string input.")
265+
return None
266+
query = query.strip()
267+
268+
logger.debug("NCBI search query: %s", query)
269+
270+
loop = asyncio.get_running_loop()
271+
272+
# check if DNA sequence (ATCG characters)
273+
if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
274+
result = await loop.run_in_executor(
275+
_get_pool(), self.search_by_sequence, query
276+
)
277+
# check if gene ID (numeric)
278+
elif re.fullmatch(r"^\d+$", query):
279+
result = await loop.run_in_executor(
280+
_get_pool(), self.get_by_gene_id, query
281+
)
282+
# check if accession number (e.g., NM_000546, NC_000001)
283+
elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
284+
result = await loop.run_in_executor(
285+
_get_pool(), self.get_by_accession, query
286+
)
287+
else:
288+
# otherwise treat as keyword
289+
result = await loop.run_in_executor(
290+
_get_pool(), self.search_by_keyword, query
291+
)
292+
293+
if result:
294+
result["_search_query"] = query
295+
return result
296+

0 commit comments

Comments
 (0)