InternScience
diff --git a/‎graphgen/configs/search_dna_config.yaml‎
Lines changed: 15 additions & 0 deletions b/‎graphgen/configs/search_dna_config.yaml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎graphgen/configs/search_config.yaml‎ ‎…phgen/configs/search_protein_config.yaml‎graphgen/configs/search_config.yaml renamed to graphgen/configs/search_protein_config.yaml
Lines changed: 1 addition & 1 deletion b/‎graphgen/configs/search_config.yaml‎ ‎…phgen/configs/search_protein_config.yaml‎graphgen/configs/search_config.yaml renamed to graphgen/configs/search_protein_config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎graphgen/configs/search_rna_config.yaml‎
Lines changed: 14 additions & 0 deletions b/‎graphgen/configs/search_rna_config.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎graphgen/models/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎graphgen/models/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎graphgen/models/searcher/db/ncbi_searcher.py‎
Lines changed: 296 additions & 0 deletions b/‎graphgen/models/searcher/db/ncbi_searcher.py‎
Lines changed: 296 additions & 0 deletions
@@ -0,0 +1,15 @@
+pipeline:
+  - name: read_step
+    op_key: read
+    params:
+      input_file:  resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: search_step
+    op_key: search
+    deps: [read_step] # search_step depends on read_step
+    params:
+      data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
+      ncbi_params:
+        email: [email protected] # NCBI requires an email address
+        tool: GraphGen # tool name for NCBI API
+
@@ -2,7 +2,7 @@ pipeline:
   - name: read_step
     op_key: read
     params:
-      input_file:  resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      input_file:  resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 
   - name: search_step
     op_key: search
 
@@ -0,0 +1,14 @@
+pipeline:
+  - name: read_step
+    op_key: read
+    params:
+      input_file:  resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: search_step
+    op_key: search
+    deps: [read_step] # search_step depends on read_step
+    params:
+      data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
+      rnacentral_params:
+        {} # RNAcentral doesn't require additional parameters currently
+
@@ -26,6 +26,8 @@
     RDFReader,
     TXTReader,
 )
+from .searcher.db.ncbi_searcher import NCBISearch
+from .searcher.db.rnacentral_searcher import RNACentralSearch
 from .searcher.db.uniprot_searcher import UniProtSearch
 from .searcher.kg.wiki_search import WikiSearch
 from .searcher.web.bing_search import BingSearch
 
@@ -0,0 +1,296 @@
+import asyncio
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import lru_cache
+from http.client import IncompleteRead
+from typing import Dict, Optional
+
+from Bio import Entrez
+from requests.exceptions import RequestException
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from graphgen.bases import BaseSearcher
+from graphgen.utils import logger
+
+
+@lru_cache(maxsize=None)
+def _get_pool():
+    return ThreadPoolExecutor(max_workers=10)
+
+
+class NCBISearch(BaseSearcher):
+    """
+    NCBI Search client to search DNA/GenBank/Entrez databases.
+    1) Get the gene/DNA by accession number or gene ID.
+    2) Search with keywords or gene names (fuzzy search).
+    3) Search with FASTA sequence (BLAST search for DNA sequences).
+    
+    API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/
+    Note: NCBI has rate limits (max 3 requests per second), delays are required between requests.
+    """
+
+    def __init__(self, email: str = "[email protected]", tool: str = "GraphGen"):
+        super().__init__()
+        Entrez.email = email
+        Entrez.tool = tool
+        Entrez.timeout = 60  # 60 seconds timeout
+
+    def get_by_gene_id(self, gene_id: str) -> Optional[dict]:
+        """
+        Get gene information by Gene ID.
+        :param gene_id: NCBI Gene ID.
+        :return: A dictionary containing gene information or None if not found.
+        """
+        try:
+            time.sleep(0.35)  # Comply with rate limit (max 3 requests per second)
+            handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
+            try:
+                gene_record = Entrez.read(handle)
+                if not gene_record:
+                    return None
+                
+                gene_data = gene_record[0]
+                gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {})
+                
+                return {
+                    "molecule_type": "DNA",
+                    "database": "NCBI",
+                    "id": gene_id,
+                    "gene_name": gene_ref.get("Gene-ref_locus", "N/A"),
+                    "gene_description": gene_ref.get("Gene-ref_desc", "N/A"),
+                    "organism": gene_data.get("Entrezgene_source", {}).get("BioSource", {}).get("BioSource_org", {}).get("Org-ref", {}).get("Org-ref_taxname", "N/A"),
+                    "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
+                }
+            finally:
+                handle.close()
+        except RequestException:
+            raise
+        except Exception as exc:  # pylint: disable=broad-except
+            logger.error("Gene ID %s not found: %s", gene_id, exc)
+            return None
+
+    def get_by_accession(self, accession: str) -> Optional[dict]:
+        """
+        Get sequence information by accession number.
+        :param accession: NCBI accession number (e.g., NM_000546).
+        :return: A dictionary containing sequence information or None if not found.
+        """
+        try:
+            time.sleep(0.35)  # 遵守速率限制
+            handle = Entrez.efetch(
+                db="nuccore",
+                id=accession,
+                rettype="fasta",
+                retmode="text",
+            )
+            try:
+                sequence_data = handle.read()
+                if not sequence_data:
+                    return None
+                
+                seq_lines = sequence_data.strip().split("\n")
+                header = seq_lines[0] if seq_lines else ""
+                sequence = "".join(seq_lines[1:])
+                
+                # Try to get more information
+                time.sleep(0.35)
+                summary_handle = Entrez.esummary(db="nuccore", id=accession)
+                try:
+                    summary = Entrez.read(summary_handle)
+                    if summary:
+                        summary_data = summary[0]
+                        title = summary_data.get("Title", header)
+                        organism = summary_data.get("Organism", "N/A")
+                    else:
+                        title = header
+                        organism = "N/A"
+                finally:
+                    summary_handle.close()
+                
+                return {
+                    "molecule_type": "DNA",
+                    "database": "NCBI",
+                    "id": accession,
+                    "title": title,
+                    "organism": organism,
+                    "sequence": sequence,
+                    "sequence_length": len(sequence),
+                    "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}",
+                }
+            finally:
+                handle.close()
+        except RequestException:
+            raise
+        except Exception as exc:  # pylint: disable=broad-except
+            logger.error("Accession %s not found: %s", accession, exc)
+            return None
+
+    def search_by_keyword(self, keyword: str) -> Optional[dict]:
+        """
+        Search NCBI Gene database with a keyword and return the best hit.
+        :param keyword: The search keyword (e.g., gene name).
+        :return: A dictionary containing the best hit information or None if not found.
+        """
+        if not keyword.strip():
+            return None
+
+        try:
+            time.sleep(0.35)  # 遵守速率限制
+            # Search gene database
+            search_handle = Entrez.esearch(
+                db="gene",
+                term=f"{keyword}[Gene Name] OR {keyword}[All Fields]",
+                retmax=1,
+            )
+            try:
+                search_results = Entrez.read(search_handle)
+                if not search_results.get("IdList"):
+                    # If not found, try a broader search
+                    time.sleep(0.35)
+                    search_handle2 = Entrez.esearch(
+                        db="gene",
+                        term=keyword,
+                        retmax=1,
+                    )
+                    try:
+                        search_results = Entrez.read(search_handle2)
+                    finally:
+                        search_handle2.close()
+                
+                if search_results.get("IdList"):
+                    gene_id = search_results["IdList"][0]
+                    return self.get_by_gene_id(gene_id)
+            finally:
+                search_handle.close()
+        except RequestException:
+            raise
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Keyword %s not found: %s", keyword, e)
+        return None
+
+    def search_by_sequence(self, sequence: str) -> Optional[dict]:
+        """
+        Search NCBI with a DNA sequence using BLAST.
+        Note: This is a simplified version. For production, consider using local BLAST.
+        :param sequence: DNA sequence (FASTA format or raw sequence).
+        :return: A dictionary containing the best hit information or None if not found.
+        """
+        try:
+            # Extract sequence (if in FASTA format)
+            if sequence.startswith(">"):
+                seq_lines = sequence.strip().split("\n")
+                seq = "".join(seq_lines[1:])
+            else:
+                seq = sequence.strip().replace(" ", "").replace("\n", "")
+            
+            # Validate if it's a DNA sequence
+            if not re.fullmatch(r"[ATCGN\s]+", seq, re.I):
+                logger.error("Invalid DNA sequence provided.")
+                return None
+            
+            if not seq:
+                logger.error("Empty DNA sequence provided.")
+                return None
+            
+            # Use BLAST search (Note: requires network connection, may be slow)
+            logger.debug("Performing BLAST search for DNA sequence...")
+            time.sleep(0.35)
+            from Bio.Blast import NCBIWWW, NCBIXML
+            
+            result_handle = NCBIWWW.qblast(
+                program="blastn",
+                database="nr",
+                sequence=seq,
+                hitlist_size=1,
+                expect=0.001,
+            )
+            blast_record = NCBIXML.read(result_handle)
+            
+            if not blast_record.alignments:
+                logger.info("No BLAST hits found for the given sequence.")
+                return None
+            
+            best_alignment = blast_record.alignments[0]
+            best_hsp = best_alignment.hsps[0]
+            hit_id = best_alignment.hit_id
+            
+            # Extract accession number
+            # Format may be: gi|123456|ref|NM_000546.5|
+            accession_match = re.search(r"ref\|([^|]+)", hit_id)
+            if accession_match:
+                accession = accession_match.group(1).split(".")[0]
+                return self.get_by_accession(accession)
+            else:
+                # If unable to extract accession, return basic information
+                return {
+                    "molecule_type": "DNA",
+                    "database": "NCBI",
+                    "id": hit_id,
+                    "title": best_alignment.title,
+                    "sequence_length": len(seq),
+                    "e_value": best_hsp.expect,
+                    "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0,
+                    "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}",
+                }
+        except RequestException:
+            raise
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("BLAST search failed: %s", e)
+            return None
+
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type((RequestException, IncompleteRead)),
+        reraise=True,
+    )
+    async def search(
+        self, query: str, **kwargs
+    ) -> Optional[Dict]:
+        """
+        Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.
+        :param query: The search query (gene ID, accession, keyword, or DNA sequence).
+        :param kwargs: Additional keyword arguments (not used currently).
+        :return: A dictionary containing the search results or None if not found.
+        """
+        # auto detect query type
+        if not query or not isinstance(query, str):
+            logger.error("Empty or non-string input.")
+            return None
+        query = query.strip()
+
+        logger.debug("NCBI search query: %s", query)
+
+        loop = asyncio.get_running_loop()
+
+        # check if DNA sequence (ATCG characters)
+        if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
+            result = await loop.run_in_executor(
+                _get_pool(), self.search_by_sequence, query
+            )
+        # check if gene ID (numeric)
+        elif re.fullmatch(r"^\d+$", query):
+            result = await loop.run_in_executor(
+                _get_pool(), self.get_by_gene_id, query
+            )
+        # check if accession number (e.g., NM_000546, NC_000001)
+        elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
+            result = await loop.run_in_executor(
+                _get_pool(), self.get_by_accession, query
+            )
+        else:
+            # otherwise treat as keyword
+            result = await loop.run_in_executor(
+                _get_pool(), self.search_by_keyword, query
+            )
+
+        if result:
+            result["_search_query"] = query
+        return result
+
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,8 @@`
`26`	`26`	`RDFReader,`
`27`	`27`	`TXTReader,`
`28`	`28`	`)`
	`29`	`+from .searcher.db.ncbi_searcher import NCBISearch`
	`30`	`+from .searcher.db.rnacentral_searcher import RNACentralSearch`
`29`	`31`	`from .searcher.db.uniprot_searcher import UniProtSearch`
`30`	`32`	`from .searcher.kg.wiki_search import WikiSearch`
`31`	`33`	`from .searcher.web.bing_search import BingSearch`