diff --git a/.gitignore b/.gitignore
index 678cdc50..b654d301 100644
--- a/.gitignore
+++ b/.gitignore
@@ -177,3 +177,7 @@ cache
*.pyc
*.html
.gradio
+
+# macOS
+.DS_Store
+**/.DS_Store
diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml
index 5245ea0c..f53a5eb8 100644
--- a/graphgen/configs/search_dna_config.yaml
+++ b/graphgen/configs/search_dna_config.yaml
@@ -13,5 +13,5 @@ pipeline:
email: test@example.com # NCBI requires an email address
tool: GraphGen # tool name for NCBI API
use_local_blast: true # whether to use local blast for DNA search
- local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)
+ local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)
diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml
index dae62ec2..10422988 100644
--- a/graphgen/configs/search_rna_config.yaml
+++ b/graphgen/configs/search_rna_config.yaml
@@ -11,6 +11,4 @@ pipeline:
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
rnacentral_params:
use_local_blast: true # whether to use local blast for RNA search
- local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
- # can also use DNA database with RNA sequences (if already built)
-
+ local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index 0de8ecc0..f453c700 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -83,6 +83,29 @@ def _nested_get(data: dict, *keys, default=None):
data = data.get(key, default)
return data
+ @staticmethod
+ def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
+ """Infer molecule_type_detail from accession prefix or gene type."""
+ if accession:
+ if accession.startswith(("NM_", "XM_")):
+ return "mRNA"
+ if accession.startswith(("NC_", "NT_")):
+ return "genomic DNA"
+ if accession.startswith(("NR_", "XR_")):
+ return "RNA"
+ if accession.startswith("NG_"):
+ return "genomic region"
+ # Fallback: infer from gene type if available
+ if gene_type is not None:
+ gene_type_map = {
+ 3: "rRNA",
+ 4: "tRNA",
+ 5: "snRNA",
+ 6: "ncRNA",
+ }
+ return gene_type_map.get(gene_type)
+ return None
+
def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"""
Convert an Entrez gene record to a dictionary.
@@ -120,7 +143,7 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
else None
)
- # Extract representative accession
+ # Extract representative accession (prefer type 3 = mRNA/transcript)
representative_accession = next(
(
product.get("Gene-commentary_accession")
@@ -129,6 +152,17 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
),
None,
)
+ # Fallback: if no type 3 accession, try any available accession
+ # This is needed for genes that don't have mRNA transcripts but have other sequence records
+ if not representative_accession:
+ representative_accession = next(
+ (
+ product.get("Gene-commentary_accession")
+ for product in locus.get("Gene-commentary_products", [])
+ if product.get("Gene-commentary_accession")
+ ),
+ None,
+ )
# Extract function
function = data.get("Entrezgene_summary") or next(
@@ -169,18 +203,19 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"sequence": None,
"sequence_length": None,
"gene_id": gene_id,
- "molecule_type_detail": None,
+ "molecule_type_detail": self._infer_molecule_type_detail(
+ representative_accession, data.get("Entrezgene_type")
+ ),
"_representative_accession": representative_accession,
}
def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
"""Get gene information by Gene ID."""
- def _extract_from_genbank(result: dict, accession: str):
- """Enrich result dictionary with sequence and summary information from accession."""
+ def _extract_metadata_from_genbank(result: dict, accession: str):
+ """Extract metadata from GenBank format (title, features, organism, etc.)."""
with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
record = SeqIO.read(handle, "genbank")
- result["sequence"] = str(record.seq)
- result["sequence_length"] = len(record.seq)
+
result["title"] = record.description
result["molecule_type_detail"] = (
"mRNA" if accession.startswith(("NM_", "XM_")) else
@@ -206,6 +241,22 @@ def _extract_from_genbank(result: dict, accession: str):
return result
+ def _extract_sequence_from_fasta(result: dict, accession: str):
+ """Extract sequence from FASTA format (more reliable than GenBank for CON-type records)."""
+ try:
+ with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle:
+ fasta_record = SeqIO.read(fasta_handle, "fasta")
+ result["sequence"] = str(fasta_record.seq)
+ result["sequence_length"] = len(fasta_record.seq)
+ except Exception as fasta_exc:
+ logger.warning(
+ "Failed to extract sequence from accession %s using FASTA format: %s",
+ accession, fasta_exc
+ )
+ result["sequence"] = None
+ result["sequence_length"] = None
+ return result
+
try:
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
gene_record = Entrez.read(handle)
@@ -214,7 +265,8 @@ def _extract_from_genbank(result: dict, accession: str):
result = self._gene_record_to_dict(gene_record, gene_id)
if accession := (preferred_accession or result.get("_representative_accession")):
- result = _extract_from_genbank(result, accession)
+ result = _extract_metadata_from_genbank(result, accession)
+ result = _extract_sequence_from_fasta(result, accession)
result.pop("_representative_accession", None)
return result
diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl
index 346b65f0..f423e1c1 100644
--- a/resources/input_examples/search_dna_demo.jsonl
+++ b/resources/input_examples/search_dna_demo.jsonl
@@ -1,9 +1,4 @@
-{"type": "text", "content": "TP53"}
-{"type": "text", "content": "BRCA1"}
-{"type": "text", "content": "672"}
-{"type": "text", "content": "11998"}
-{"type": "text", "content": "NM_000546"}
-{"type": "text", "content": "NM_024140"}
-{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
-{"type": "text", "content": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
-
+{"type": "text", "content": "NG_033923"}
+{"type": "text", "content": "NG_056118"}
+{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
+{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl
index 16e99479..896473e2 100644
--- a/resources/input_examples/search_rna_demo.jsonl
+++ b/resources/input_examples/search_rna_demo.jsonl
@@ -1,5 +1,8 @@
{"type": "text", "content": "hsa-let-7a-1"}
+{"type": "text", "content": "XIST regulator"}
{"type": "text", "content": "URS0000123456"}
{"type": "text", "content": "URS0000000001"}
+{"type": "text", "content": "URS0000000787"}
+{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"}
{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh
index b53b4249..1928d7d0 100755
--- a/scripts/search/build_db/build_dna_blast_db.sh
+++ b/scripts/search/build_db/build_dna_blast_db.sh
@@ -24,7 +24,8 @@ set -e
# - {category}.{number}.genomic.fna.gz (基因组序列)
# - {category}.{number}.rna.fna.gz (RNA序列)
#
-# Usage: ./build_dna_blast_db.sh [representative|complete|all]
+# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
+# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
# representative: Download genomic sequences from major categories (recommended, smaller)
# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
# complete: Download all complete genomic sequences from complete/ directory (very large)
@@ -35,7 +36,7 @@ set -e
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
-DOWNLOAD_TYPE=${1:-representative}
+DOWNLOAD_TYPE=${1:-human_mouse}
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
DOWNLOAD_TMP=_downloading_dna
@@ -57,8 +58,66 @@ else
echo "Using date as release identifier: ${RELEASE}"
fi
+# Function to check if a file contains target species
+check_file_for_species() {
+ local url=$1
+ local filename=$2
+ local temp_file="/tmp/check_${filename//\//_}"
+
+ # Download first 500KB (enough to get many sequence headers)
+ # This should be sufficient to identify the species in most cases
+ if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
+ # Try to decompress and check for species names
+ if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
+ rm -f "${temp_file}"
+ return 0 # Contains target species
+ else
+ rm -f "${temp_file}"
+ return 1 # Does not contain target species
+ fi
+ else
+ # If partial download fails, skip this file (don't download it)
+ rm -f "${temp_file}"
+ return 1
+ fi
+}
+
# Download based on type
case ${DOWNLOAD_TYPE} in
+ human_mouse)
+ echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
+ echo "This will check each file to see if it contains human or mouse sequences..."
+ category="vertebrate_mammalian"
+ echo "Checking files in ${category} category..."
+
+ # Get list of files and save to temp file to avoid subshell issues
+ curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
+ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+ sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
+
+ file_count=0
+ download_count=0
+
+ while read filename; do
+ file_count=$((file_count + 1))
+ url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
+ echo -n "[${file_count}] Checking ${filename}... "
+
+ if check_file_for_species "${url}" "${filename}"; then
+ echo "✓ contains target species, downloading..."
+ download_count=$((download_count + 1))
+ wget -c -q --show-progress "${url}" || {
+ echo "Warning: Failed to download ${filename}"
+ }
+ else
+ echo "✗ skipping (no human/mouse data)"
+ fi
+ done < /tmp/refseq_files.txt
+
+ rm -f /tmp/refseq_files.txt
+ echo ""
+ echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
+ ;;
representative)
echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
# Download major categories for representative coverage
@@ -109,7 +168,11 @@ case ${DOWNLOAD_TYPE} in
;;
*)
echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
- echo "Usage: $0 [representative|complete|all]"
+ echo "Usage: $0 [human_mouse|representative|complete|all]"
+ echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)"
+ echo " representative: Download major categories (recommended)"
+ echo " complete: Download all complete genomic sequences (very large)"
+ echo " all: Download all genomic sequences (extremely large)"
echo "Note: For RNA sequences, use build_rna_blast_db.sh instead"
exit 1
;;
diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh
index 89b9dc0e..26e1cd33 100755
--- a/scripts/search/build_db/build_rna_blast_db.sh
+++ b/scripts/search/build_db/build_rna_blast_db.sh
@@ -2,156 +2,218 @@
set -e
-# Downloads NCBI RefSeq RNA sequences and creates BLAST databases.
-# This script specifically downloads RNA sequences (mRNA, rRNA, tRNA, etc.)
-# from RefSeq, which is suitable for RNA sequence searches.
+# Downloads RNAcentral sequences and creates BLAST databases.
+# This script downloads the RNAcentral active database, which is the same
+# data source used for online RNAcentral searches, ensuring consistency
+# between local and online search results.
#
-# Usage: ./build_rna_blast_db.sh [representative|complete|all]
-# representative: Download RNA sequences from major categories (recommended, smaller)
-# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi, invertebrate, plant, viral
-# complete: Download all RNA sequences from complete/ directory (very large)
-# all: Download all RNA sequences from all categories (very large)
+# RNAcentral is a comprehensive database of non-coding RNA sequences that
+# integrates data from multiple expert databases including RefSeq, Rfam, etc.
+#
+# Usage: ./build_rna_blast_db.sh [all|list|database_name]
+# all (default): Download complete active database (~8.4G compressed)
+# list: List all available database subsets
+# database_name: Download specific database subset (e.g., refseq, rfam, mirbase)
+#
+# Available database subsets (examples):
+# - refseq.fasta (~98M): RefSeq RNA sequences
+# - rfam.fasta (~1.5G): Rfam RNA families
+# - mirbase.fasta (~10M): microRNA sequences
+# - ensembl.fasta (~2.9G): Ensembl annotations
+# - See "list" option for complete list
+#
+# The complete "active" database contains all sequences from all expert databases.
+# Using a specific database subset provides a smaller, focused database.
#
# We need makeblastdb on our PATH
# For Ubuntu/Debian: sudo apt install ncbi-blast+
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
-DOWNLOAD_TYPE=${1:-representative}
+# RNAcentral HTTP base URL (using HTTPS for better reliability)
+RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral"
+RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release"
+RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences"
+RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database"
+
+# Parse command line argument
+DB_SELECTION=${1:-all}
+
+# List available databases if requested
+if [ "${DB_SELECTION}" = "list" ]; then
+ echo "Available RNAcentral database subsets:"
+ echo ""
+ echo "Fetching list from RNAcentral FTP..."
+ listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/")
+ echo "${listing}" | \
+ grep -oE '' | \
+ sed 's///' | \
+ sort | \
+ while read db; do
+ size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown")
+ echo " - ${db%.fasta}: ${size}"
+ done
+ echo ""
+ echo "Usage: $0 [database_name]"
+ echo " Example: $0 refseq # Download only RefSeq sequences (~98M)"
+ echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)"
+ echo " Example: $0 all # Download complete active database (~8.4G)"
+ exit 0
+fi
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
-DOWNLOAD_TMP=_downloading_rna
+DOWNLOAD_TMP=_downloading_rnacentral
mkdir -p ${DOWNLOAD_TMP}
cd ${DOWNLOAD_TMP}
-# Download RefSeq release information
-echo "Downloading RefSeq release information..."
-wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || {
- echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier"
+# Get RNAcentral release version from release notes
+echo "Getting RNAcentral release information..."
+RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt"
+RELEASE_NOTES="release_notes.txt"
+wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || {
+ echo "Warning: Could not download release notes, using current date as release identifier"
RELEASE=$(date +%Y%m%d)
}
-if [ -f "RELEASE_NUMBER" ]; then
- RELEASE=$(cat RELEASE_NUMBER | tr -d '\n')
- echo "RefSeq release: ${RELEASE}"
-else
+if [ -f "${RELEASE_NOTES}" ]; then
+ # Try to extract version from release notes (first line usually contains version info)
+ RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.')
+fi
+
+if [ -z "${RELEASE}" ]; then
RELEASE=$(date +%Y%m%d)
echo "Using date as release identifier: ${RELEASE}"
+else
+ echo "RNAcentral release: ${RELEASE}"
fi
-# Download based on type
-case ${DOWNLOAD_TYPE} in
- representative)
- echo "Downloading RefSeq representative RNA sequences (recommended, smaller size)..."
- echo "Downloading RNA sequences from major categories..."
- for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral; do
- echo "Downloading ${category} RNA sequences..."
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
- grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' | \
- while read filename; do
- echo " Downloading ${filename}..."
- wget -c -q --show-progress \
- "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
- echo "Warning: Failed to download ${filename}"
- }
- done
- done
- ;;
- complete)
- echo "Downloading RefSeq complete RNA sequences (WARNING: very large, may take hours)..."
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \
- grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' | \
- while read filename; do
- echo " Downloading ${filename}..."
- wget -c -q --show-progress \
- "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || {
- echo "Warning: Failed to download ${filename}"
- }
- done
- ;;
- all)
- echo "Downloading all RefSeq RNA sequences from all categories (WARNING: extremely large, may take many hours)..."
- for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do
- echo "Downloading ${category} RNA sequences..."
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
- grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' | \
- while read filename; do
- echo " Downloading ${filename}..."
- wget -c -q --show-progress \
- "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
- echo "Warning: Failed to download ${filename}"
- }
- done
- done
- ;;
- *)
- echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
- echo "Usage: $0 [representative|complete|all]"
+# Download RNAcentral FASTA file
+if [ "${DB_SELECTION}" = "all" ]; then
+ # Download complete active database
+ FASTA_FILE="rnacentral_active.fasta.gz"
+ DB_NAME="rnacentral"
+ echo "Downloading RNAcentral active sequences (~8.4G)..."
+ echo " Contains sequences currently present in at least one expert database"
+ echo " Uses standard URS IDs (e.g., URS000149A9AF)"
+ echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency"
+ FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}"
+ IS_COMPRESSED=true
+else
+ # Download specific database subset
+ DB_NAME="${DB_SELECTION}"
+ FASTA_FILE="${DB_SELECTION}.fasta"
+ echo "Downloading RNAcentral database subset: ${DB_SELECTION}"
+ echo " This is a subset of the active database from a specific expert database"
+ echo " File: ${FASTA_FILE}"
+ FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}"
+ IS_COMPRESSED=false
+
+ # Check if database exists
+ if ! curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then
+ echo "Error: Database '${DB_SELECTION}' not found"
+ echo "Run '$0 list' to see available databases"
exit 1
- ;;
-esac
-
-cd ..
-
-# Create release directory
-mkdir -p refseq_rna_${RELEASE}
-mv ${DOWNLOAD_TMP}/* refseq_rna_${RELEASE}/ 2>/dev/null || true
-rmdir ${DOWNLOAD_TMP} 2>/dev/null || true
-
-cd refseq_rna_${RELEASE}
-
-# Extract and combine sequences
-echo "Extracting and combining RNA sequences..."
-
-# Extract all downloaded RNA sequences
-if [ $(find . -name "*.rna.fna.gz" -type f | wc -l) -gt 0 ]; then
- echo "Extracting RNA sequences..."
- find . -name "*.rna.fna.gz" -type f -exec gunzip {} \;
+ fi
fi
-# Combine all FASTA files into one
-echo "Combining all FASTA files..."
-FASTA_FILES=$(find . -name "*.fna" -type f)
-if [ -z "$FASTA_FILES" ]; then
- FASTA_FILES=$(find . -name "*.fa" -type f)
+echo "Downloading from: ${FASTA_URL}"
+echo "This may take a while depending on your internet connection..."
+if [ "${DB_SELECTION}" = "all" ]; then
+ echo "File size is approximately 8-9GB, please be patient..."
+else
+ echo "Downloading database subset..."
fi
+wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || {
+ echo "Error: Failed to download RNAcentral FASTA file"
+ echo "Please check your internet connection and try again"
+ echo "You can also try downloading manually from: ${FASTA_URL}"
+ exit 1
+}
-if [ -z "$FASTA_FILES" ]; then
- echo "Error: No FASTA files found to combine"
+if [ ! -f "${FASTA_FILE}" ]; then
+ echo "Error: Downloaded file not found"
exit 1
fi
-echo "$FASTA_FILES" | while read -r file; do
- if [ -f "$file" ]; then
- cat "$file" >> refseq_rna_${RELEASE}.fasta
+cd ..
+
+# Create release directory
+if [ "${DB_SELECTION}" = "all" ]; then
+ OUTPUT_DIR="rnacentral_${RELEASE}"
+else
+ OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}"
+fi
+mkdir -p ${OUTPUT_DIR}
+mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true
+rmdir ${DOWNLOAD_TMP} 2>/dev/null || true
+
+cd ${OUTPUT_DIR}
+
+# Extract FASTA file if compressed
+echo "Preparing RNAcentral sequences..."
+if [ -f "${FASTA_FILE}" ]; then
+ if [ "${IS_COMPRESSED}" = "true" ]; then
+ echo "Decompressing ${FASTA_FILE}..."
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
+ gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || {
+ echo "Error: Failed to decompress FASTA file"
+ exit 1
+ }
+ # Optionally remove the compressed file to save space
+ # rm "${FASTA_FILE}"
+ else
+ # File is not compressed, just copy/rename
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
+ cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || {
+ echo "Error: Failed to copy FASTA file"
+ exit 1
+ }
fi
-done
+else
+ echo "Error: FASTA file not found"
+ exit 1
+fi
# Check if we have sequences
-if [ ! -s "refseq_rna_${RELEASE}.fasta" ]; then
- echo "Error: Combined FASTA file is empty"
+if [ ! -s "${OUTPUT_FASTA}" ]; then
+ echo "Error: FASTA file is empty"
exit 1
fi
+# Get file size for user information
+FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1)
+echo "FASTA file size: ${FILE_SIZE}"
+
echo "Creating BLAST database..."
# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide)
-makeblastdb -in refseq_rna_${RELEASE}.fasta \
- -out refseq_rna_${RELEASE} \
+# Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers,
+# which matches the format expected by the RNACentralSearch class
+DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+makeblastdb -in "${OUTPUT_FASTA}" \
+ -out "${DB_OUTPUT_NAME}" \
-dbtype nucl \
-parse_seqids \
- -title "RefSeq_RNA_${RELEASE}"
+ -title "RNAcentral_${DB_NAME}_${RELEASE}"
+echo ""
echo "BLAST database created successfully!"
-echo "Database location: $(pwd)/refseq_rna_${RELEASE}"
+echo "Database location: $(pwd)/${DB_OUTPUT_NAME}"
echo ""
-echo "To use this database, set in your config:"
-echo " local_blast_db: $(pwd)/refseq_rna_${RELEASE}"
+echo "To use this database, set in your config (search_rna_config.yaml):"
+echo " rnacentral_params:"
+echo " use_local_blast: true"
+echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}"
echo ""
echo "Note: The database files are:"
-ls -lh refseq_rna_${RELEASE}.*
+ls -lh ${DB_OUTPUT_NAME}.* | head -5
+echo ""
+if [ "${DB_SELECTION}" = "all" ]; then
+ echo "This database uses RNAcentral IDs (URS...), which matches the online"
+ echo "RNAcentral search API, ensuring consistent results between local and online searches."
+else
+ echo "This is a subset database from ${DB_SELECTION} expert database."
+ echo "For full coverage matching online API, use 'all' option."
+fi
cd ..
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 00000000..a02a6a37
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,3 @@
+version = 1
+revision = 3
+requires-python = ">=3.10"