diff --git a/.gitignore b/.gitignore index 678cdc50..b654d301 100644 --- a/.gitignore +++ b/.gitignore @@ -177,3 +177,7 @@ cache *.pyc *.html .gradio + +# macOS +.DS_Store +**/.DS_Store diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml index 5245ea0c..f53a5eb8 100644 --- a/graphgen/configs/search_dna_config.yaml +++ b/graphgen/configs/search_dna_config.yaml @@ -13,5 +13,5 @@ pipeline: email: test@example.com # NCBI requires an email address tool: GraphGen # tool name for NCBI API use_local_blast: true # whether to use local blast for DNA search - local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension) + local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml index dae62ec2..10422988 100644 --- a/graphgen/configs/search_rna_config.yaml +++ b/graphgen/configs/search_rna_config.yaml @@ -11,6 +11,4 @@ pipeline: data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral rnacentral_params: use_local_blast: true # whether to use local blast for RNA search - local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE} - # can also use DNA database with RNA sequences (if already built) - + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 0de8ecc0..f453c700 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -83,6 +83,29 @@ def _nested_get(data: dict, *keys, default=None): data = data.get(key, default) return data + @staticmethod + def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: + """Infer molecule_type_detail from accession prefix or gene type.""" + if accession: + if accession.startswith(("NM_", "XM_")): + return "mRNA" + if accession.startswith(("NC_", "NT_")): + return "genomic DNA" + if accession.startswith(("NR_", "XR_")): + return "RNA" + if accession.startswith("NG_"): + return "genomic region" + # Fallback: infer from gene type if available + if gene_type is not None: + gene_type_map = { + 3: "rRNA", + 4: "tRNA", + 5: "snRNA", + 6: "ncRNA", + } + return gene_type_map.get(gene_type) + return None + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: """ Convert an Entrez gene record to a dictionary. @@ -120,7 +143,7 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: else None ) - # Extract representative accession + # Extract representative accession (prefer type 3 = mRNA/transcript) representative_accession = next( ( product.get("Gene-commentary_accession") @@ -129,6 +152,17 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: ), None, ) + # Fallback: if no type 3 accession, try any available accession + # This is needed for genes that don't have mRNA transcripts but have other sequence records + if not representative_accession: + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_accession") + ), + None, + ) # Extract function function = data.get("Entrezgene_summary") or next( @@ -169,18 +203,19 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: "sequence": None, "sequence_length": None, "gene_id": gene_id, - "molecule_type_detail": None, + "molecule_type_detail": self._infer_molecule_type_detail( + representative_accession, data.get("Entrezgene_type") + ), "_representative_accession": representative_accession, } def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """Get gene information by Gene ID.""" - def _extract_from_genbank(result: dict, accession: str): - """Enrich result dictionary with sequence and summary information from accession.""" + def _extract_metadata_from_genbank(result: dict, accession: str): + """Extract metadata from GenBank format (title, features, organism, etc.).""" with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: record = SeqIO.read(handle, "genbank") - result["sequence"] = str(record.seq) - result["sequence_length"] = len(record.seq) + result["title"] = record.description result["molecule_type_detail"] = ( "mRNA" if accession.startswith(("NM_", "XM_")) else @@ -206,6 +241,22 @@ def _extract_from_genbank(result: dict, accession: str): return result + def _extract_sequence_from_fasta(result: dict, accession: str): + """Extract sequence from FASTA format (more reliable than GenBank for CON-type records).""" + try: + with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle: + fasta_record = SeqIO.read(fasta_handle, "fasta") + result["sequence"] = str(fasta_record.seq) + result["sequence_length"] = len(fasta_record.seq) + except Exception as fasta_exc: + logger.warning( + "Failed to extract sequence from accession %s using FASTA format: %s", + accession, fasta_exc + ) + result["sequence"] = None + result["sequence_length"] = None + return result + try: with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: gene_record = Entrez.read(handle) @@ -214,7 +265,8 @@ def _extract_from_genbank(result: dict, accession: str): result = self._gene_record_to_dict(gene_record, gene_id) if accession := (preferred_accession or result.get("_representative_accession")): - result = _extract_from_genbank(result, accession) + result = _extract_metadata_from_genbank(result, accession) + result = _extract_sequence_from_fasta(result, accession) result.pop("_representative_accession", None) return result diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl index 346b65f0..f423e1c1 100644 --- a/resources/input_examples/search_dna_demo.jsonl +++ b/resources/input_examples/search_dna_demo.jsonl @@ -1,9 +1,4 @@ -{"type": "text", "content": "TP53"} -{"type": "text", "content": "BRCA1"} -{"type": "text", "content": "672"} -{"type": "text", "content": "11998"} -{"type": "text", "content": "NM_000546"} -{"type": "text", "content": "NM_024140"} -{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} -{"type": "text", "content": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} - +{"type": "text", "content": "NG_033923"} +{"type": "text", "content": "NG_056118"} +{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl index 16e99479..896473e2 100644 --- a/resources/input_examples/search_rna_demo.jsonl +++ b/resources/input_examples/search_rna_demo.jsonl @@ -1,5 +1,8 @@ {"type": "text", "content": "hsa-let-7a-1"} +{"type": "text", "content": "XIST regulator"} {"type": "text", "content": "URS0000123456"} {"type": "text", "content": "URS0000000001"} +{"type": "text", "content": "URS0000000787"} +{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} {"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} {"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh index b53b4249..1928d7d0 100755 --- a/scripts/search/build_db/build_dna_blast_db.sh +++ b/scripts/search/build_db/build_dna_blast_db.sh @@ -24,7 +24,8 @@ set -e # - {category}.{number}.genomic.fna.gz (基因组序列) # - {category}.{number}.rna.fna.gz (RNA序列) # -# Usage: ./build_dna_blast_db.sh [representative|complete|all] +# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all] +# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest) # representative: Download genomic sequences from major categories (recommended, smaller) # Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi # complete: Download all complete genomic sequences from complete/ directory (very large) @@ -35,7 +36,7 @@ set -e # For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ -DOWNLOAD_TYPE=${1:-representative} +DOWNLOAD_TYPE=${1:-human_mouse} # Better to use a stable DOWNLOAD_TMP name to support resuming downloads DOWNLOAD_TMP=_downloading_dna @@ -57,8 +58,66 @@ else echo "Using date as release identifier: ${RELEASE}" fi +# Function to check if a file contains target species +check_file_for_species() { + local url=$1 + local filename=$2 + local temp_file="/tmp/check_${filename//\//_}" + + # Download first 500KB (enough to get many sequence headers) + # This should be sufficient to identify the species in most cases + if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then + # Try to decompress and check for species names + if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then + rm -f "${temp_file}" + return 0 # Contains target species + else + rm -f "${temp_file}" + return 1 # Does not contain target species + fi + else + # If partial download fails, skip this file (don't download it) + rm -f "${temp_file}" + return 1 + fi +} + # Download based on type case ${DOWNLOAD_TYPE} in + human_mouse) + echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..." + echo "This will check each file to see if it contains human or mouse sequences..." + category="vertebrate_mammalian" + echo "Checking files in ${category} category..." + + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt + + file_count=0 + download_count=0 + + while read filename; do + file_count=$((file_count + 1)) + url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" + echo -n "[${file_count}] Checking ${filename}... " + + if check_file_for_species "${url}" "${filename}"; then + echo "✓ contains target species, downloading..." + download_count=$((download_count + 1)) + wget -c -q --show-progress "${url}" || { + echo "Warning: Failed to download ${filename}" + } + else + echo "✗ skipping (no human/mouse data)" + fi + done < /tmp/refseq_files.txt + + rm -f /tmp/refseq_files.txt + echo "" + echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences." + ;; representative) echo "Downloading RefSeq representative sequences (recommended, smaller size)..." # Download major categories for representative coverage @@ -109,7 +168,11 @@ case ${DOWNLOAD_TYPE} in ;; *) echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" - echo "Usage: $0 [representative|complete|all]" + echo "Usage: $0 [human_mouse|representative|complete|all]" + echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)" + echo " representative: Download major categories (recommended)" + echo " complete: Download all complete genomic sequences (very large)" + echo " all: Download all genomic sequences (extremely large)" echo "Note: For RNA sequences, use build_rna_blast_db.sh instead" exit 1 ;; diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh index 89b9dc0e..26e1cd33 100755 --- a/scripts/search/build_db/build_rna_blast_db.sh +++ b/scripts/search/build_db/build_rna_blast_db.sh @@ -2,156 +2,218 @@ set -e -# Downloads NCBI RefSeq RNA sequences and creates BLAST databases. -# This script specifically downloads RNA sequences (mRNA, rRNA, tRNA, etc.) -# from RefSeq, which is suitable for RNA sequence searches. +# Downloads RNAcentral sequences and creates BLAST databases. +# This script downloads the RNAcentral active database, which is the same +# data source used for online RNAcentral searches, ensuring consistency +# between local and online search results. # -# Usage: ./build_rna_blast_db.sh [representative|complete|all] -# representative: Download RNA sequences from major categories (recommended, smaller) -# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi, invertebrate, plant, viral -# complete: Download all RNA sequences from complete/ directory (very large) -# all: Download all RNA sequences from all categories (very large) +# RNAcentral is a comprehensive database of non-coding RNA sequences that +# integrates data from multiple expert databases including RefSeq, Rfam, etc. +# +# Usage: ./build_rna_blast_db.sh [all|list|database_name] +# all (default): Download complete active database (~8.4G compressed) +# list: List all available database subsets +# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) +# +# Available database subsets (examples): +# - refseq.fasta (~98M): RefSeq RNA sequences +# - rfam.fasta (~1.5G): Rfam RNA families +# - mirbase.fasta (~10M): microRNA sequences +# - ensembl.fasta (~2.9G): Ensembl annotations +# - See "list" option for complete list +# +# The complete "active" database contains all sequences from all expert databases. +# Using a specific database subset provides a smaller, focused database. # # We need makeblastdb on our PATH # For Ubuntu/Debian: sudo apt install ncbi-blast+ # For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ -DOWNLOAD_TYPE=${1:-representative} +# RNAcentral HTTP base URL (using HTTPS for better reliability) +RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" +RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" +RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" +RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" + +# Parse command line argument +DB_SELECTION=${1:-all} + +# List available databases if requested +if [ "${DB_SELECTION}" = "list" ]; then + echo "Available RNAcentral database subsets:" + echo "" + echo "Fetching list from RNAcentral FTP..." + listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") + echo "${listing}" | \ + grep -oE '' | \ + sed 's///' | \ + sort | \ + while read db; do + size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown") + echo " - ${db%.fasta}: ${size}" + done + echo "" + echo "Usage: $0 [database_name]" + echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" + echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" + echo " Example: $0 all # Download complete active database (~8.4G)" + exit 0 +fi # Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading_rna +DOWNLOAD_TMP=_downloading_rnacentral mkdir -p ${DOWNLOAD_TMP} cd ${DOWNLOAD_TMP} -# Download RefSeq release information -echo "Downloading RefSeq release information..." -wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { - echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" +# Get RNAcentral release version from release notes +echo "Getting RNAcentral release information..." +RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" +RELEASE_NOTES="release_notes.txt" +wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || { + echo "Warning: Could not download release notes, using current date as release identifier" RELEASE=$(date +%Y%m%d) } -if [ -f "RELEASE_NUMBER" ]; then - RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') - echo "RefSeq release: ${RELEASE}" -else +if [ -f "${RELEASE_NOTES}" ]; then + # Try to extract version from release notes (first line usually contains version info) + RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') +fi + +if [ -z "${RELEASE}" ]; then RELEASE=$(date +%Y%m%d) echo "Using date as release identifier: ${RELEASE}" +else + echo "RNAcentral release: ${RELEASE}" fi -# Download based on type -case ${DOWNLOAD_TYPE} in - representative) - echo "Downloading RefSeq representative RNA sequences (recommended, smaller size)..." - echo "Downloading RNA sequences from major categories..." - for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral; do - echo "Downloading ${category} RNA sequences..." - curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ - grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do - echo " Downloading ${filename}..." - wget -c -q --show-progress \ - "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { - echo "Warning: Failed to download ${filename}" - } - done - done - ;; - complete) - echo "Downloading RefSeq complete RNA sequences (WARNING: very large, may take hours)..." - curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ - grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do - echo " Downloading ${filename}..." - wget -c -q --show-progress \ - "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { - echo "Warning: Failed to download ${filename}" - } - done - ;; - all) - echo "Downloading all RefSeq RNA sequences from all categories (WARNING: extremely large, may take many hours)..." - for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do - echo "Downloading ${category} RNA sequences..." - curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ - grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do - echo " Downloading ${filename}..." - wget -c -q --show-progress \ - "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { - echo "Warning: Failed to download ${filename}" - } - done - done - ;; - *) - echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" - echo "Usage: $0 [representative|complete|all]" +# Download RNAcentral FASTA file +if [ "${DB_SELECTION}" = "all" ]; then + # Download complete active database + FASTA_FILE="rnacentral_active.fasta.gz" + DB_NAME="rnacentral" + echo "Downloading RNAcentral active sequences (~8.4G)..." + echo " Contains sequences currently present in at least one expert database" + echo " Uses standard URS IDs (e.g., URS000149A9AF)" + echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" + FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" + IS_COMPRESSED=true +else + # Download specific database subset + DB_NAME="${DB_SELECTION}" + FASTA_FILE="${DB_SELECTION}.fasta" + echo "Downloading RNAcentral database subset: ${DB_SELECTION}" + echo " This is a subset of the active database from a specific expert database" + echo " File: ${FASTA_FILE}" + FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" + IS_COMPRESSED=false + + # Check if database exists + if ! curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then + echo "Error: Database '${DB_SELECTION}' not found" + echo "Run '$0 list' to see available databases" exit 1 - ;; -esac - -cd .. - -# Create release directory -mkdir -p refseq_rna_${RELEASE} -mv ${DOWNLOAD_TMP}/* refseq_rna_${RELEASE}/ 2>/dev/null || true -rmdir ${DOWNLOAD_TMP} 2>/dev/null || true - -cd refseq_rna_${RELEASE} - -# Extract and combine sequences -echo "Extracting and combining RNA sequences..." - -# Extract all downloaded RNA sequences -if [ $(find . -name "*.rna.fna.gz" -type f | wc -l) -gt 0 ]; then - echo "Extracting RNA sequences..." - find . -name "*.rna.fna.gz" -type f -exec gunzip {} \; + fi fi -# Combine all FASTA files into one -echo "Combining all FASTA files..." -FASTA_FILES=$(find . -name "*.fna" -type f) -if [ -z "$FASTA_FILES" ]; then - FASTA_FILES=$(find . -name "*.fa" -type f) +echo "Downloading from: ${FASTA_URL}" +echo "This may take a while depending on your internet connection..." +if [ "${DB_SELECTION}" = "all" ]; then + echo "File size is approximately 8-9GB, please be patient..." +else + echo "Downloading database subset..." fi +wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || { + echo "Error: Failed to download RNAcentral FASTA file" + echo "Please check your internet connection and try again" + echo "You can also try downloading manually from: ${FASTA_URL}" + exit 1 +} -if [ -z "$FASTA_FILES" ]; then - echo "Error: No FASTA files found to combine" +if [ ! -f "${FASTA_FILE}" ]; then + echo "Error: Downloaded file not found" exit 1 fi -echo "$FASTA_FILES" | while read -r file; do - if [ -f "$file" ]; then - cat "$file" >> refseq_rna_${RELEASE}.fasta +cd .. + +# Create release directory +if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" +else + OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" +fi +mkdir -p ${OUTPUT_DIR} +mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd ${OUTPUT_DIR} + +# Extract FASTA file if compressed +echo "Preparing RNAcentral sequences..." +if [ -f "${FASTA_FILE}" ]; then + if [ "${IS_COMPRESSED}" = "true" ]; then + echo "Decompressing ${FASTA_FILE}..." + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { + echo "Error: Failed to decompress FASTA file" + exit 1 + } + # Optionally remove the compressed file to save space + # rm "${FASTA_FILE}" + else + # File is not compressed, just copy/rename + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { + echo "Error: Failed to copy FASTA file" + exit 1 + } fi -done +else + echo "Error: FASTA file not found" + exit 1 +fi # Check if we have sequences -if [ ! -s "refseq_rna_${RELEASE}.fasta" ]; then - echo "Error: Combined FASTA file is empty" +if [ ! -s "${OUTPUT_FASTA}" ]; then + echo "Error: FASTA file is empty" exit 1 fi +# Get file size for user information +FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) +echo "FASTA file size: ${FILE_SIZE}" + echo "Creating BLAST database..." # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) -makeblastdb -in refseq_rna_${RELEASE}.fasta \ - -out refseq_rna_${RELEASE} \ +# Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, +# which matches the format expected by the RNACentralSearch class +DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" +makeblastdb -in "${OUTPUT_FASTA}" \ + -out "${DB_OUTPUT_NAME}" \ -dbtype nucl \ -parse_seqids \ - -title "RefSeq_RNA_${RELEASE}" + -title "RNAcentral_${DB_NAME}_${RELEASE}" +echo "" echo "BLAST database created successfully!" -echo "Database location: $(pwd)/refseq_rna_${RELEASE}" +echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" echo "" -echo "To use this database, set in your config:" -echo " local_blast_db: $(pwd)/refseq_rna_${RELEASE}" +echo "To use this database, set in your config (search_rna_config.yaml):" +echo " rnacentral_params:" +echo " use_local_blast: true" +echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" echo "" echo "Note: The database files are:" -ls -lh refseq_rna_${RELEASE}.* +ls -lh ${DB_OUTPUT_NAME}.* | head -5 +echo "" +if [ "${DB_SELECTION}" = "all" ]; then + echo "This database uses RNAcentral IDs (URS...), which matches the online" + echo "RNAcentral search API, ensuring consistent results between local and online searches." +else + echo "This is a subset database from ${DB_SELECTION} expert database." + echo "For full coverage matching online API, use 'all' option." +fi cd .. diff --git a/uv.lock b/uv.lock new file mode 100644 index 00000000..a02a6a37 --- /dev/null +++ b/uv.lock @@ -0,0 +1,3 @@ +version = 1 +revision = 3 +requires-python = ">=3.10"