Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,7 @@ cache
*.pyc
*.html
.gradio

# macOS
.DS_Store
**/.DS_Store
2 changes: 1 addition & 1 deletion graphgen/configs/search_dna_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ pipeline:
email: [email protected] # NCBI requires an email address
tool: GraphGen # tool name for NCBI API
use_local_blast: true # whether to use local blast for DNA search
local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)
local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)

4 changes: 1 addition & 3 deletions graphgen/configs/search_rna_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,4 @@ pipeline:
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
rnacentral_params:
use_local_blast: true # whether to use local blast for RNA search
local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
# can also use DNA database with RNA sequences (if already built)

local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)
66 changes: 59 additions & 7 deletions graphgen/models/searcher/db/ncbi_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,29 @@ def _nested_get(data: dict, *keys, default=None):
data = data.get(key, default)
return data

@staticmethod
def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
    """Infer ``molecule_type_detail`` from an accession prefix or an Entrez gene type.

    The accession prefix takes precedence; the gene type is only consulted
    when no accession is given or its prefix is not recognised.

    :param accession: Sequence accession such as ``"NM_000546"``; may be ``None`` or empty.
    :param gene_type: Numeric ``Entrezgene_type`` code. Numeric strings are
        accepted as well, because values parsed from Entrez XML are
        string-like elements (e.g. ``"6"``) rather than plain ints.
    :return: A molecule-type label, or ``None`` when nothing can be inferred.
    """
    if accession:
        if accession.startswith(("NM_", "XM_")):
            return "mRNA"
        if accession.startswith(("NC_", "NT_")):
            return "genomic DNA"
        if accession.startswith(("NR_", "XR_")):
            return "RNA"
        if accession.startswith("NG_"):
            return "genomic region"

    # Fallback: infer from gene type if available
    if gene_type is None:
        return None
    try:
        gene_type_code = int(gene_type)
    except (TypeError, ValueError):
        # Not a numeric code (e.g. an unexpected label) -> nothing to infer.
        return None

    # Codes follow the NCBI Entrezgene ``type`` enumeration. Codes without an
    # unambiguous RNA molecule type (0 unknown, 6 protein-coding, 7 pseudo,
    # 8 transposon, 11 biological-region, 255 other) intentionally map to None.
    gene_type_map = {
        1: "tRNA",
        2: "rRNA",
        3: "snRNA",
        4: "scRNA",
        5: "snoRNA",
        9: "miscRNA",
        10: "ncRNA",
    }
    return gene_type_map.get(gene_type_code)

def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"""
Convert an Entrez gene record to a dictionary.
Expand Down Expand Up @@ -120,7 +143,7 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
else None
)

# Extract representative accession
# Extract representative accession (prefer type 3 = mRNA/transcript)
representative_accession = next(
(
product.get("Gene-commentary_accession")
Expand All @@ -129,6 +152,17 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
),
None,
)
# Fallback: if no type 3 accession, try any available accession
# This is needed for genes that don't have mRNA transcripts but have other sequence records
if not representative_accession:
representative_accession = next(
(
product.get("Gene-commentary_accession")
for product in locus.get("Gene-commentary_products", [])
if product.get("Gene-commentary_accession")
),
None,
)

# Extract function
function = data.get("Entrezgene_summary") or next(
Expand Down Expand Up @@ -169,18 +203,19 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"sequence": None,
"sequence_length": None,
"gene_id": gene_id,
"molecule_type_detail": None,
"molecule_type_detail": self._infer_molecule_type_detail(
representative_accession, data.get("Entrezgene_type")
),
"_representative_accession": representative_accession,
}

def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
"""Get gene information by Gene ID."""
def _extract_from_genbank(result: dict, accession: str):
"""Enrich result dictionary with sequence and summary information from accession."""
def _extract_metadata_from_genbank(result: dict, accession: str):
"""Extract metadata from GenBank format (title, features, organism, etc.)."""
with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
record = SeqIO.read(handle, "genbank")
result["sequence"] = str(record.seq)
result["sequence_length"] = len(record.seq)

result["title"] = record.description
result["molecule_type_detail"] = (
"mRNA" if accession.startswith(("NM_", "XM_")) else
Expand All @@ -206,6 +241,22 @@ def _extract_from_genbank(result: dict, accession: str):

return result

def _extract_sequence_from_fasta(result: dict, accession: str):
    """Fill ``result`` with the sequence for ``accession`` fetched in FASTA format.

    The original author notes FASTA is more reliable than GenBank for
    CON-type records. This is best-effort: on any failure a warning is logged
    and both ``sequence`` and ``sequence_length`` are set to ``None``.

    :param result: Dictionary to update in place.
    :param accession: Nucleotide accession passed to ``Entrez.efetch``.
    :return: The same ``result`` dictionary.
    """
    fetch_kwargs = {"db": "nuccore", "id": accession, "rettype": "fasta", "retmode": "text"}
    try:
        with Entrez.efetch(**fetch_kwargs) as stream:
            parsed = SeqIO.read(stream, "fasta")
            fetched = {
                "sequence": str(parsed.seq),
                "sequence_length": len(parsed.seq),
            }
    except Exception as err:
        logger.warning(
            "Failed to extract sequence from accession %s using FASTA format: %s",
            accession, err
        )
        # Never leave a partially populated pair behind.
        fetched = {"sequence": None, "sequence_length": None}
    result.update(fetched)
    return result

try:
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
gene_record = Entrez.read(handle)
Expand All @@ -214,7 +265,8 @@ def _extract_from_genbank(result: dict, accession: str):

result = self._gene_record_to_dict(gene_record, gene_id)
if accession := (preferred_accession or result.get("_representative_accession")):
result = _extract_from_genbank(result, accession)
result = _extract_metadata_from_genbank(result, accession)
result = _extract_sequence_from_fasta(result, accession)

result.pop("_representative_accession", None)
return result
Expand Down
13 changes: 4 additions & 9 deletions resources/input_examples/search_dna_demo.jsonl
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
{"type": "text", "content": "TP53"}
{"type": "text", "content": "BRCA1"}
{"type": "text", "content": "672"}
{"type": "text", "content": "11998"}
{"type": "text", "content": "NM_000546"}
{"type": "text", "content": "NM_024140"}
{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCT
TAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
{"type": "text", "content": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCT
AAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}

{"type": "text", "content": "NG_033923"}
{"type": "text", "content": "NG_056118"}
{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
3 changes: 3 additions & 0 deletions resources/input_examples/search_rna_demo.jsonl
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{"type": "text", "content": "hsa-let-7a-1"}
{"type": "text", "content": "XIST regulator"}
{"type": "text", "content": "URS0000123456"}
{"type": "text", "content": "URS0000000001"}
{"type": "text", "content": "URS0000000787"}
{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"}
{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
69 changes: 66 additions & 3 deletions scripts/search/build_db/build_dna_blast_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ set -e
# - {category}.{number}.genomic.fna.gz (genomic sequences)
# - {category}.{number}.rna.fna.gz (RNA sequences)
#
# Usage: ./build_dna_blast_db.sh [representative|complete|all]
# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
# representative: Download genomic sequences from major categories (recommended, smaller)
# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
# complete: Download all complete genomic sequences from complete/ directory (very large)
Expand All @@ -35,7 +36,7 @@ set -e
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/

DOWNLOAD_TYPE=${1:-representative}
DOWNLOAD_TYPE=${1:-human_mouse}

# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
DOWNLOAD_TMP=_downloading_dna
Expand All @@ -57,8 +58,66 @@ else
echo "Using date as release identifier: ${RELEASE}"
fi

# Function to check if a file contains target species
check_file_for_species() {
    # Usage: check_file_for_species URL FILENAME
    # Returns 0 if the first ~500KB of the gzip file at URL mentions
    # "Homo sapiens" or "Mus musculus" within its first 2000 decompressed
    # lines; returns 1 otherwise (including when the partial download fails).
    local url=$1
    local filename=$2  # accepted for interface compatibility; no longer used to build the temp path
    local temp_file
    local status=1

    # A unique, unpredictable temp file avoids collisions between concurrent
    # runs and clobbering of a pre-existing /tmp path.
    temp_file=$(mktemp "${TMPDIR:-/tmp}/check_species.XXXXXX") || return 1

    # Download first 500KB (enough to get many sequence headers)
    # This should be sufficient to identify the species in most cases
    if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
        # gunzip reports an error on the truncated stream; only grep's result matters here
        if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
            status=0  # Contains target species
        fi
    fi
    # If the partial download fails, status stays 1 and the file is skipped.

    rm -f "${temp_file}"
    return "${status}"
}
Comment on lines +62 to +83
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The check_file_for_species function uses a predictable temporary file path in /tmp. This can lead to race conditions and unexpected behavior if the script is run multiple times concurrently. It's a better and safer practice to use mktemp to create a unique temporary file. Using trap also simplifies cleanup logic by ensuring the temporary file is removed when the function exits.

check_file_for_species() {
    # Usage: check_file_for_species URL FILENAME
    # Returns 0 if the first ~500KB of the gzip file at URL mentions
    # "Homo sapiens" or "Mus musculus"; returns 1 otherwise.
    local url=$1
    local filename=$2
    local temp_file
    temp_file=$(mktemp) || return 1
    # Remove the temp file on return, then clear the trap so it does not stay
    # installed (and fire again) after this function has finished.
    trap 'rm -f "${temp_file}"; trap - RETURN' RETURN
    
    # Download first 500KB (enough to get many sequence headers)
    # This should be sufficient to identify the species in most cases
    if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
        # Try to decompress and check for species names
        if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
            return 0  # Contains target species
        else
            return 1  # Does not contain target species
        fi
    else
        # If partial download fails, skip this file (don't download it)
        return 1
    fi
}


# Download based on type
case ${DOWNLOAD_TYPE} in
human_mouse)
echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
echo "This will check each file to see if it contains human or mouse sequences..."
category="vertebrate_mammalian"
echo "Checking files in ${category} category..."

# Get list of files and save to temp file to avoid subshell issues
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt

file_count=0
download_count=0

while read filename; do
file_count=$((file_count + 1))
url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
echo -n "[${file_count}] Checking ${filename}... "

if check_file_for_species "${url}" "${filename}"; then
echo "✓ contains target species, downloading..."
download_count=$((download_count + 1))
wget -c -q --show-progress "${url}" || {
echo "Warning: Failed to download ${filename}"
}
else
echo "✗ skipping (no human/mouse data)"
fi
done < /tmp/refseq_files.txt

rm -f /tmp/refseq_files.txt
Comment on lines +93 to +117
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The script uses a hardcoded temporary file /tmp/refseq_files.txt. If multiple instances of this script are run simultaneously, they will interfere with each other by overwriting this file, leading to incorrect downloads or failures. You should use mktemp to create a temporary file with a unique name to avoid this race condition.

        # Get list of files and save to temp file to avoid subshell issues
        file_list=$(mktemp)
        curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
            grep -oE 'href="[^\"]*\.genomic\.fna\.gz"' | \
            sed 's/href="\(.*\)"/\1/' > "${file_list}"
        
        file_count=0
        download_count=0
        
        while read filename; do
            file_count=$((file_count + 1))
            url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
            echo -n "[${file_count}] Checking ${filename}... "
            
            if check_file_for_species "${url}" "${filename}"; then
                echo "✓ contains target species, downloading..."
                download_count=$((download_count + 1))
                wget -c -q --show-progress "${url}" || {
                    echo "Warning: Failed to download ${filename}"
                }
            else
                echo "✗ skipping (no human/mouse data)"
            fi
        done < "${file_list}"
        
        rm -f "${file_list}"

echo ""
echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
;;
representative)
echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
# Download major categories for representative coverage
Expand Down Expand Up @@ -109,7 +168,11 @@ case ${DOWNLOAD_TYPE} in
;;
*)
echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
echo "Usage: $0 [representative|complete|all]"
echo "Usage: $0 [human_mouse|representative|complete|all]"
echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)"
echo " representative: Download major categories (recommended)"
echo " complete: Download all complete genomic sequences (very large)"
echo " all: Download all genomic sequences (extremely large)"
echo "Note: For RNA sequences, use build_rna_blast_db.sh instead"
exit 1
;;
Expand Down
Loading