MTD/Customized_host_offline.sh at main · patrick-douglas/MTD · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/bin/bash
MTDIR=~/MTD
condapath=~/miniconda3
threads=`nproc`

# Função para mostrar instruções de uso
usage() {
    echo ""
    echo "Usage:"
    echo "  bash $0 --genome <genome.fa.gz> --gtf-file <annotations.gtf.gz> --ncbi-taxon-id <TaxonID> --offline-folder <path>"
    echo ""
    echo "Required options:"
    echo "  -d, --genome           Path to host genome FASTA (.fa.gz)"
    echo "  -g, --gtf-file         Path to GTF annotation file (.gtf.gz)"
    echo "  -c, --ncbi-taxon-id    NCBI Taxon ID of the host species"
    echo "  -o, --offline-folder   Folder where offline files will be stored"
    echo "      --help             Show this help message"
    echo ""
    echo "Example:"
    echo "  bash $0 --genome ~/genomes/Anas_platyrhynchos.ASM874695v1.dna.toplevel.fa.gz --gtf-file ~/annotations/Anas_platyrhynchos.ASM874695v1.dna.toplevel.fa.gz --ncbi-taxon-id 8839 --offline-folder ~/makeOrgPackageFromNCBI/"
    echo ""
    exit 1
}

# Define as opções curtas e longas
OPTIONS=c:d:g:o:
LONGOPTIONS=ncbi-taxon-id:,genome:,gtf-file:,offline-folder:,help

# Processa os argumentos
PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTIONS --name "$0" -- "$@")
if [[ $? -ne 0 ]]; then
    usage
fi

eval set -- "$PARSED"

# Variáveis
while true; do
    case "$1" in
        -c|--ncbi-taxon-id)
            customized="$2"
            shift 2
            ;;
        -d|--genome)
            download="$2"
            shift 2
            ;;
        -g|--gtf-file)
            gtf="$2"
            shift 2
            ;;
        -o|--offline-folder)
            offline_files_folder="$2"
            shift 2
            ;;
        --help)
            usage
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Error: invalid arguments: $1"
            usage
            ;;
    esac
done

# Verificação básica de argumentos obrigatórios
if [[ -z "$download" || -z "$gtf" || -z "$customized" || -z "$offline_files_folder" ]]; then
    echo "Error: Missing arguments"
    usage
fi

# Remove any previously created directories or files from previous runs
rm -rf $MTDIR/kraken2DB_${customized} $MTDIR/ref_${customized} $MTDIR/hisat2_index_${customized} $MTDIR/blastdb_${customized}

# get MTD folder place; same as Install.sh script file path (in the MTD folder)
dir=$(dirname $(readlink -f $0))
cd $dir # MTD folder place

# get conda path
condapath=$(head -n 1 $MTDIR/condaPath)
# activate MTD conda environment
source $condapath/etc/profile.d/conda.sh


conda activate MTD

# Kraken2 database building - Customized
DBNAME=kraken2DB_${customized}
rm -rf $DBNAME
mkdir -p $DBNAME
cd $DBNAME
#cp /media/me/4TB_BACKUP_LBN/Compressed/MTD/Calidris_pugnax.ASM143184v1.dna.toplevel.fa.gz .
#cp /media/me/4TB_BACKUP_LBN/Compressed/MTD/Myotis_lucifugus/Myotis_lucifugus.Myoluc2.0.dna.toplevel.fa.gz .
#wget -c $download #http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz
cp -u $download .
#bash ~/MTD/Customized_host.sh -d http://ftp.ensembl.org/pub/release-111/fasta/myotis_lucifugus/dna/Myotis_lucifugus.Myoluc2.0.dna.toplevel.fa.gz -c 59463 -g http://ftp.ensembl.org/pub/release-111/gtf/myotis_lucifugus/Myotis_lucifugus.Myoluc2.0.111.gtf.gz

#Morcego offline
#bash ~/MTD/Customized_host_offline.sh -d /media/me/4TB_BACKUP_LBN/Compressed/MTD/Myotis_lucifugus/Myotis_lucifugus.Myoluc2.0.dna.toplevel.fa.gz -g /media/me/4TB_BACKUP_LBN/Compressed/MTD/Myotis_lucifugus/Myotis_lucifugus.Myoluc2.0.111.gtf.gz -c 59463

#Calidris pugnax
#bash  ~/MTD/Customized_host.sh -t 20 -d https://ftp.ensembl.org/pub/release-111/fasta/calidris_pugnax/dna/Calidris_pugnax.ASM143184v1.dna.toplevel.fa.gz -c 198806 -g https://ftp.ensembl.org/pub/release-111/gtf/calidris_pugnax/Calidris_pugnax.ASM143184v1.111.gtf.gz
#Rattus norvegicus
#bash ~/MTD/Customized_host_offline.sh -d /media/me/4TB_BACKUP_LBN/Compressed/MTD/Rattus_norvegicus/Rattus_norvegicus.mRatBN7.2.dna.toplevel.fa.gz -g /media/me/4TB_BACKUP_LBN/Compressed/MTD/-c 10116

#Gallus gallus
#bash ~/MTD/Customized_host_offline.sh -d /media/me/4TB_BACKUP_LBN/Compressed/MTD/Gallus_gallus/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.dna.toplevel.fa.gz -g /media/me/4TB_BACKUP_LBN/Compressed/MTD/Gallus_gallus/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.111.gtf.gz -c 9031

unpigz *.fa.gz
mv *.fa genome_${customized}.fa

# Extraia o nome científico da espécie baseado no Taxon_ID
species_name=$(awk -F, -v taxid="$customized" '$1 == taxid {print $3}' "$MTDIR/HostSpecies.csv")

# Verifique se o nome da espécie foi encontrado
if [ -z "$species_name" ]; then
  echo "Error: species name not found for Taxon_ID $customized."
  exit 1
fi

# Extraia o assembly_name do cabeçalho da sequência de entrada
assembly_name=$(grep -m 1 '^>' genome_${customized}.fa | sed -n 's/.*dna:primary_assembly \([^ ]*\).*/\1/p')
echo ''
echo -e "Selected host species:\e[3m $species_name\e[0m"
#echo "Selected host species:$species_name"
echo "Taxon ID: $customized"
echo ''

# Check the header format of the FASTA file
header=$(head -n 1 genome_${customized}.fa)

if [[ "$header" == *"dna:primary_assembly"* || "$header" == *"dna:genescaffold"* ]]; then
    # Ensembl format detected
    echo "Detected Ensembl format. Modifying headers for Ensembl..."
    sed -i "s/^>\(.*\) dna:.* \(.*\):\(.*\):\(.*\):\(.*\) REF$/>kraken:taxid|${customized}|\\3 ${species_name} chromosome \\3, ${assembly_name} Primary Assembly/" genome_${customized}.fa
else
    # NCBI format detected
    echo "Detected NCBI format. Modifying headers for NCBI..."
    sed -i "s/^>\(.*\) \(.*\)/>kraken:taxid|${customized}|\\1 ${species_name} \\2/" genome_${customized}.fa
fi

echo "Header modification complete."
rm -rf $MTDIR/blastdb_$customized
mkdir -p $MTDIR/blastdb_$customized
#cp genome_${customized}.fa $MTDIR/blastdb_$customized

cd ..
kraken2-build --download-taxonomy --threads $threads --db $DBNAME
kraken2-build --add-to-library $DBNAME/genome_${customized}.fa --threads $threads --db $DBNAME
kraken2-build --build --threads $threads --db $DBNAME
# download host GTF
#wget -c $gtf -P ref_${customized} -O ref_${customized}.gtf.gz
echo "Coping GTF file to $ref_${customized}/ref_${customized}.gtf.gz"
rm -rf ref_${customized}
mkdir -p ref_${customized}
cp $gtf ref_${customized}
cd ref_${customized}
mv *.gtf.gz ref_${customized}.gtf.gz

#cp /media/me/4TB_BACKUP_LBN/Compressed/MTD/Calidris_pugnax.ASM143184v1.111.gtf.gz .
#cp /media/me/4TB_BACKUP_LBN/Compressed/MTD/Myotis_lucifugus/Myotis_lucifugus.Myoluc2.0.111.gtf.gz .
cd ..
echo "Building indexes for hisat2"
rm -rf hisat2_index_${customized}
mkdir -p hisat2_index_${customized}
cd hisat2_index_${customized}
cp ../ref_${customized}/ref_${customized}.gtf.gz .
gzip -d *.gtf.gz
mv *.gtf genome.gtf
python $dir/Installation/hisat2_extract_splice_sites.py genome.gtf > genome.ss
python $dir/Installation/hisat2_extract_exons.py genome.gtf > genome.exon
mv ../$DBNAME/genome_${customized}.fa genome.fa
hisat2-build -p $threads --exon genome.exon --ss genome.ss genome.fa genome_tran
cd ..

echo "Creating blast databases for custom reference $customized"
mkdir -p $MTDIR/blastdb_$customized
cd $MTDIR/blastdb_$customized
#cp $DBNAME/genome_${customized}.fa .
cp $download .
gunzip *.fa.gz
mv *.fa blastdb_$customized

makeblastdb -in $MTDIR/blastdb_$customized/blastdb_$customized -dbtype nucl -out $MTDIR/blastdb_$customized/blastdb_$customized -parse_seqids

echo "Creating the annotation package for R412"
echo -e "Selected host species:\e[3m $species_name\e[0m"
echo "Taxon ID: $customized"
echo ''
conda deactivate
#conda activate R412
cd $MTDIR
rm -rf NCBI org.*eg*
################################################################################################
#Instalar o AnnotationDBI 1.68
R -e "install.packages('~/MTD/update_fix/pvr_pkg/AnnotationDbi_1.66.0.tar.gz', repos=NULL, type='source', Ncpus=$threads)"
R -e "install.packages('~/MTD/update_fix/pvr_pkg/GO.db_3.19.1.tar.gz', repos=NULL, type='source', Ncpus=$threads)"
R --no-restore -e 'packageVersion("AnnotationDbi")'
R --no-restore -e 'packageVersion("GO.db")'

# Verificar se a pasta NBCI do makeOrgPackageFRomNCBI
bash $MTDIR/update_fix/verify_NCBI_makeOrgPackageFromNCBI.sh --MakeOrgpkgDir $offline_files_folder

# Run the R script and capture stdout/stderr

RSCRIPT_OUTPUT=$(Rscript $MTDIR/create_annotation_package.R --taxid $customized --offline $offline_files_folder --copy 2>&1)

# Optional: show output
echo "$RSCRIPT_OUTPUT"

# Extract the last line with the .sqlite file name
SQLITE_NAME=$(echo "$RSCRIPT_OUTPUT" | grep -oE 'org\.[A-Za-z]+\.eg\.sqlite' | tail -n1)

# Convert the .sqlite to .db (as expected by install.packages)
PKG_NAME="${SQLITE_NAME/sqlite/db}"

if [ -n "$PKG_NAME" ]; then
    echo "Installing R package: $PKG_NAME"
    R -e "install.packages('$PKG_NAME', repos = NULL, type = 'source')"
    rm -rf NCBI org.*eg*
else
    echo "Error: Could not extract the package name from the R script output."
    exit 1
fi

################################################################################################

echo "Customized host reference building is done"