#!/bin/bash -ex
# Setup everything for using mmseqs locally
# Set MMSEQS_NO_INDEX to skip the index creation step (the precomputed index is not useful for colabfold_search in most cases)
ARIA_NUM_CONN=8
WORKDIR="${1:-$(pwd)}"
PDB_SERVER="${2:-"rsync.wwpdb.org::ftp"}"
PDB_PORT="${3:-"33444"}"
# Set the fourth argument to a non-empty value to do the initial download of the PDB through AWS.
# The latest structures are still synced through rsync afterwards.
PDB_AWS_DOWNLOAD="${4:-}"
PDB_AWS_SNAPSHOT="20240101"
UNIREF30DB="uniref30_2302"
MMSEQS_NO_INDEX=${MMSEQS_NO_INDEX:-}
DOWNLOADS_ONLY=${DOWNLOADS_ONLY:-}
GPU=${GPU:-}
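# Example invocations (illustrative only; adjust paths, mirror and port to your setup):
#   ./setup_databases.sh /data/colabfold_dbs
#   GPU=1 ./setup_databases.sh /data/colabfold_dbs                          # build GPU-compatible databases
#   ./setup_databases.sh /data/colabfold_dbs rsync.wwpdb.org::ftp 33444 1   # seed the PDB from the AWS snapshot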
mkdir -p -- "${WORKDIR}"
cd "${WORKDIR}"
hasCommand () {
    command -v "$1" >/dev/null 2>&1
}
# Print an error message and exit; used by the sanity checks and downloadFile below.
fail() {
    echo "$1" >&2
    exit 1
}
STRATEGY=""
if hasCommand aria2c; then STRATEGY="$STRATEGY ARIA"; fi
if hasCommand curl; then STRATEGY="$STRATEGY CURL"; fi
if hasCommand wget; then STRATEGY="$STRATEGY WGET"; fi
if [ "$STRATEGY" = "" ]; then
    fail "No download tool found in PATH. Please install aria2c, curl or wget."
fi
if [ -n "${PDB_AWS_DOWNLOAD}" ]; then
    if ! hasCommand aws; then
        fail "aws command not found in PATH. Please install the aws command line tool."
    fi
fi
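# Try each available downloader (aria2c, curl, wget) in order until one succeeds.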
downloadFile() {
    URL="$1"
    OUTPUT="$2"
    set +e
    for i in $STRATEGY; do
        case "$i" in
        ARIA)
            FILENAME=$(basename "${OUTPUT}")
            DIR=$(dirname "${OUTPUT}")
            aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && set -e && return 0
            ;;
        CURL)
            curl -L -o "$OUTPUT" "$URL" && set -e && return 0
            ;;
        WGET)
            wget -O "$OUTPUT" "$URL" && set -e && return 0
            ;;
        esac
    done
    set -e
    fail "Could not download $URL to $OUTPUT"
}
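# Each stage drops a marker file (e.g. DOWNLOADS_READY) so that rerunning the script skips finished steps.
# Stage 1: download the UniRef30, ColabFold environmental and PDB100 database archives.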
if [ ! -f DOWNLOADS_READY ]; then
    downloadFile "https://wwwuser.gwdg.de/~compbiol/colabfold/${UNIREF30DB}.tar.gz" "${UNIREF30DB}.tar.gz"
    downloadFile "https://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz" "colabfold_envdb_202108.tar.gz"
    downloadFile "https://wwwuser.gwdg.de/~compbiol/colabfold/pdb100_230517.fasta.gz" "pdb100_230517.fasta.gz"
    downloadFile "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb100_foldseek_230517.tar.gz" "pdb100_foldseek_230517.tar.gz"
    touch DOWNLOADS_READY
fi
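# Stage 2: mirror the PDB mmCIF archives, optionally seeded from the AWS Open Data snapshot and
# then brought up to date via rsync from the chosen wwPDB mirror.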
if [ ! -f PDB_MMCIF_READY ]; then
    mkdir -p pdb/divided
    mkdir -p pdb/obsolete
    if [ -n "${PDB_AWS_DOWNLOAD}" ]; then
        aws s3 cp --no-sign-request --recursive s3://pdbsnapshots/${PDB_AWS_SNAPSHOT}/pub/pdb/data/structures/divided/mmCIF/ pdb/divided/
        aws s3 cp --no-sign-request --recursive s3://pdbsnapshots/${PDB_AWS_SNAPSHOT}/pub/pdb/data/structures/obsolete/mmCIF/ pdb/obsolete/
    fi
    rsync -rlpt -v -z --delete --port=${PDB_PORT} ${PDB_SERVER}/data/structures/divided/mmCIF/ pdb/divided
    rsync -rlpt -v -z --delete --port=${PDB_PORT} ${PDB_SERVER}/data/structures/obsolete/mmCIF/ pdb/obsolete
    touch PDB_MMCIF_READY
fi
if [ -n "$DOWNLOADS_ONLY" ]; then
exit 0
fi
# Make MMseqs2 merge the databases to avoid spamming the folder with files
export MMSEQS_FORCE_MERGE=1
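# GPU mode: have tsv2exprofiledb build GPU-compatible databases and restrict createindex
# to a single split with a reduced index subset.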
GPU_PAR=""
GPU_INDEX_PAR=""
if [ -n "${GPU}" ]; then
GPU_PAR="--gpu 1"
GPU_INDEX_PAR=" --split 1 --index-subset 2"
if ! mmseqs --help | grep -q 'gpuserver'; then
echo "The installed MMseqs2 has no GPU support, update to at least release 16"
exit 1
fi
fi
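# Stage 3: unpack UniRef30, convert it to an expandable profile database and
# (unless MMSEQS_NO_INDEX is set) precompute the search index.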
if [ ! -f UNIREF30_READY ]; then
    tar xzvf "${UNIREF30DB}.tar.gz"
    mmseqs tsv2exprofiledb "${UNIREF30DB}" "${UNIREF30DB}_db" ${GPU_PAR}
    if [ -z "$MMSEQS_NO_INDEX" ]; then
        mmseqs createindex "${UNIREF30DB}_db" tmp1 --remove-tmp-files 1 ${GPU_INDEX_PAR}
    fi
    if [ -e "${UNIREF30DB}_db_mapping" ]; then
        ln -sf "${UNIREF30DB}_db_mapping" "${UNIREF30DB}_db.idx_mapping"
    fi
    if [ -e "${UNIREF30DB}_db_taxonomy" ]; then
        ln -sf "${UNIREF30DB}_db_taxonomy" "${UNIREF30DB}_db.idx_taxonomy"
    fi
    touch UNIREF30_READY
fi
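# Stage 4: same conversion and optional indexing for the ColabFold environmental database.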
if [ ! -f COLABDB_READY ]; then
    tar xzvf "colabfold_envdb_202108.tar.gz"
    mmseqs tsv2exprofiledb "colabfold_envdb_202108" "colabfold_envdb_202108_db" ${GPU_PAR}
    # TODO: split memory value for createindex?
    if [ -z "$MMSEQS_NO_INDEX" ]; then
        mmseqs createindex "colabfold_envdb_202108_db" tmp2 --remove-tmp-files 1 ${GPU_INDEX_PAR}
    fi
    touch COLABDB_READY
fi
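# Stage 5: build the PDB100 sequence database; GPU mode needs a padded sequence database (makepaddedseqdb).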
if [ ! -f PDB_READY ]; then
    if [ -n "${GPU}" ]; then
        mmseqs createdb pdb100_230517.fasta.gz pdb100_230517_tmp
        mmseqs makepaddedseqdb pdb100_230517_tmp pdb100_230517
        mmseqs rmdb pdb100_230517_tmp
    else
        mmseqs createdb pdb100_230517.fasta.gz pdb100_230517
    fi
    if [ -z "$MMSEQS_NO_INDEX" ]; then
        mmseqs createindex pdb100_230517 tmp3 --remove-tmp-files 1 ${GPU_INDEX_PAR}
    fi
    touch PDB_READY
fi
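# Stage 6: extract the PDB100 A3M files used for template alignments.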
if [ ! -f PDB100_READY ]; then
    tar xzvf pdb100_foldseek_230517.tar.gz pdb100_a3m.ffdata pdb100_a3m.ffindex
    touch PDB100_READY
fi