Skip to content

Commit b0a9498

Browse files
authored
Merge pull request #2 from kdroidFilter/feature/docker-support
Add Docker support for running export pipeline locally
2 parents f9cb524 + 7e82488 commit b0a9498

9 files changed

Lines changed: 222 additions & 13 deletions

.dockerignore

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
# Paths excluded from the Docker build context.

# Version-control metadata
.git
.gitignore

# Editor / IDE project files
.idea
.vscode
*.iml

# Generated exports and archives
exports/
output/
*.tar.zst
*.part*

# Sefaria checkout (05_clone_sefaria_project.sh clones it when it is absent)
Sefaria-Project/

# OS-generated files
.DS_Store
Thumbs.db

# Docker build definitions
docker-compose*.yml
Dockerfile*
.dockerignore

01_compute_timestamp.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail

# Compute the release timestamp (format YYYY-MM-DD_HH-MM) in the time zone
# named by TZ_NAME, defaulting to Asia/Jerusalem.
#
# Outputs:
#   ts.txt         - written in the current directory, one line with the stamp
#   TS_STAMP       - exported for this process and any children it starts
#   GITHUB_OUTPUT  - receives "stamp=<value>" when that variable is set

# Assign first and export separately: `export VAR="$(cmd)"` would discard the
# exit status of the command substitution, so a failing `date` would go
# unnoticed despite `set -e`.
TS_STAMP="$(TZ="${TZ_NAME:-Asia/Jerusalem}" date '+%Y-%m-%d_%H-%M')"
export TS_STAMP

printf '%s\n' "$TS_STAMP" > ts.txt
echo "Timestamp: $TS_STAMP"

# Export to GITHUB_OUTPUT if running in GitHub Actions
if [ -n "${GITHUB_OUTPUT:-}" ]; then
  echo "stamp=$TS_STAMP" >> "${GITHUB_OUTPUT}"
fi

05_clone_sefaria_project.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
#!/usr/bin/env bash
set -euo pipefail

# Ensure a Sefaria-Project checkout exists in the current directory, then
# print the first 50 lines of its directory listing.

if [ -d "Sefaria-Project" ]; then
  # An existing directory is reused as-is; no fetch or update is attempted.
  echo "Sefaria-Project already exists, skipping clone"
else
  git clone --depth 1 https://github.com/Sefaria/Sefaria-Project.git
fi

# Use sed rather than `head -n 50`: head exits after 50 lines, which can kill
# `ls` with SIGPIPE and, under `set -o pipefail`, fail the whole script.
# sed reads its entire input, so `ls` always finishes normally.
ls -la Sefaria-Project | sed -n '1,50p'

09_create_exports_dir.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail

# Create the exports directory and publish its location.
#
# Location: SEFARIA_EXPORT_PATH when set and non-empty; otherwise the
# "exports" folder under GITHUB_WORKSPACE, or under the current directory
# when GITHUB_WORKSPACE is unset or empty.
EXPORTS_DIR="${SEFARIA_EXPORT_PATH:-${GITHUB_WORKSPACE:-$PWD}/exports}"
mkdir -p "${EXPORTS_DIR}"

SEFARIA_EXPORT_BASE="${EXPORTS_DIR}"
export SEFARIA_EXPORT_BASE
printf '%s\n' "Exports directory: ${EXPORTS_DIR}"

# When GITHUB_ENV is set (GitHub Actions), append the variable to that file.
if [[ -n "${GITHUB_ENV:-}" ]]; then
  printf '%s\n' "SEFARIA_EXPORT_BASE=${EXPORTS_DIR}" >> "${GITHUB_ENV}"
fi

11_wait_for_mongodb.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
#!/usr/bin/env bash
set -euo pipefail

# Wait until a TCP connection to MongoDB succeeds.
#
# Environment:
#   MONGO_HOST  host to probe (default 127.0.0.1)
#   MONGO_PORT  port to probe (default 27017)
#
# Exit status: 0 as soon as the port accepts a connection, 1 after 60
# unsuccessful attempts.
MONGO_HOST="${MONGO_HOST:-127.0.0.1}"
MONGO_PORT="${MONGO_PORT:-27017}"

for _ in {1..60}; do
  # -w 2 bounds each connection attempt; without it a probe against an
  # unroutable or filtered address can block for the full OS connect timeout.
  if nc -z -w 2 "$MONGO_HOST" "$MONGO_PORT"; then
    echo "✅ MongoDB reachable at $MONGO_HOST:$MONGO_PORT"
    exit 0
  fi
  echo "⏳ Waiting for MongoDB at $MONGO_HOST:$MONGO_PORT..."
  sleep 2
done

echo "❌ MongoDB not reachable in time" >&2
exit 1

Dockerfile

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
# Image that runs the Sefaria export pipeline scripts against a MongoDB
# instance reachable at MONGO_HOST:MONGO_PORT.
FROM ubuntu:22.04

# Run every RUN step under bash with pipefail so that a failure on the left
# side of a pipe (e.g. `curl ... | python3.9`) fails the build instead of
# being masked by the last command's exit status.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# NOTE(review): DEBIAN_FRONTEND is kept as a runtime ENV because the pipeline
# scripts (e.g. 06_install_build_deps.sh) may install packages when the
# container runs — confirm; if they do not, this can become a build-time ARG.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Jerusalem \
    DJANGO_SETTINGS_MODULE=sefaria.settings \
    MONGO_HOST=mongodb \
    MONGO_PORT=27017 \
    MONGO_DB_NAME=sefaria \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install base system dependencies and add the deadsnakes PPA for Python 3.9.
# tzdata is listed explicitly so the Asia/Jerusalem zone resolves even if no
# other package pulls it in. pip is bootstrapped with get-pip.py; `curl -f`
# makes an HTTP error fail the step rather than piping an error page to Python.
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        gpg-agent \
        software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y --no-install-recommends \
        aria2 \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        jq \
        libpq-dev \
        libre2-dev \
        netcat-openbsd \
        ninja-build \
        pybind11-dev \
        python3.9 \
        python3.9-dev \
        python3.9-distutils \
        python3.9-venv \
        sudo \
        tar \
        tzdata \
        unzip \
        wget \
        zstd \
    && curl -fsSL https://bootstrap.pypa.io/get-pip.py | python3.9 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Make python3.9 the default python
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1

# Install MongoDB Database Tools for the image architecture. Architectures
# other than amd64/arm64 abort the build instead of silently receiving x86_64
# binaries that could not run.
ENV TOOLS_VER=100.9.4
RUN case "$(dpkg --print-architecture)" in \
        amd64) MONGO_ARCH="x86_64" ;; \
        arm64) MONGO_ARCH="arm64" ;; \
        *) echo "Unsupported architecture: $(dpkg --print-architecture)" >&2; exit 1 ;; \
    esac \
    && TOOLS_PKG="mongodb-database-tools-ubuntu2204-${MONGO_ARCH}-${TOOLS_VER}" \
    && wget -q "https://fastdl.mongodb.org/tools/db/${TOOLS_PKG}.tgz" \
    && tar -xzf "${TOOLS_PKG}.tgz" \
    && mv "${TOOLS_PKG}"/bin/* /usr/local/bin/ \
    && rm -rf "${TOOLS_PKG}" "${TOOLS_PKG}.tgz"

# Install GitHub CLI (optional, for releases)
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
        | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
    && chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
        > /etc/apt/sources.list.d/github-cli.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends gh \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy all scripts and Python files
COPY *.sh *.py ./

# Make all scripts executable and create the exports directory
RUN chmod +x ./*.sh \
    && mkdir -p /app/exports
ENV SEFARIA_EXPORT_PATH=/app/exports

# Create the entrypoint script. printf '%s\n' writes one line per argument
# verbatim, so the result does not depend on how the shell's `echo` treats
# backslash escapes. Single quotes keep $MONGO_* unexpanded until run time.
RUN printf '%s\n' \
        '#!/bin/bash' \
        'set -e' \
        '' \
        'echo "=== Sefaria Export Pipeline ==="' \
        'echo "MongoDB: $MONGO_HOST:$MONGO_PORT"' \
        'echo "Database: $MONGO_DB_NAME"' \
        'echo ""' \
        '' \
        '# Wait for MongoDB' \
        'echo "Waiting for MongoDB..."' \
        './11_wait_for_mongodb.sh' \
        '' \
        '# Run the export pipeline' \
        'echo "Starting export pipeline..."' \
        '' \
        './01_compute_timestamp.sh' \
        './04_download_small_dump.sh' \
        './05_clone_sefaria_project.sh' \
        './06_install_build_deps.sh || true' \
        './07_pip_install_requirements.sh || ./08_fallback_built_google_re2.sh' \
        './09_create_exports_dir.sh' \
        './10_create_local_settings.sh' \
        './12_restore_db_from_dump.sh' \
        './13_check_export_module.sh' \
        './14_run_exports.sh' \
        './15_verify_exports.sh' \
        './16_drop_db.sh' \
        './17a_remove_english_in_exports.sh' \
        './17b_flatten_hebrew_in_exports.sh' \
        './17_build_combined_archive.sh' \
        './18_split_archive.sh' \
        '' \
        'echo ""' \
        'echo "=== Export complete! ==="' \
        'echo "Archives available in /app/exports"' \
        'ls -lah /app/exports/' \
        > /app/entrypoint.sh \
    && chmod +x /app/entrypoint.sh

ENTRYPOINT ["/app/entrypoint.sh"]

configure_local_settings.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ def main() -> None:
2020
with open(p, "r", encoding="utf-8") as f:
2121
s = f.read()
2222

23-
export_base = os.environ.get("SEFARIA_EXPORT_BASE", "")
24-
s = re.sub(r"SEFARIA_EXPORT_PATH\s*=.*", f'SEFARIA_EXPORT_PATH = r"{export_base}"', s)
23+
export_path = os.environ.get("SEFARIA_EXPORT_PATH", os.environ.get("SEFARIA_EXPORT_BASE", ""))
24+
s = re.sub(r"SEFARIA_EXPORT_PATH\s*=.*", f'SEFARIA_EXPORT_PATH = r"{export_path}"', s)
2525
s = re.sub(r"MONGO_HOST\s*=.*", f'MONGO_HOST = "{os.environ.get("MONGO_HOST", "127.0.0.1")}"', s)
2626
s = re.sub(r"MONGO_PORT\s*=.*", f'MONGO_PORT = {int(os.environ.get("MONGO_PORT", "27017"))}', s)
2727
s = re.sub(r"MONGO_DB_NAME\s*=.*", f'MONGO_DB_NAME = "{os.environ.get("MONGO_DB_NAME", "sefaria")}"', s)
@@ -35,7 +35,7 @@ def main() -> None:
3535
f.write(s)
3636

3737
print("✅ local_settings.py configured")
38-
print(f" SEFARIA_EXPORT_PATH = {export_base}")
38+
print(f" SEFARIA_EXPORT_PATH = {export_path}")
3939

4040

4141
if __name__ == "__main__":

docker-compose.yml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Local stack for the export pipeline: a MongoDB server plus the exporter
# image built from the Dockerfile in this directory.
services:
  mongodb:
    image: mongo:6.0
    container_name: sefaria-mongodb
    ports:
      # Publish on the loopback interface only: this MongoDB instance has no
      # authentication configured here, and a bare "27017:27017" mapping would
      # expose it on every host interface. The exporter reaches it over the
      # compose network as "mongodb:27017" and does not need this mapping.
      - "127.0.0.1:27017:27017"
    volumes:
      - mongodb_data:/data/db
    healthcheck:
      test: ["CMD", "mongosh", "--eval", "db.runCommand({ ping: 1 })"]
      interval: 10s
      timeout: 5s
      retries: 20

  exporter:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: sefaria-exporter
    depends_on:
      # Start the exporter only after the mongodb healthcheck passes.
      mongodb:
        condition: service_healthy
    environment:
      - TZ=Asia/Jerusalem
      - DJANGO_SETTINGS_MODULE=sefaria.settings
      - MONGO_HOST=mongodb
      - MONGO_PORT=27017
      - MONGO_DB_NAME=sefaria
      - SEFARIA_EXPORT_PATH=/app/exports
    volumes:
      - ./exports:/app/exports
      - ./output:/app/output
    # Enlarge /dev/shm for the exporter container. This setting applies to
    # this service only, not to the mongodb service.
    shm_size: 2gb

volumes:
  mongodb_data:

ensure_history_collection.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,17 @@
33
Ensure the MongoDB collection 'history' exists in the 'sefaria' database.
44
55
"""
6+
import os
67
from pymongo import MongoClient, errors
78

89

910
def main() -> None:
10-
client = MongoClient("mongodb://127.0.0.1:27017")
11-
db = client["sefaria"]
11+
host = os.environ.get("MONGO_HOST", "127.0.0.1")
12+
port = int(os.environ.get("MONGO_PORT", "27017"))
13+
db_name = os.environ.get("MONGO_DB_NAME", "sefaria")
14+
15+
client = MongoClient(host=host, port=port)
16+
db = client[db_name]
1217
try:
1318
db.create_collection("history")
1419
print("Created empty 'history' collection.")

0 commit comments

Comments
 (0)