From 270c5f2d1eaa272198c5710ac1617fa1f9f7cbb5 Mon Sep 17 00:00:00 2001 From: zimuwang Date: Mon, 29 Jun 2026 16:13:54 +0800 Subject: [PATCH] Add Docker-native container image --- README.md | 4 ++ containers/docker/README.md | 61 +++++++++++++++++++++++++++ containers/docker/standard.Dockerfile | 54 ++++++++++++++++++++++++ 3 files changed, 119 insertions(+) create mode 100644 containers/docker/README.md create mode 100644 containers/docker/standard.Dockerfile diff --git a/README.md b/README.md index 3ffc2125..e2f870e0 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,10 @@ export GEMINI_API_KEY="your-key" bash src/commit_utils/commit.sh ``` +For cloud GPU environments that run workloads directly in Docker and do not +expose Apptainer/Singularity, see the Docker-native image build in +[`containers/docker`](containers/docker/README.md). + Currently, we only support the HTCondor job scheduler. [Harbor](https://github.com/harbor-framework/harbor) support is planned. #### API-based agents diff --git a/containers/docker/README.md b/containers/docker/README.md new file mode 100644 index 00000000..6189dcf5 --- /dev/null +++ b/containers/docker/README.md @@ -0,0 +1,61 @@ +# Docker-native container image + +This directory contains a Docker-native build of the standard PostTrainBench +environment. It is useful for cloud GPU environments that already run workloads +inside Docker containers and do not expose Apptainer/Singularity to the user. +Runpod Pods are one example of this deployment style. + +This image is only the execution environment. It does not replace the current +HTCondor submission scripts, and the existing Apptainer flow remains the +canonical path for cluster runs. + +## Build + +Build from the repository root so the Dockerfile can read +`containers/requirements-direct.txt`: + +```bash +docker build \ + -f containers/docker/standard.Dockerfile \ + -t posttrainbench-standard:docker . 
+``` + +## Smoke test + +On a machine with an NVIDIA GPU and the NVIDIA Container Toolkit (`-i` keeps stdin open so the heredoc reaches Python): + +```bash +docker run --rm -i --gpus all posttrainbench-standard:docker \ + python - <<'PY' +from importlib.metadata import version + +import torch +import vllm +import inspect_ai + +print("cuda_available=", torch.cuda.is_available()) +print("device_count=", torch.cuda.device_count()) +print("vllm=", version("vllm")) +print("inspect_ai=", version("inspect-ai")) +PY +``` + +To work with a local PostTrainBench checkout: + +```bash +docker run --rm -it --gpus all \ + -v "$PWD:/workspace/PostTrainBench" \ + -w /workspace/PostTrainBench \ + posttrainbench-standard:docker \ + bash +``` + +## Notes + +- The image uses Python 3.11 in `/opt/posttrainbench-venv` because current + `inspect_evals` releases require Python 3.11 or newer. +- The image installs the same CLI agent tools as `containers/standard.def`: + Claude Code, Codex CLI, Gemini CLI, and OpenCode. +- If a cloud provider requires SSH access, add that provider-specific SSH setup + in a downstream image or through the provider template. SSH is intentionally + not part of this base Docker image. 
diff --git a/containers/docker/standard.Dockerfile b/containers/docker/standard.Dockerfile
new file mode 100644
index 00000000..f65225ae
--- /dev/null
+++ b/containers/docker/standard.Dockerfile
@@ -0,0 +1,77 @@
+# Docker-native build of the standard PostTrainBench execution environment.
+# Build from the repository root so the COPY of containers/requirements-direct.txt
+# resolves:
+#   docker build -f containers/docker/standard.Dockerfile -t posttrainbench-standard:docker .
+FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
+
+# bash + pipefail so a failing download in a `curl ... | sh` pipeline aborts the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# NOTE(review): DEBIAN_FRONTEND is kept in the runtime environment rather than a
+# build-only ARG; presumably so unattended apt-get calls inside the container do
+# not block on prompts - confirm this is intended.
+ENV DEBIAN_FRONTEND=noninteractive \
+    NO_PROXY=localhost,127.0.0.1 \
+    no_proxy=localhost,127.0.0.1 \
+    PYTHONNOUSERSITE=1 \
+    PYTHONUNBUFFERED=1 \
+    VIRTUAL_ENV=/opt/posttrainbench-venv
+# PATH stays in its own ENV: substitution inside one ENV instruction uses the values
+# from before that instruction, so ${VIRTUAL_ENV} must already be set.
+# /root/.local/bin is presumably where the uv installer below places the uv binary.
+ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}"
+
+# Base OS toolchain. /tmp is forced to mode 1777 (world-writable, sticky).
+RUN chmod 1777 /tmp \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+        git \
+        python3 \
+        python3-dev \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Node.js 22 from NodeSource; provides npm for the agent CLIs installed below.
+RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# uv, a uv-managed Python 3.11, and the virtualenv that PATH points at.
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && uv python install 3.11 \
+    && uv venv "${VIRTUAL_ENV}" --python 3.11
+
+# vLLM is installed in its own layer, ahead of the requirements file, so edits to
+# that file do not invalidate this layer.
+RUN uv pip install --no-cache vllm==0.11.0
+
+# Agent CLIs, pinned. The npm download cache is dropped in the same layer so it
+# does not end up in the image.
+RUN npm install -g \
+        @anthropic-ai/claude-code@2.0.55 \
+        @openai/codex@0.79.0 \
+        @google/gemini-cli@0.18.4 \
+        opencode-ai@1.1.59 \
+    && npm cache clean --force
+
+# Project requirements, then flash_attn installed without build isolation.
+COPY containers/requirements-direct.txt /tmp/posttrainbench-requirements.txt
+RUN uv pip install --no-cache -r /tmp/posttrainbench-requirements.txt \
+    && uv pip install --no-cache flash_attn --no-build-isolation \
+    && rm /tmp/posttrainbench-requirements.txt
+
+# inspect_evals from source. INSPECT_EVALS_REF may name a branch or tag to pin the
+# checkout (--branch does not accept a bare commit SHA); when empty, the
+# repository's default branch is cloned.
+ARG INSPECT_EVALS_REF=""
+RUN git clone --depth=1 ${INSPECT_EVALS_REF:+--branch "${INSPECT_EVALS_REF}"} \
+        https://github.com/UKGovernmentBEIS/inspect_evals.git /opt/inspect_evals
+WORKDIR /opt/inspect_evals
+RUN uv pip install --no-cache .
+
+WORKDIR /workspace
+CMD ["bash"]