diff --git a/.devcontainer/devcontainer.json.j2 b/.devcontainer/devcontainer.json.j2 index 29a5c5faad..fa15aa39cb 100644 --- a/.devcontainer/devcontainer.json.j2 +++ b/.devcontainer/devcontainer.json.j2 @@ -5,7 +5,7 @@ "SPDX-License-Identifier: Apache-2.0" ], "name": "Dynamo {{ framework.upper() }} Dev Container", - "remoteUser": "ubuntu", // Matches our container user + "remoteUser": "dynamo", // Matches our container user "updateRemoteUserUID": true, // Updates the UID of the remote user to match the host user, avoids permission errors "image": "dynamo:latest-{{ framework }}-local-dev", // Use the latest {{ framework.upper() }} dev image "runArgs": [ @@ -68,11 +68,11 @@ }, "mounts": [ // These are for convenience, so that the history and pre-commit cache are persisted between sessions - "source=dynamo-bashhistory,target=/home/ubuntu/.commandhistory,type=volume", - "source=dynamo-precommit-cache,target=/home/ubuntu/.cache/pre-commit,type=volume", + "source=dynamo-bashhistory,target=/home/dynamo/.commandhistory,type=volume", + "source=dynamo-precommit-cache,target=/home/dynamo/.cache/pre-commit,type=volume", // Default mounts "source=/tmp/,target=/tmp/,type=bind" // Uncomment this to reuse your Hugging Face cache - //"source=${localEnv:HOME}/.cache/huggingface,target=/home/ubuntu/.cache/huggingface,type=bind" + //"source=${localEnv:HOME}/.cache/huggingface,target=/home/dynamo/.cache/huggingface,type=bind" ] } diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index af838d4af3..cca684695b 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -54,18 +54,23 @@ runs: # Run pytest with detailed output and JUnit XML set +e # Don't exit on test failures - docker run --runtime=nvidia --rm --gpus all -w /workspace \ + docker run --runtime=nvidia --gpus all -w /workspace \ --cpus=${NUM_CPUS} \ --network host \ --name ${{ env.CONTAINER_ID }}_pytest \ - -v "$(pwd)/test-results:/test-results" \ ${{ inputs.image_tag }} \ - bash -c "pytest -v --tb=short --basetemp=/tmp --junitxml=/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\"" + bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\"" TEST_EXIT_CODE=$? echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}" + # Copy test results from container to host + docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/test-results . 
|| echo "Failed to copy test results" + + # Clean up container + docker rm -f ${{ env.CONTAINER_ID }}_pytest || echo "Failed to clean up container" + # Always continue to results processing exit 0 diff --git a/.github/workflows/container-validation-dynamo.yml b/.github/workflows/container-validation-dynamo.yml index cf84e57222..f982a2d08a 100644 --- a/.github/workflows/container-validation-dynamo.yml +++ b/.github/workflows/container-validation-dynamo.yml @@ -49,7 +49,7 @@ jobs: docker compose up -d nats-server etcd-server - name: Run Rust checks (block-manager + integration tests) run: | - docker run --rm -v ${{ github.workspace }}:/workspace -w /workspace/lib/llm \ + docker run --rm -w /workspace/lib/llm \ --name ${{ env.CONTAINER_ID }}_rust_checks \ ${{ steps.define_image_tag.outputs.image_tag }} \ bash -ec 'rustup component add rustfmt clippy && \ @@ -66,7 +66,7 @@ jobs: env: PYTEST_MARKS: "pre_merge or mypy" run: | - docker run -v ${{ github.workspace }}:/workspace -w /workspace \ + docker run -w /workspace \ --name ${{ env.CONTAINER_ID }}_pytest \ ${{ steps.define_image_tag.outputs.image_tag }} \ bash -c "pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\" " diff --git a/container/Dockerfile b/container/Dockerfile index b17c57ca7b..4623b4d84a 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -335,23 +335,6 @@ ARG ARCH_ALT ENV DYNAMO_HOME=/opt/dynamo \ CARGO_TARGET_DIR=/opt/dynamo/target -# NIXL environment variables -ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \ - NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \ - NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins -ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH} - -# Copy ucx and nixl libs -COPY --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/ -COPY --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/ -COPY --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. 
${NIXL_LIB_DIR}/ - -# Copy built artifacts -COPY --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ -COPY --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ -COPY --from=wheel_builder $CARGO_TARGET_DIR $CARGO_TARGET_DIR -COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME - RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ # required for AIC perf files @@ -361,13 +344,29 @@ RUN apt-get update -y \ clang \ libclang-dev \ protobuf-compiler \ + # sudo for dev stage + sudo \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + # Add sudo privileges to dynamo user + && echo "dynamo ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/dynamo \ + && chmod 0440 /etc/sudoers.d/dynamo + +# Create dynamo user with group 0 for OpenShift compatibility +RUN userdel -r ubuntu > /dev/null 2>&1 || true \ + && useradd -m -s /bin/bash -g 0 dynamo \ + && [ `id -u dynamo` -eq 1000 ] \ + && mkdir -p /home/dynamo/.cache \ + && chown -R dynamo: /opt/dynamo /home/dynamo /workspace \ + && chmod -R g+w /opt/dynamo /home/dynamo/.cache /workspace + +# Switch to dynamo user +USER dynamo +ENV HOME=/home/dynamo # Create and activate virtual environment ARG PYTHON_VERSION -RUN mkdir -p /opt/dynamo/venv && \ - uv venv /opt/dynamo/venv --python $PYTHON_VERSION +RUN uv venv /opt/dynamo/venv --python $PYTHON_VERSION ENV VIRTUAL_ENV=/opt/dynamo/venv \ PATH="/opt/dynamo/venv/bin:${PATH}" @@ -380,7 +379,25 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi --requirement /tmp/requirements.txt \ --requirement /tmp/requirements.test.txt -COPY benchmarks/ /opt/dynamo/benchmarks/ +# NIXL environment variables +ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \ + NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \ + NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins +ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH} + +# Copy ucx and nixl libs +COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/ +COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/ +COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/ + +# Copy built artifacts +COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ +COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ +COPY --chown=dynamo: --from=wheel_builder $CARGO_TARGET_DIR $CARGO_TARGET_DIR +COPY --chown=dynamo: --from=wheel_builder $CARGO_HOME $CARGO_HOME + +COPY --chown=dynamo: ./ /workspace/ + RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ @@ -388,16 +405,20 @@ RUN uv pip install \ if [ "$ENABLE_KVBM" = "true" ]; then \ uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \ fi \ - && cd /opt/dynamo/benchmarks \ - && UV_GIT_LFS=1 uv pip install --no-cache . \ - && cd - \ - && rm -rf /opt/dynamo/benchmarks + && cd /workspace/benchmarks \ + && UV_GIT_LFS=1 uv pip install --no-cache . 
-# Setup launch banner +# Setup launch banner in common directory accessible to all users RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \ - sed '/^#\s/d' /opt/dynamo/launch_message.txt > ~/.launch_screen && \ - echo "cat ~/.launch_screen" >> ~/.bashrc && \ - echo "source $VIRTUAL_ENV/bin/activate" >> ~/.bashrc + sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen + +# Setup environment for all users +USER root +RUN chmod 755 /opt/dynamo/.launch_screen && \ + echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \ + echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc + +USER dynamo ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] CMD [] diff --git a/container/Dockerfile.local_dev b/container/Dockerfile.local_dev index 0e67f3dc36..ed1d845b2d 100644 --- a/container/Dockerfile.local_dev +++ b/container/Dockerfile.local_dev @@ -14,8 +14,8 @@ ARG DEV_BASE="" FROM ${DEV_BASE} AS local-dev -# Don't want ubuntu to be editable, just change uid and gid. -ENV USERNAME=ubuntu +# Don't want dynamo to be editable, just change uid and gid. +ENV USERNAME=dynamo ARG USER_UID ARG USER_GID ARG WORKSPACE_DIR=/workspace @@ -50,7 +50,10 @@ RUN apt-get install -y sudo gnupg2 gnupg1 \ && echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \ && chmod 0440 /etc/sudoers.d/$USERNAME \ && mkdir -p /home/$USERNAME \ - && groupmod -g $USER_GID $USERNAME \ + # Handle GID conflicts: if target GID exists and it's not our group, remove it + && (getent group $USER_GID | grep -v "^$USERNAME:" && groupdel $(getent group $USER_GID | cut -d: -f1) || true) \ + # Create group if it doesn't exist, otherwise modify existing group + && (getent group $USERNAME > /dev/null 2>&1 && groupmod -g $USER_GID $USERNAME || groupadd -g $USER_GID $USERNAME) \ && usermod -u $USER_UID -g $USER_GID $USERNAME \ && chown -R $USERNAME:$USERNAME /home/$USERNAME \ && chsh -s /bin/bash $USERNAME diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index 48b5ba77c2..ac2c6b90f8 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -187,6 +187,17 @@ RUN git clone --depth 1 --branch ${GDRCOPY_COMMIT} https://github.com/NVIDIA/gdr # Fix DeepEP IBGDA symlink RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so +# Create dynamo user EARLY - before copying files, with group 0 for OpenShift compatibility +RUN userdel -r ubuntu > /dev/null 2>&1 || true \ + && useradd -m -s /bin/bash -g 0 dynamo \ + && [ `id -u dynamo` -eq 1000 ] \ + && mkdir -p /workspace /home/dynamo/.cache /opt/dynamo \ + && chown -R dynamo: /sgl-workspace /workspace /home/dynamo /opt/dynamo \ + && chmod -R g+w /sgl-workspace /workspace /home/dynamo/.cache /opt/dynamo + +USER dynamo +ENV HOME=/home/dynamo + # Install SGLang (requires CUDA 12.8.1 or 12.9.1) RUN python3 -m pip install --no-cache-dir --ignore-installed pip==25.3 setuptools==80.9.0 wheel==0.45.1 html5lib==1.1 six==1.17.0 \ && git clone --depth 1 --branch v${SGLANG_COMMIT} https://github.com/sgl-project/sglang.git \ @@ -202,7 +213,7 @@ RUN python3 -m pip install --no-cache-dir --ignore-installed pip==25.3 setuptool && FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin # Download and extract NVSHMEM source, clone DeepEP (use Tom's fork for GB200) -RUN --mount=type=cache,target=/var/cache/curl \ +RUN --mount=type=cache,target=/var/cache/curl,uid=1000,gid=0 \ curl --retry 3 --retry-delay 2 -fsSL -o 
/var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \ && tar -xf /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \ && mv nvshmem_src nvshmem \ @@ -345,40 +356,50 @@ COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/ COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX -ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH +ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:${HOME}/.local/bin:$PATH # Install Dynamo wheels from dynamo_base wheelhouse -COPY benchmarks/ /opt/dynamo/benchmarks/ -COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ -RUN pip install \ +COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/ +COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ +RUN python3 -m pip install \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \ && cd /opt/dynamo/benchmarks \ - && pip install --no-cache . \ + && python3 -m pip install --no-cache . \ && cd - \ && rm -rf /opt/dynamo/benchmarks # Install common and test dependencies RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ - pip install \ + python3 -m pip install \ --no-cache \ --requirement /tmp/requirements.txt \ --requirement /tmp/requirements.test.txt -## Copy attribution files and launch banner -COPY ATTRIBUTION* LICENSE /workspace/ -COPY container/launch_message.txt /workspace/launch_message.txt -RUN sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \ - echo "cat ~/.launch_screen" >> ~/.bashrc +## Copy attribution files and launch banner with correct ownership +COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/ +COPY --chown=dynamo: container/launch_message.txt /workspace/launch_message.txt + +# Setup launch banner in common directory accessible to all users +RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \ + sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen -# Copy tests, benchmarks, deploy and components for CI -COPY tests /workspace/tests -COPY examples /workspace/examples -COPY benchmarks /workspace/benchmarks -COPY deploy /workspace/deploy -COPY components/ /workspace/components/ +# Setup environment for all users +USER root +RUN chmod 755 /opt/dynamo/.launch_screen && \ + echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \ + echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc + +USER dynamo + +# Copy tests, benchmarks, deploy and components for CI with correct ownership +COPY --chown=dynamo: tests /workspace/tests +COPY --chown=dynamo: examples /workspace/examples +COPY --chown=dynamo: benchmarks /workspace/benchmarks +COPY --chown=dynamo: deploy /workspace/deploy +COPY --chown=dynamo: components/ /workspace/components/ ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] CMD [] @@ -412,6 +433,7 @@ RUN mkdir -p /opt/dynamo/venv && \ ENV VIRTUAL_ENV=/opt/dynamo/venv \ PATH="/opt/dynamo/venv/bin:${PATH}" +USER root # Install development tools and utilities RUN apt-get update -y && \ apt-get install -y 
--no-install-recommends \ @@ -450,6 +472,8 @@ RUN apt-get update -y && \ silversearcher-ag \ cloc \ locales \ + # sudo for dev stage + sudo \ # NVIDIA tools dependencies gnupg && \ echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list && \ @@ -469,10 +493,10 @@ RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format https://git # Editable install of dynamo COPY pyproject.toml README.md hatch_build.py /workspace/ -RUN pip install --no-deps -e . +RUN python3 -m pip install --no-deps -e . # Install Python development packages -RUN pip install --no-cache-dir \ +RUN python3 -m pip install --no-cache-dir \ maturin[patchelf] \ pytest \ black \ diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index 7735f160d3..151f3de68d 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -1,8 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -ARG BASE_IMAGE="nvcr.io/nvidia/pytorch" -ARG BASE_IMAGE_TAG="25.06-py3" +ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" +ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" + +ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch" +ARG PYTORCH_BASE_IMAGE_TAG="25.06-py3" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04" @@ -34,33 +37,22 @@ ARG DYNAMO_BASE_IMAGE="dynamo:latest-none" FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base # Copy artifacts from NGC PyTorch image -FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework - - +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_IMAGE_TAG} AS pytorch_base ################################################## -########## Runtime Image ######################## +########## Framework Builder Stage ############## ################################################## # -# PURPOSE: Production runtime environment -# -# This stage creates a lightweight production-ready image containing: -# - Pre-compiled TensorRT-LLM and framework dependencies -# - Dynamo runtime libraries and Python packages -# - Essential runtime dependencies and configurations -# - Optimized for inference workloads and deployment +# PURPOSE: Build TensorRT-LLM with root privileges # -# Use this stage when you need: -# - Production deployment of Dynamo with TensorRT-LLM -# - Minimal runtime footprint without build tools -# - Ready-to-run inference server environment -# - Base for custom application containers +# This stage handles TensorRT-LLM installation which requires: +# - Root access for apt operations (CUDA repos, TensorRT installation) +# - System-level modifications in install_tensorrt.sh +# - Virtual environment population with PyTorch and TensorRT-LLM # +# The completed venv is then copied to runtime stage with dynamo ownership -FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime - -WORKDIR /workspace -ENV VIRTUAL_ENV=/opt/dynamo/venv +FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework ARG ARCH_ALT ARG PYTHON_VERSION @@ -68,93 +60,27 @@ ARG ENABLE_KVBM ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins +ENV VIRTUAL_ENV=/opt/dynamo/venv +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" -# Install Python, build-essential and python3-dev as apt dependencies +# Install minimal dependencies needed for TensorRT-LLM installation RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - 
# Build tools (required for JIT kernel compilation) - build-essential \ - g++ \ - ninja-build \ - git \ - git-lfs \ - # Python runtime - CRITICAL for virtual environment to work python${PYTHON_VERSION}-dev \ python3-pip \ - # CUDA/ML libraries - libcudnn9-cuda-12 \ - # Network and communication libraries - libzmq3-dev \ - # RDMA/UCX libraries required to find RDMA devices - ibverbs-providers \ - ibverbs-utils \ - libibumad3 \ - libibverbs1 \ - libnuma1 \ - librdmacm1 \ - rdma-core \ - # OpenMPI dependencies - openssh-client \ - openssh-server \ - # System utilities - ca-certificates \ curl \ - jq \ - wget && \ + git \ + git-lfs \ + ca-certificates && \ rm -rf /var/lib/apt/lists/* -# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from framework devel image -COPY --from=framework /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc -COPY --from=framework /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++ -COPY --from=framework /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas -COPY --from=framework /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary -COPY --from=framework /usr/local/cuda/include/ /usr/local/cuda/include/ -COPY --from=framework /usr/local/cuda/nvvm /usr/local/cuda/nvvm -COPY --from=framework /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/ -COPY --from=framework /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/ -COPY --from=framework /usr/local/lib/lib* /usr/local/lib/ - -### COPY NATS & ETCD ### -# Copy nats and etcd from dynamo_base image -COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server -COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/ -# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible -ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH - -# Copy UCX from framework image as plugin for NIXL -# Copy NIXL source from framework image -# Copy dynamo wheels for gitlab artifacts -COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx -COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX -ENV PATH=/usr/local/ucx/bin:$PATH - -# Copy OpenMPI from framework image -COPY --from=framework /opt/hpcx/ompi /opt/hpcx/ompi -# Copy NUMA library from framework image -COPY --from=framework /usr/lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/ - -ENV DYNAMO_HOME=/workspace -ENV LD_LIBRARY_PATH=\ -$NIXL_LIB_DIR:\ -$NIXL_PLUGIN_DIR:\ -/usr/local/ucx/lib:\ -/usr/local/ucx/lib/ucx:\ -/opt/hpcx/ompi/lib:\ -$LD_LIBRARY_PATH -ENV PATH="${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH" -ENV OPAL_PREFIX=/opt/hpcx/ompi - -### VIRTUAL ENVIRONMENT SETUP ### +# Copy uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ -COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin # Create virtual environment RUN mkdir -p /opt/dynamo/venv && \ uv venv /opt/dynamo/venv --python $PYTHON_VERSION -# Activate virtual environment -ENV VIRTUAL_ENV=/opt/dynamo/venv \ - PATH="/opt/dynamo/venv/bin:${PATH}" - # Copy pytorch installation from NGC PyTorch ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6 ARG TORCHVISION_VER=0.22.0a0+95f10a4e @@ -167,25 +93,22 @@ ARG PACKAGING_VER=23.2 ARG FLASH_ATTN_VER=2.7.4.post1 ARG MPMATH_VER=1.3.0 -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info 
${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchgen ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchgen -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision-${TORCHVISION_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision-${TORCHVISION_VER}.dist-info -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision.libs ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision.libs -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/functorch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/functorch -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/jinja2 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/jinja2 -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/jinja2-${JINJA2_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/jinja2-${JINJA2_VER}.dist-info -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sympy ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/sympy -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sympy-${SYMPY_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/sympy-${SYMPY_VER}.dist-info -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/ -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton -COPY --from=framework /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info - - -ENV ENV=${ENV:-/etc/shinit_v2} +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchgen ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchgen +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision-${TORCHVISION_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision-${TORCHVISION_VER}.dist-info +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision.libs 
${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision.libs +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/functorch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/functorch +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/jinja2 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/jinja2 +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/jinja2-${JINJA2_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/jinja2-${JINJA2_VER}.dist-info +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sympy ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/sympy +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sympy-${SYMPY_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/sympy-${SYMPY_VER}.dist-info +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/ +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton +COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info # Install TensorRT-LLM and related dependencies ARG HAS_TRTLLM_CONTEXT @@ -235,12 +158,146 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \ fi +################################################## +########## Runtime Image ######################## +################################################## +# +# PURPOSE: Production runtime environment +# +# This stage creates a lightweight production-ready image containing: +# - Pre-compiled TensorRT-LLM and framework dependencies +# - Dynamo runtime libraries and Python packages +# - Essential runtime dependencies and configurations +# - Optimized for inference workloads and deployment +# +# Use this stage when you need: +# - Production deployment of Dynamo with TensorRT-LLM +# - Minimal runtime footprint without build tools +# - Ready-to-run inference server environment +# - Base for custom application containers +# + +FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime + +ARG ARCH_ALT +ARG PYTHON_VERSION + +WORKDIR /workspace + +ENV ENV=${ENV:-/etc/shinit_v2} +ENV VIRTUAL_ENV=/opt/dynamo/venv +ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl +ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu +ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins + +# Install Python, build-essential and python3-dev as apt dependencies +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + # Build tools + build-essential \ + g++ \ + ninja-build \ + git \ + git-lfs \ + # Python runtime - CRITICAL for virtual environment to work + python${PYTHON_VERSION}-dev \ + python3-pip \ + # CUDA/ML libraries + libcudnn9-cuda-12 \ + # 
Network and communication libraries + libzmq3-dev \ + # RDMA/UCX libraries required to find RDMA devices + ibverbs-providers \ + ibverbs-utils \ + libibumad3 \ + libibverbs1 \ + libnuma1 \ + librdmacm1 \ + rdma-core \ + # OpenMPI dependencies + openssh-client \ + openssh-server \ + # System utilities and dependencies + curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image +COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc +COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++ +COPY --from=pytorch_base /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas +COPY --from=pytorch_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary +COPY --from=pytorch_base /usr/local/cuda/include/ /usr/local/cuda/include/ +COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm +COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/ +COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/ +COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/ + +# Copy nats and etcd from dynamo_base image +COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server +COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/ +# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible +ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH + +# Copy OpenMPI from PyTorch base image +COPY --from=pytorch_base /opt/hpcx/ompi /opt/hpcx/ompi +# Copy NUMA library from PyTorch base image +COPY --from=pytorch_base /usr/lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/ + +# Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/ +COPY --from=pytorch_base /opt/hpcx /opt/hpcx +# This is needed to make libucc.so visible so pytorch can use it. +ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}" +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them +# pytorch-triton is copied after trtllm installation. 
+COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ + +# Copy uv to system /bin +COPY --from=framework /bin/uv /bin/uvx /bin/ + +# Copy libgomp.so from framework image +COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt +COPY --from=framework /usr/lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/ + +# Create dynamo user with group 0 for OpenShift compatibility +RUN userdel -r ubuntu > /dev/null 2>&1 || true \ + && useradd -m -s /bin/bash -g 0 dynamo \ + && [ `id -u dynamo` -eq 1000 ] \ + && mkdir -p /home/dynamo/.cache /opt/dynamo \ + && chown -R dynamo: /workspace /home/dynamo /opt/dynamo \ + && chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo + +# Switch to dynamo user +USER dynamo +ENV HOME=/home/dynamo +ENV DYNAMO_HOME=/workspace + +# Copy UCX from framework image as plugin for NIXL +# Copy NIXL source from framework image +# Copy dynamo wheels for gitlab artifacts +COPY --chown=dynamo: --from=dynamo_base /usr/local/ucx /usr/local/ucx +COPY --chown=dynamo: --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX + +ENV PATH="/usr/local/ucx/bin:${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH" +ENV LD_LIBRARY_PATH=\ +$NIXL_LIB_DIR:\ +$NIXL_PLUGIN_DIR:\ +/usr/local/ucx/lib:\ +/usr/local/ucx/lib/ucx:\ +/opt/hpcx/ompi/lib:\ +$LD_LIBRARY_PATH +ENV OPAL_PREFIX=/opt/hpcx/ompi + +# Copy pre-built venv with PyTorch and TensorRT-LLM from framework stage +COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV} + ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH} # Install dynamo, NIXL, and dynamo-specific dependencies -COPY benchmarks/ /opt/dynamo/benchmarks/ -COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ +COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/ +COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ @@ -261,31 +318,28 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi --requirement /tmp/requirements.txt \ --requirement /tmp/requirements.test.txt -# Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/ -COPY --from=framework /opt/hpcx /opt/hpcx -# This is needed to make libucc.so visible so pytorch can use it. -ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}" -# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container -# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them -# pytorch-triton is copied after trtllm installation. 
-COPY --from=framework /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/
+# Copy tests, benchmarks, deploy and components for CI with correct ownership
+COPY --chown=dynamo: tests /workspace/tests
+COPY --chown=dynamo: examples /workspace/examples
+COPY --chown=dynamo: benchmarks /workspace/benchmarks
+COPY --chown=dynamo: deploy /workspace/deploy
+COPY --chown=dynamo: components/ /workspace/components/
+COPY --chown=dynamo: recipes/ /workspace/recipes/
-# Copy tests, benchmarks, deploy and components for CI
-COPY tests /workspace/tests
-COPY examples /workspace/examples
-COPY benchmarks /workspace/benchmarks
-COPY deploy /workspace/deploy
-COPY components/ /workspace/components/
-COPY recipes/ /workspace/recipes/
+# Copy attribution files with correct ownership
+COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/
-# Copy attribution files
-COPY ATTRIBUTION* LICENSE /workspace/
-# Copy launch banner
-RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
-    sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
-    echo "cat ~/.launch_screen" >> ~/.bashrc && \
-    echo "source $VIRTUAL_ENV/bin/activate" >> ~/.bashrc
+# Setup launch banner in common directory accessible to all users
+RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \
+    sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen
+# Setup environment for all users
+USER root
+RUN chmod 755 /opt/dynamo/.launch_screen && \
+    echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
+    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
+
+USER dynamo
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
@@ -308,6 +362,9 @@ FROM runtime AS dev
 # Don't want ubuntu to be editable, just change uid and gid.
 ARG WORKSPACE_DIR=/workspace
+# Switch to root for system package installation
+USER root
+
 # Install utilities as root
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends \
diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm
index 58d1d2adde..02acf40526 100644
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -222,15 +222,28 @@ COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
 # Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
 ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
-# Copy UCX from dev image as plugin for NIXL
-# Copy NIXL source from devr image
-# Copy dynamo wheels for gitlab artifacts
-COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx
-COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX
-ENV PATH=/usr/local/ucx/bin:$PATH
+# DeepGEMM runs nvcc for JIT kernel compilation, however the CUDA include path
+# is not properly set for compilation. Set CPATH to help nvcc find the headers.
+ENV CPATH=/usr/local/cuda/include -# Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries -COPY --from=framework /opt/vllm /opt/vllm +# Copy uv to system /bin +COPY --from=framework /bin/uv /bin/uvx /bin/ + +# Create dynamo user with group 0 for OpenShift compatibility +RUN userdel -r ubuntu > /dev/null 2>&1 || true \ + && useradd -m -s /bin/bash -g 0 dynamo \ + && [ `id -u dynamo` -eq 1000 ] \ + && mkdir -p /home/dynamo/.cache /opt/dynamo \ + && chown -R dynamo: /workspace /home/dynamo /opt/dynamo \ + && chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo + +USER dynamo +ENV HOME=/home/dynamo + +# Copy UCX and NIXL to system directories +COPY --chown=dynamo: --from=dynamo_base /usr/local/ucx /usr/local/ucx +COPY --chown=dynamo: --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX +ENV PATH=/usr/local/ucx/bin:$PATH ENV LD_LIBRARY_PATH=\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ @@ -240,19 +253,17 @@ $NIXL_PLUGIN_DIR:\ /usr/local/ucx/lib/ucx:\ $LD_LIBRARY_PATH -# DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path -# is not properly set for complilation. Set CPATH to help nvcc find the headers. -ENV CPATH=/usr/local/cuda/include - ### VIRTUAL ENVIRONMENT SETUP ### -# Copy uv and entire virtual environment from framework container -COPY --from=framework /bin/uv /bin/uvx /bin/ -COPY --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV} +# Copy entire virtual environment from framework container with correct ownership +COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV} + +# Copy vllm with correct ownership +COPY --chown=dynamo: --from=framework /opt/vllm /opt/vllm # Install dynamo, NIXL, and dynamo-specific dependencies -COPY benchmarks/ /opt/dynamo/benchmarks/ -COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ +COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/ +COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ RUN uv pip install \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ @@ -273,16 +284,23 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi --requirement /tmp/requirements.txt \ --requirement /tmp/requirements.test.txt -# Copy benchmarks, examples, and tests for CI -COPY . /workspace/ +# Copy benchmarks, examples, and tests for CI with correct ownership +COPY --chown=dynamo: . /workspace/ # Copy attribution files -COPY ATTRIBUTION* LICENSE /workspace/ -# Copy launch banner -RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \ - sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \ - echo "cat ~/.launch_screen" >> ~/.bashrc && \ - echo "source $VIRTUAL_ENV/bin/activate" >> ~/.bashrc +COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/ + +# Setup launch banner in common directory accessible to all users +RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \ + sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen + +# Setup environment for all users +USER root +RUN chmod 755 /opt/dynamo/.launch_screen && \ + echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \ + echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc + +USER dynamo ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] CMD [] @@ -305,6 +323,7 @@ FROM runtime AS dev # Don't want ubuntu to be editable, just change uid and gid. 
 ARG WORKSPACE_DIR=/workspace
+USER root
 # Install utilities as root
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends \
diff --git a/container/build.sh b/container/build.sh
index 4d6659c7e0..06a88ca501 100755
--- a/container/build.sh
+++ b/container/build.sh
@@ -555,7 +555,7 @@ build_local_dev_with_header() {
     fi
     echo "Building new local-dev image from: $dev_base_image"
-    echo "User 'ubuntu' will have UID: $USER_UID, GID: $USER_GID"
+    echo "User 'dynamo' will have UID: $USER_UID, GID: $USER_GID"
     # Show the docker command being executed if not in dry-run mode
     if [ -z "$RUN_PREFIX" ]; then
diff --git a/container/run.sh b/container/run.sh
index 2f16b3e729..05094c0a93 100755
--- a/container/run.sh
+++ b/container/run.sh
@@ -45,6 +45,7 @@ USE_NIXL_GDS=
 RUNTIME=nvidia
 WORKDIR=/workspace
 NETWORK=host
+USER=
 get_options() {
     while :; do
@@ -183,6 +184,14 @@ get_options() {
                 missing_requirement "$1"
             fi
             ;;
+        --user)
+            if [ "$2" ]; then
+                USER=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
         --dry-run)
             RUN_PREFIX="echo"
             echo ""
@@ -267,11 +276,10 @@ get_options() {
     if [ -n "$HF_HOME" ]; then
         mkdir -p "$HF_HOME"
-        # Use /home/ubuntu for local-dev target, /root for dev target.
-        if [ "$TARGET" = "local-dev" ] || [[ "$IMAGE" == *"local-dev"* ]]; then
-            HF_HOME_TARGET="/home/ubuntu/.cache/huggingface"
-        else
+        if [[ ${USER} == "root" ]] || [[ ${USER} == "0" ]]; then
             HF_HOME_TARGET="/root/.cache/huggingface"
+        else
+            HF_HOME_TARGET="/home/dynamo/.cache/huggingface"
         fi
         VOLUME_MOUNTS+=" -v $HF_HOME:$HF_HOME_TARGET"
     fi
@@ -313,6 +321,12 @@ get_options() {
         RUNTIME=""
     fi
+    if [[ ${USER} == "" ]]; then
+        USER_STRING=""
+    else
+        USER_STRING="--user ${USER}"
+    fi
+
     REMAINING_ARGS=("$@")
 }
@@ -330,6 +344,8 @@ show_help() {
     echo "  Options: 'host' (default), 'bridge', 'none', 'container:name'"
     echo "  Examples: --network bridge (isolated), --network none (no network - WARNING: breaks most functionality)"
     echo "            --network container:redis (share network with 'redis' container)"
+    echo "  [--user <user>[:<group>] specify user to run container as]"
+    echo "  Format: username or numeric UID, optionally with group/GID (e.g., 'root', '0', '1000:0')"
     echo "  [-v add volume mount]"
     echo "  [-p|--port add port mapping (host_port:container_port)]"
     echo "  [-e add environment variable]"
@@ -376,6 +392,7 @@ ${RUN_PREFIX} docker run \
     ${NIXL_GDS_CAPS} \
     --ipc host \
     ${PRIVILEGED_STRING} \
+    ${USER_STRING} \
     ${NAME_STRING} \
     ${ENTRYPOINT_STRING} \
     ${IMAGE} \