ai-dynamo · nv-tusharma · Oct 9, 2025 · Oct 10, 2025 · Oct 14, 2025 · Oct 16, 2025
@@ -183,7 +183,6 @@ async def init(runtime: DistributedRuntime, config: Config):
         "pipeline_parallel_size": config.pipeline_parallel_size,
         "moe_expert_parallel_size": config.expert_parallel_size,
         "backend": "pytorch",
-        "skip_tokenizer_init": True,
         "build_config": build_config,
         "kv_cache_config": kv_cache_config,
         "gpus_per_node": gpus_per_node,
@@ -240,8 +239,6 @@ async def init(runtime: DistributedRuntime, config: Config):
     # Populate default sampling params from the model
     tokenizer = tokenizer_factory(arg_map["model"])
     default_sampling_params = SamplingParams()
-    default_sampling_params._setup(tokenizer)
-    default_sampling_params.stop = None
     model_input = ModelInput.Tokens
 
     # Set model type based on disaggregation mode for unified frontend support

@@ -12,7 +12,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
 # for details and reproducer to manually test if the image
 # can be updated to later versions.
-ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+ARG BASE_IMAGE_TAG="25.09-cuda13.0-devel-ubuntu24.04"
 
 # Build configuration
 ARG ENABLE_KVBM=false
@@ -53,7 +53,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
 # Redeclare ARGs for this stage
 ARG ARCH
 ARG ARCH_ALT
-ARG PYTHON_VERSION
+ARG PYTHON_VERSION=3.12
 ARG USE_SCCACHE
 ARG SCCACHE_BUCKET
 ARG SCCACHE_REGION
@@ -410,6 +410,8 @@ COPY --chown=dynamo: --from=wheel_builder $CARGO_HOME $CARGO_HOME
 
 COPY --chown=dynamo: ./ /workspace/
 
+# Install Python packages
+# Install dynamo, NIXL, and dynamo-specific dependencies
 RUN uv pip install \
     /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
     /opt/dynamo/wheelhouse/ai_dynamo*any.whl \

@@ -3,21 +3,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
 ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-# TODO OPS-612: NCCL will hang with 25.03, so use 25.01 for now
-# Please check https://github.com/ai-dynamo/dynamo/pull/1065
-# for details and reproducer to manually test if the image
-# can be updated to later versions.
-ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+ARG BASE_IMAGE_TAG="25.09-cuda13.0-devel-ubuntu24.04"
+ARG RELEASE_BUILD
 ARG ENABLE_KVBM=false
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
-ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG CUDA_VERSION="12.8"
+ARG RUNTIME_IMAGE_TAG="13.0.1-runtime-ubuntu24.04"
+ARG CUDA_VERSION="13.0"
 
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.11.0"
+ARG VLLM_REF="v0.11.1rc2"
 # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.1"
-ARG TORCH_BACKEND="cu128"
+ARG FLASHINF_REF="v0.4.1"
+ARG TORCH_BACKEND="cu130"
 
 # If left blank, then we will fallback to vLLM defaults
 ARG DEEPGEMM_REF=""
@@ -81,6 +78,7 @@ RUN apt-get update -y \
         ibverbs-utils \
         libibumad-dev \
         libibverbs-dev \
+        libmlx5-1 \
         libnuma-dev \
         librdmacm-dev \
         rdma-core \
@@ -143,7 +141,13 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
         cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
         chmod +x /tmp/install_vllm.sh && \
-        /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
+        /tmp/install_vllm.sh --editable  \
+            --vllm-ref $VLLM_REF \
+            --max-jobs $MAX_JOBS \
+            --arch $ARCH \
+            --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
+            --torch-backend $TORCH_BACKEND \
+            --cuda-version $CUDA_VERSION && \
         /tmp/use-sccache.sh show-stats "vLLM";
 
 ENV LD_LIBRARY_PATH=\
@@ -206,7 +210,7 @@ RUN apt-get update && \
         # prometheus dependencies
         ca-certificates \
         # DeepGemm uses 'cuobjdump' which does not come with CUDA image
-        cuda-command-line-tools-12-8 && \
+        cuda-command-line-tools-13-0 && \
     rm -rf /var/lib/apt/lists/*
 
 # Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image

@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
 
 # Base Images
 TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
-TRTLLM_BASE_IMAGE_TAG=25.06-py3
+TRTLLM_BASE_IMAGE_TAG=25.08-py3
 
 # Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
 # we need to build the TensorRT-LLM wheel from source.
@@ -89,16 +89,17 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492"
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="736e7ee136e0d65f98704db13ab7e053803033c4" # tag v1.2.0rc1
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""
 
 # TensorRT-LLM PyPI index URL
-DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+DEFAULT_TENSORRTLLM_INDEX_URL="https://download.pytorch.org/whl/cu130"
 # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
 # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc1"
+TENSORRTLLM_INDEX_URL=""
 TENSORRTLLM_PIP_WHEEL=""
 
 
@@ -107,13 +108,13 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
 # for details and reproducer to manually test if the image
 # can be updated to later versions.
-VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+VLLM_BASE_IMAGE_TAG="25.09-cuda13.0-devel-ubuntu24.04"
 
 NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+NONE_BASE_IMAGE_TAG="25.09-cuda13.0-devel-ubuntu24.04"
 
 SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+SGLANG_BASE_IMAGE_TAG="25.09-cuda13.0-devel-ubuntu24.04"
 
 NIXL_REF=0.7.1
 NIXL_UCX_REF=v1.19.0
@@ -566,7 +567,7 @@ build_local_dev_with_header() {
         set -x
     fi
 
-    $RUN_PREFIX docker build \
+    $RUN_PREFIX docker build --progress=plain  \
         --build-arg DEV_BASE="$dev_base_image" \
         --build-arg USER_UID="$USER_UID" \
         --build-arg USER_GID="$USER_GID" \
@@ -848,15 +849,22 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
         echo "======================================"
         echo "Starting Build 1: Base Image"
         echo "======================================"
-        $RUN_PREFIX docker build -f "${SOURCE_DIR}/Dockerfile" --target dev $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE
+        # Build 1 (container/Dockerfile) does NOT use (will be removed soon):
+        #   - FRAMEWORK
+        #   - VLLM_FRAMEWORK (or TRTLLM_FRAMEWORK, SGLANG_FRAMEWORK, etc.)
+        #   - VERSION
+        #   - PYTHON_PACKAGE_VERSION
+        #   - HF_TOKEN
+        #   - MAX_JOBS
+        $RUN_PREFIX docker build --progress=plain -f "${SOURCE_DIR}/Dockerfile" --target dev $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE
         # Start framework build
         echo "======================================"
         echo "Starting Build 2: Framework Image"
         echo "======================================"
         BUILD_ARGS+=" --build-arg DYNAMO_BASE_IMAGE=${DYNAMO_BASE_IMAGE}"
-        $RUN_PREFIX docker build -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE
+        $RUN_PREFIX docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE
     else
-        $RUN_PREFIX docker build -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE
+        $RUN_PREFIX docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE
     fi
 fi
 

@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ucx-py-cu12
+ucx-py-cu13
@@ -1,10 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-
-accelerate==1.6.0
-aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
+--extra-index-url https://download.pytorch.org/whl/cu130 # this is only needed for accelerate dependencies
+accelerate
+# aiconfigurator # @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
 aiofiles
-aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
+# aiperf # @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
 av==15.0.0
 fastapi==0.120.1
 ftfy
@@ -17,7 +17,7 @@ kubernetes_asyncio
 matplotlib
 msgspec
 mypy
-nvidia-ml-py==13.580.65
+nvidia-ml-py==13.580.82
 opentelemetry-api
 opentelemetry-sdk
 pip
@@ -33,8 +33,8 @@ PyYAML
 scikit-learn
 scipy<1.14.0  # Pin scipy version for pmdarima compatibility
 sentencepiece
-tensorboard==2.19.0
-tensorboardX==2.6.2.2
+tensorboard==2.20.0
+tensorboardX==2.6.4
 transformers
 types-aiofiles
 types-PyYAML

@@ -23,11 +23,11 @@ set -ex
 
 GITHUB_URL="https://github.com"
 
-UCX_VERSION="v1.18.1"
+UCX_VERSION="v1.19.0" # suggested by Ovidiu Mara
 UCX_INSTALL_PATH="/usr/local/ucx/"
 CUDA_PATH="/usr/local/cuda"
 
-NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
+NIXL_COMMIT="9ada51f154cc3bedcf94b3a3fcdea6e9b4117284" # suggested by Ovidiu Mara
 
 UCX_REPO="https://github.com/openucx/ucx.git"
 NIXL_REPO="https://github.com/ai-dynamo/nixl.git"

@@ -13,18 +13,18 @@
 
 set -euo pipefail
 
-VLLM_REF="v0.11.0"
+VLLM_REF="v0.11.1rc2" # default only; keep in sync with the VLLM_REF build ARG passed via --vllm-ref
 
 # Basic Configurations
 ARCH=$(uname -m)
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 
 # VLLM and Dependency Configurations
-TORCH_BACKEND="cu128"
+TORCH_BACKEND="cu130"
 TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
 DEEPGEMM_REF=""
-CUDA_VERSION="12.8" # For DEEPGEMM
+CUDA_VERSION="13.0" # For DEEPGEMM
 
 # These flags are applicable when installing vLLM from source code
 EDITABLE=true
@@ -146,6 +146,7 @@ else
     # VLLM_REF does not start with 'v' or amd64 - use git checkout path
     if [ "$ARCH" = "arm64" ]; then
 
+        # TODO: the note below still describes the old torch 2.7.1+cu128 pin; rewrite it for the torch 2.9.0+cu130 pin used further down
         # torch 2.8.0 doesn't have a aarch wheel for cu128, vLLM uses torch 2.8.0 nightly wheel builds to compile its aarch wheel against
         # nightly can be unstable so we will not use it here
         # for now we will use torch 2.7.1+cu128 but this requires a recompilation from source
@@ -154,7 +155,7 @@ else
 
         # Try to install specific PyTorch version first
         echo "Attempting to install pinned PyTorch nightly versions..."
-        if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
+        if ! uv pip install torch==2.9.0+cu130 torchaudio==2.9.0+cu130 torchvision==0.24.0+cu130 --index-url https://download.pytorch.org/whl/cu130; then
             echo "Pinned versions failed"
             exit 1
         fi