Closed
121 commits
367846d
Add DFlash training implementation
Mar 19, 2026
f29ded5
Add torch dynamo config fallback for flex attention
Mar 19, 2026
fdf2822
Add DFlash framework integration: config dispatch, N-layer target mod…
Mar 19, 2026
25dc9ac
Add DFlash training quality tests, architecture validation, and GPU c…
Mar 19, 2026
b28f06c
Add 1-GPU training configs and RunPod training script
Mar 19, 2026
fb8a4e5
Fix non-interactive WandB prompt and add SSH helper
Mar 19, 2026
506298e
Use --system-site-packages venv to avoid re-downloading PyTorch
Mar 19, 2026
7e5958b
Make SGLang and vLLM imports optional in engine __init__
Mar 19, 2026
ac4f3ad
Lazy-import SGLang and vLLM engines in factory.py
Mar 19, 2026
0b9f631
Remove eval_data_path from 1-GPU configs to avoid colocate timeout
Mar 19, 2026
d2836ed
Fix DFlash config for Qwen3-8B and add inductor GEMM fallback
Mar 19, 2026
447705a
Fix inductor GEMM backend and DFlash dtype mismatch
Mar 19, 2026
2a03302
Use aot_eager backend for flex_attention compilation
Mar 19, 2026
5033d17
Cast DFlash draft model to bfloat16 before FSDP wrapping
Mar 19, 2026
d427b4d
Debug DFlash zero-loss bug: vectorize noise input, add anchor fallback
Mar 19, 2026
340837f
Track dflash_implementation_log.md in git
Mar 19, 2026
d127c38
[Feature] add Minimax support and Kimi parser updates (#45)
yubofredwang Mar 19, 2026
8a33620
Fix stale test assertion: update expected config values for Qwen3-8B
Mar 19, 2026
c59097e
Fix DFlash metric key names to match controller expectations
Mar 19, 2026
fbdd8ff
Update implementation log: Session 6 — zero-loss bug fix and GPU results
Mar 19, 2026
7ca08bc
Add DFlash inference benchmark script
Mar 19, 2026
dc75fcc
DFlash inference: checkpoint extract, 1-GPU bench config, benchmark s…
Mar 19, 2026
e4603bc
Fix 5 critical DFlash bugs: 5-layer model, Q/K-norm, block_keep_mask,…
Mar 19, 2026
18a1519
[Docs] Add 5-node MiniMax example files (#47)
yubofredwang Mar 19, 2026
c796ce8
[Docs] refresh README structure and links (#46)
yubofredwang Mar 19, 2026
004b6c4
[Docs] Add PyTorch Blog Link (#48)
yubofredwang Mar 19, 2026
947d3f5
Fix build_target_layer_ids to match SpecForge formula exactly
Mar 19, 2026
345598c
Phase B validation: fix training hyperparams and document 4-GPU results
Mar 19, 2026
636d4ef
Add min_loss_tokens filter to preprocess_conversations
Mar 19, 2026
d63001b
Add Phase C scripts: PerfectBlend data prep and full training launcher
Mar 19, 2026
07356c1
Fix collator crash when loss_mask length differs from input_ids
Mar 20, 2026
10119ac
Switch FlexAttention from aot_eager to inductor backend for 2-5x speedup
Mar 20, 2026
5ea37da
Document aot_eager vs inductor analysis and 3x speedup results
Mar 20, 2026
5e50e38
Fix minimax type mismatch (#49)
yubofredwang Mar 20, 2026
a85dde3
docker: add RunPod Dockerfile for sglang v0.5.8.post1
Mar 20, 2026
407bb7a
Add checkpoint rotation to prevent disk quota exceeded during training
Mar 20, 2026
5a0fb00
Document Phase C crash debugging: disk quota, eval timeout, checkpoin…
Mar 20, 2026
fe48866
Document training speed optimization: 6.6hr → 3.1hr with batch/anchor…
Mar 20, 2026
e3dfd80
Document checkpoint backup, resume instructions, and full attempt sum…
Mar 20, 2026
89de88e
Document complete pod restart recovery procedure for DFlash training …
Mar 20, 2026
c1b9548
Consolidate DFlash implementation log: 1697 → 588 lines
Mar 21, 2026
c42448d
Document Session 10: PyTorch 2.9.1 migration and Phase C resume from …
Mar 21, 2026
83c455b
Optimize DFlash training speed: remove compile overhead and enable GQA
Mar 21, 2026
04fc7da
Document Session 11: torch 2.9.1 speed regression investigation
Mar 21, 2026
9bac294
Document DFlash benchmark results (τ=1.86) and SpecForge comparison
Mar 21, 2026
19532a5
Merge branch 'torchspec-project:main' into feature/dflash-training
zhubohao911 Mar 21, 2026
0f7c860
Clean up DFlash branch: remove dead eval code, add comments to shared…
Mar 21, 2026
17eaecc
Fix DFlash training bugs and add missing test coverage
Mar 22, 2026
7dc18cc
dflash test plan and implementation log updates
Mar 22, 2026
15fb371
remove old implementation log
Mar 22, 2026
3f9184a
Update DFlash docs: Phase 1 complete, Issue 26 resolved, Phase D results
Mar 22, 2026
c4f6102
Document Phase 2 speed benchmark results: pipeline bottleneck identified
Mar 22, 2026
dde8b21
Update speed benchmark with instrumented timing breakdown
Mar 22, 2026
db9c556
Add compute sub-breakdown profiling instrumentation
Mar 22, 2026
407bb44
Document compute sub-breakdown profiling results
Mar 22, 2026
5f5a7cc
Document speed optimization test results (Section 2.5)
Mar 22, 2026
6581975
Add three compute speed optimizations
Mar 22, 2026
48867f3
Document compute optimization results: no_sync +8%, compile not viable
Mar 22, 2026
d941fab
Document Phase F speed optimization session results
Mar 22, 2026
89820a1
Document FSDP strategy and GPU scaling benchmark results (Tests 1-3)
Mar 22, 2026
f60f172
Increase find_free_port timeout from 30s to 120s for PyTorch 2.9+ com…
Mar 22, 2026
55d5558
Document Test 4 results: 2-inference GPU is strictly worse (1.1 step/s)
Mar 22, 2026
4a8616e
Document hardware topology analysis: NVLink unused, Mooncake TCP is b…
Mar 22, 2026
35ffb18
Add async data pre-fetch to overlap Mooncake transfer with GPU compute
Mar 22, 2026
2e8710a
Fix PrefetchedDataFetcher: use persistent thread across training steps
Mar 22, 2026
5a946cb
Fix prefetch GPU contention: stage data on CPU, move to GPU synchrono…
Mar 22, 2026
d71d71d
Document Test 5a/5b/6 results: CPU prefetch 6.8 step/s, NVLink attempt
Mar 22, 2026
39e20ea
Document Test 6 NVLink investigation (not viable) and Test 7 (3 GPU +…
Mar 22, 2026
3fe01a2
Update config to best training configuration and add Phase 3 τ benchm…
Mar 22, 2026
3b3dc5e
Update Phase 3 plan: incremental approach, train 5k steps first
Mar 22, 2026
5c73a1e
Add SpecForge DFlash training reference doc
Mar 22, 2026
a1cab4e
Document Phase 3 status, environment issues, and known workarounds
Mar 22, 2026
822d8e9
Phase 3: 2-epoch training with resume, prefetch_depth=8, auto GEMM fix
Mar 22, 2026
0d2ab60
Phase 3: switch to batch=2 accum=2 seq=2048 for 5.3 step/s throughput
Mar 22, 2026
5f02478
Debug: add bounds check for noise_ids in _create_noise_embed
Mar 22, 2026
78d8938
Fix: clone mooncake tensors before buffer cleanup to prevent use-afte…
Mar 22, 2026
eee9d52
Fix FlexAttention recompile overflow: bucketed padding + higher recom…
Mar 22, 2026
c355f86
Switch to batch=1 accum=4 for better Mooncake TCP overlap
Mar 22, 2026
8c4f2b1
Fix 5x speed regression: skip clone for batch=1 to preserve pinned me…
Mar 22, 2026
1247cf6
Add RunPod setup script consolidating all env fixes from issues/commits
Mar 22, 2026
0a45cd4
Update Dockerfile and setup script for faster pod provisioning
Mar 22, 2026
c5f9588
Add slim/full Docker image variants via INCLUDE_MODEL build arg
Mar 22, 2026
6ca24e9
Rewrite RunPod guide: Docker image + script-based setup
Mar 22, 2026
db9ad51
Add GitHub Actions workflow for Docker image build
Mar 22, 2026
276948a
Fix Docker build: remove [fa] extra (flash-attention-cute metadata mi…
Mar 22, 2026
41f31bf
Expand benchmark to 50 diverse prompts with τ distribution analysis
Mar 23, 2026
c7995ed
Document Phase 3 training results: 2-epoch run, τ=1.78 benchmark
Mar 23, 2026
0b55cbc
Extend training to 3 epochs, keep 2 checkpoints for comparison
Mar 23, 2026
7f7efc2
Add --prompt_file argument to benchmark script
Mar 23, 2026
9ffe525
Update RunPod guide: Docker image, tmux, monitoring instructions
Mar 23, 2026
df045bc
Update training results: epoch 3 confirms data saturation at τ=1.85
Mar 23, 2026
43bae6b
Add general-purpose DFlash training guide
Mar 23, 2026
e8b5e1d
Add Modal GPU platform support and speed tuning results for DFlash tr…
xinghandd Mar 24, 2026
764bd08
Add recommended parameter settings to modal_dflash_train.py docstring
xinghandd Mar 24, 2026
23e497c
Split Modal training into separate SGLang and HF scripts to avoid idl…
xinghandd Mar 24, 2026
b48adfe
Add anchors=512 speed tuning results and update recommended config
xinghandd Mar 24, 2026
5094f5e
Add 512-E/F (4+4 GPU split) results and fix GPU count display
xinghandd Mar 24, 2026
44a4f3d
Fix checkpoint saving to persist on Modal volume
xinghandd Mar 24, 2026
89e8ff6
Add post-training HF conversion and upload to Modal training script
xinghandd Mar 24, 2026
5c1c77e
Add Modal DFlash inference benchmark script
xinghandd Mar 24, 2026
3156a1a
fix: HF repo visibility and missing WandB secret in training script
xinghandd Mar 25, 2026
e00b5f3
refactor: replace hardcoded prompts with z-lab standard benchmarks
xinghandd Mar 25, 2026
9ddb506
feat: add SGLang-backend DFlash benchmark with compat fixes
xinghandd Mar 25, 2026
dc90f9d
add: HF config fix utility and 3-epoch training metrics plot
xinghandd Mar 25, 2026
036d10e
docs: add full z-lab benchmark results (Phase G) and switch to 1x H100
xinghandd Mar 25, 2026
d047c95
fix: correct training data size from 47K to 190K in benchmark results
xinghandd Mar 25, 2026
c375d72
Add min_lr + weight_decay for better convergence, prep 800K training
xinghandd Mar 25, 2026
96f0f5b
Merge branch 'feature/dd-test' into feature/dflash-training
Mar 26, 2026
2b0fe91
Phase H benchmark results, switch to SGLang inference, remove hardcod…
xinghandd Mar 27, 2026
dd4bcb0
Support HF Hub datasets in Arrow/Parquet format and drop unused columns
xinghandd Mar 27, 2026
f3005e7
docs: add decode-only speedup analysis and HF dataset patch for training
xinghandd Mar 28, 2026
c927567
fix: 3 training bugs + 3 perf optimizations from pipeline deep-dive
xinghandd Mar 28, 2026
3e15c11
Add --resume flag for checkpoint resumption and tune save settings
xinghandd Mar 29, 2026
7b2bd5e
docs: add Phase J UltraChat results and training code audit
xinghandd Mar 29, 2026
848b541
Reorganize scripts/ and consolidate dflash docs
xinghandd Mar 29, 2026
7ce2637
restructure dflash docs folder
xinghandd Mar 29, 2026
79e0f64
Add hyperparameter sweep infra and Phase K convergence results
xinghandd Mar 30, 2026
b2bf4ad
fix: isolated run_id per sweep run + resume support for multi-session…
xinghandd Apr 1, 2026
77f992d
docs: Phase K P2-accum1 benchmark results with τ distribution analysis
xinghandd Apr 1, 2026
6023d9b
docs: P2-WSD benchmark results — best model, 2.7% gap to z-lab
xinghandd Apr 2, 2026
fda638b
fix: graceful fallback for empty loss_mask in DFlash anchor sampling
xinghandd Apr 2, 2026
68 changes: 68 additions & 0 deletions .github/workflows/docker-build.yml
@@ -0,0 +1,68 @@
name: Build Docker Image

on:
workflow_dispatch:
inputs:
include_model:
description: 'Include Qwen3-8B model in image (0=slim ~4GB, 1=full ~20GB)'
required: true
default: '0'
type: choice
options:
- '0'
- '1'
tag:
description: 'Image tag (default: slim or latest based on include_model)'
required: false
type: string

jobs:
build:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Determine tag
id: tag
run: |
if [ -n "${{ inputs.tag }}" ]; then
echo "tag=${{ inputs.tag }}" >> $GITHUB_OUTPUT
elif [ "${{ inputs.include_model }}" = "1" ]; then
echo "tag=latest" >> $GITHUB_OUTPUT
else
echo "tag=slim" >> $GITHUB_OUTPUT
fi

- name: Free up disk space
run: |
# GitHub runners have ~14GB free, we need more for the build
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
sudo apt-get clean
df -h /

- name: Build and push
uses: docker/build-push-action@v5
with:
context: .
file: docker/sglang/v0.5.8.post1/Dockerfile.runpod
push: true
tags: ghcr.io/${{ github.repository_owner }}/torchspec-dflash:${{ steps.tag.outputs.tag }}
build-args: |
INCLUDE_MODEL=${{ inputs.include_model }}
cache-from: type=gha
cache-to: type=gha,mode=max
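The `Determine tag` step above maps the two workflow inputs to an image tag. A minimal Python sketch of the same decision, useful for predicting which `ghcr.io` tag a dispatch will push (hypothetical helper, not part of the repo):

```python
# Sketch of the workflow's "Determine tag" step: an explicit `tag` input wins,
# otherwise include_model=1 -> "latest" (full image) and include_model=0 -> "slim".
def determine_tag(tag_input: str, include_model: str) -> str:
    if tag_input:
        return tag_input
    if include_model == "1":
        return "latest"
    return "slim"

print(determine_tag("", "0"))    # slim  (default dispatch)
print(determine_tag("", "1"))    # latest
print(determine_tag("v2", "1"))  # v2    (explicit tag overrides)
```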
2 changes: 1 addition & 1 deletion .gitignore
@@ -88,4 +88,4 @@ _sglang/
wandb/log.txt

.claude/
wandb/
wandb/
110 changes: 69 additions & 41 deletions README.md
@@ -1,22 +1,59 @@
# TorchSpec

TorchSpec is a torch-native speculative decoding training framework. We introduce a disaggregated way of training speculative decoding draft models where inference and training are fully decoupled and stream hidden states directly from inference engine groups to distributed training workers via Mooncake (RDMA/TCP) store, allowing each side to scale independently.
TorchSpec is a torch-native speculative decoding training framework. We introduce a disaggregated way of training speculative decoding draft models where inference and training are fully decoupled and stream hidden states directly from inference engine groups to distributed training workers via [Mooncake](https://github.com/kvcache-ai/Mooncake) store, allowing each side to scale independently.

TorchSpec currently includes training flows and examples for:

- Kimi-K2.5
- MiniMax-M2.5
- Qwen3-Coder-Next

## 🚀 Blogs

- PyTorch blog: [TorchSpec: Speculative Decoding Training at Scale](https://pytorch.org/blog/torchspec-speculative-decoding-training-at-scale/)
- Release blog: [TorchSpec: Speculative Decoding Training at Scale](https://lightseek.org/blog/torchspec-speculative-decoding-training-at-scale.html)
- Released draft model: [lightseekorg/kimi-k2.5-eagle3](https://huggingface.co/lightseekorg/kimi-k2.5-eagle3)

## Table of Contents

- [Architecture Overview](#architecture-overview)
- [Quick Start](#quick-start)
- [Setup](#setup)
- [Examples](#examples)
- [Training Modes](#training-modes)
- [Checkpoint Conversion](#checkpoint-conversion)
- [Metrics Reporting](#metrics-reporting)
- [Troubleshooting](#troubleshooting)

## Architecture Overview

<p align="center">
<img src="docs/torchspec_architecture.png" alt="TorchSpec Architecture" width="100%">
</p>

## Setup
TorchSpec is built around a disaggregated training pipeline:

### Choose Your Backend
- **Inference engines** generate target-model hidden states with either vLLM or SGLang.
- **Mooncake store** transfers tensors between inference and training without materializing them on disk.
- **Training workers** consume streamed hidden states to train speculative decoding draft models.

TorchSpec supports two inference backends:
This separation keeps the training side focused on optimization while letting the inference side scale for hidden-state generation throughput.

## Quick Start

| Backend | Best For | Installation |
|---------|----------|--------------|
| **vLLM** | Flexibility, easier deployment | `./tools/build_conda.sh 1 vllm` |
| **SGLang** | Production workloads, high throughput | `./tools/build_conda.sh 1 sglang` |
| **Both** | Development, comparison testing | `./tools/build_conda.sh 1 both` |
Train an Eagle3 draft model for Qwen3-8B on a single node with 4 GPUs (2 for training and 2 for inference):

```bash
./examples/qwen3-8b-single-node/run.sh
```

Override config values directly from the CLI:

```bash
./examples/qwen3-8b-single-node/run.sh training.learning_rate=5e-5 training.num_train_steps=500
```

## Setup

### Quick Setup

@@ -31,58 +68,50 @@ micromamba activate torchspec
```

To install into your current environment instead:

```bash
./tools/build_conda.sh current sglang # or 'vllm' or 'both'
```

Optional: install Flash Attention:
Optional: install Flash Attention support:

```bash
pip install -e ".[fa]"
```

### Backend-Specific Usage

**vLLM:**
```bash
./examples/qwen3-8b-single-node/run.sh --config configs/vllm_qwen3_8b.yaml
```
**vLLM**

**SGLang:**
```bash
./examples/qwen3-8b-single-node/run.sh
./examples/qwen3-8b-single-node/run.sh --config configs/vllm_qwen3_8b.yaml
```

TorchSpec uses vLLM's **Worker Extension** mechanism to hook into the model's forward pass and capture hidden states directly in the worker processes. This avoids RPC serialization issues and enables reliable hidden states extraction.

## Quick Start

Train an Eagle3 draft model for Qwen3-8B using inference engine (4 GPUs: 2 training + 2 inference):
**SGLang**

```bash
./examples/qwen3-8b-single-node/run.sh
```

Override any config value via CLI:

```bash
./examples/qwen3-8b-single-node/run.sh training.learning_rate=5e-5 training.num_train_steps=500
```
TorchSpec uses vLLM's **Worker Extension** mechanism to hook into the model forward pass and capture hidden states directly inside worker processes, which avoids RPC serialization overhead during extraction. For SGLang, TorchSpec applies a patch to the existing codebase to enable hidden-state extraction.

## Examples

| Example | Backend | Model |
|---------|---------|-------|
| [hf-quickstart](examples/hf-quickstart/) | HuggingFace | Qwen3-8B |
| [qwen3-8b-single-node](examples/qwen3-8b-single-node/) | Inference Engine | Qwen3-8B |
| [kimi-k25-2node-h200](examples/kimi-k25-2node-h200/) | Inference Engine | Kimi-K2.5 |
| [kimi-k25-3node-h100](examples/kimi-k25-3node-h100/) | Inference Engine | Kimi-K2.5 |
| [qwen3-8b-single-node](examples/qwen3-8b-single-node/) | Inference engine | Qwen3-8B |
| [kimi-k25-2node-h200](examples/kimi-k25-2node-h200/) | Inference engine | Kimi-K2.5 |
| [kimi-k25-3node-h100](examples/kimi-k25-3node-h100/) | Inference engine | Kimi-K2.5 |
| [minimax-m25-5node-h200](examples/minimax-m25-5node-h200/) | Inference engine | MiniMax-M2.5 |

See [examples/README.md](examples/README.md) for more details about each example.

See [examples/README.md](examples/README.md) for details.
## Training Modes

## Resume Vs Continual Training
### Resume vs. Continual Training

Both modes use `training.load_path`, but they restore different state:
Both modes use `training.load_path`, but they restore different states:

| Goal | `training.load_path` | `training.continual_training` | What gets restored |
|------|----------------------|-------------------------------|--------------------|
@@ -119,11 +148,10 @@ Convert an FSDP checkpoint to HuggingFace format:
python tools/convert_to_hf.py --input-dir ./outputs/my_experiment/iter_0010000/
```

Vocabulary pruning — reducing the draft model's `lm_head` to a smaller token set and emitting `d2t`/`t2d` mappings — can be applied either **during training** (pre-pruning) or **at conversion time** (post-pruning):
Vocabulary pruning, which reduces the draft model `lm_head` to a smaller token set and emits `d2t` and `t2d` mappings, can be applied either during training or at conversion time.

- **Pre-pruning**: set `draft_vocab_size` in your training config. The checkpoint already contains the pruned `lm_head` and `d2t`/`t2d` buffers. Use the basic conversion command above — no extra flags needed.

- **Post-pruning**: train with the full vocabulary, then prune at conversion time by passing `--prune-vocab` along with a representative dataset to compute token frequencies:
- **Pre-pruning**: set `draft_vocab_size` in your training config. The checkpoint already contains the pruned `lm_head` and `d2t`/`t2d` buffers, so the basic conversion command is enough.
- **Post-pruning**: train with the full vocabulary, then pass `--prune-vocab` at conversion time together with a representative dataset to compute token frequencies.

```bash
python tools/convert_to_hf.py \
@@ -140,27 +168,27 @@ Pass `--cache-dir ./cache` to reuse the tokenized dataset cache from training.

## Metrics Reporting

W&B logging is disabled by default (`report_to: none`). To enable it, set `report_to: wandb` in your config and supply your API key.
W&B logging is disabled by default with `report_to: none`. To enable it, set `report_to: wandb` in your config and provide your API key.

## Troubleshooting

Set `TORCHSPEC_LOG_LEVEL=DEBUG` for verbose logging when diagnosing issues:
Set `TORCHSPEC_LOG_LEVEL=DEBUG` for more verbose logging when diagnosing issues:

```bash
TORCHSPEC_LOG_LEVEL=DEBUG ./examples/qwen3-8b-single-node/run.sh
```

### Per-Rank File Logging

Set `TORCHSPEC_LOG_DIR` to an <b> absolute path </b> on a shared filesystem (NFS) to enable per-rank log files for every Ray actor (training and inference):
Set `TORCHSPEC_LOG_DIR` to an absolute path on a shared filesystem (NFS) to enable per-rank log files for every Ray actor on both training and inference:

```bash
export TORCHSPEC_LOG_DIR=/my_project/running_logs
```

This creates a structured directory with one file per actor, organized by role and node:

```
```text
running_logs/
training/
10.0.0.1/
Expand All @@ -175,7 +203,7 @@ running_logs/
inference_g0_rank1_20260301_080015.log
```

The path must be an absolute path on a shared filesystem (NFS) accessible from all nodes. If `TORCHSPEC_LOG_DIR` is not set or the path is not writable, per-rank file logging is disabled and only Ray's default stdout/stderr capture is used.
The path must be absolute and writable from all nodes. If `TORCHSPEC_LOG_DIR` is unset or not writable, per-rank file logging stays disabled and Ray falls back to stdout/stderr capture.

| Issue | Reference |
|-------|-----------|
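The README's quick start shows CLI overrides like `training.learning_rate=5e-5`. A sketch of how such dotted keys can be folded into a nested config dict (hypothetical helper, not TorchSpec's actual parser, which may also coerce value types):

```python
# Hypothetical sketch of dotted-key CLI overrides (not TorchSpec's real parser).
def apply_overrides(cfg: dict, overrides: list[str]) -> dict:
    for item in overrides:
        dotted_key, value = item.split("=", 1)
        *parents, leaf = dotted_key.split(".")
        node = cfg
        for key in parents:
            node = node.setdefault(key, {})  # create nested sections on demand
        node[leaf] = value  # a real parser would also coerce "500" -> int, etc.
    return cfg

cfg = apply_overrides({}, ["training.learning_rate=5e-5", "training.num_train_steps=500"])
print(cfg["training"]["num_train_steps"])  # 500 (kept as a string in this sketch)
```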
2 changes: 1 addition & 1 deletion configs/draft_models/kimi_k25_eagle3.json
@@ -16,7 +16,7 @@
"model_type": "llama",
"num_attention_heads": 64,
"num_hidden_layers": 1,
"num_key_value_heads": 16,
"num_key_value_heads": 64,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
27 changes: 27 additions & 0 deletions configs/draft_models/minimax_m25_eagle3.json
@@ -0,0 +1,27 @@
{
"architectures": [
"LlamaForCausalLMEagle3"
],
"attention_bias": false,
"attention_dropout": 0.0,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 3072,
"initializer_range": 0.02,
"intermediate_size": 9216,
"max_position_embeddings": 196608,
"model_type": "llama",
"num_attention_heads": 48,
"num_hidden_layers": 1,
"num_key_value_heads": 48,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 5000000,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.51.0",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 200064
}
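The new MiniMax draft config pairs `hidden_size: 3072` with 48 heads of explicit `head_dim: 128`, so the attention projection width (48 × 128 = 6144) deliberately exceeds `hidden_size`, and `num_key_value_heads == num_attention_heads` means plain multi-head attention rather than GQA. A small sanity-check sketch, with values copied from the JSON above:

```python
# Sanity-checking attention geometry in the minimax_m25_eagle3.json draft config.
cfg = {
    "hidden_size": 3072,
    "head_dim": 128,
    "num_attention_heads": 48,
    "num_key_value_heads": 48,
}

# GQA group size: query heads per KV head (1 means full multi-head attention).
groups = cfg["num_attention_heads"] // cfg["num_key_value_heads"]
print(groups)  # 1

# With an explicit head_dim, the Q projection is num_heads * head_dim wide,
# which need not equal hidden_size.
q_width = cfg["num_attention_heads"] * cfg["head_dim"]
print(q_width)  # 6144
```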
60 changes: 60 additions & 0 deletions configs/hf_qwen3_8b_1gpu.yaml
@@ -0,0 +1,60 @@
# Eagle3 training config — single GPU (colocate mode)
#
# GPU allocation:
# - 1 GPU shared between inference (HF) and training (no FSDP)
# - Requires H100 80GB or A100 80GB
#
# Usage:
# python -m torchspec.train_entry --config configs/hf_qwen3_8b_1gpu.yaml

model:
target_model_path: Qwen/Qwen3-8B
trust_remote_code: true

dataset:
train_data_path: ../examples/data/sample_conversations.jsonl
chat_template: qwen
prompt_key: conversations

training:
attention_backend: flex_attention
colocate: true
micro_batch_size: 1
draft_accumulation_steps: 1
learning_rate: 1e-4
max_concurrent_batches: 1
max_grad_norm: 0.5
max_seq_length: 4096
num_epochs: 1
seed: 42
training_num_gpus_per_node: 1
training_num_nodes: 1
ttt_length: 7
save_per_epoch: true
warmup_ratio: 0.015
train_env_vars: '{"TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS": "ATEN,TRITON"}'

inference:
inference_engine_type: hf
inference_num_gpus: 1
inference_num_gpus_per_engine: 1
inference_num_gpus_per_node: 1
max_sample_pool_size: 32
inference_buffer_threshold: 16
inference_batch_size: 4

mooncake:
master_server_address: null
metadata_server: null
protocol: tcp
global_segment_size: 16GB
local_buffer_size: 4GB

output_dir: ./outputs/qwen3-8b-eagle3-1gpu
cache_dir: ./cache/qwen3-8b-1gpu
model_download_dir: null

debug:
save_debug_train_data: null
debug_train_only: false
debug_inference_only: false
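This config runs inference and training on the same device (`colocate: true`, one 80GB GPU). A rough sketch of the GPU accounting implied here versus the disaggregated quick-start layout; the helper and its formula are assumptions for reasoning about configs, not TorchSpec code:

```python
# Hypothetical GPU accounting: in colocate mode inference shares the training
# GPUs, while disaggregated runs dedicate separate GPUs to each side.
def total_gpus(training_gpus: int, inference_gpus: int, colocate: bool) -> int:
    return max(training_gpus, inference_gpus) if colocate else training_gpus + inference_gpus

print(total_gpus(1, 1, colocate=True))   # 1  (this config: one shared H100/A100 80GB)
print(total_gpus(2, 2, colocate=False))  # 4  (the quick-start single-node example)
```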