torchspec-project · yubofredwang · Apr 2, 2026 · Mar 30, 2026 · Apr 2, 2026
diff --git a/configs/vllm_qwen3_8b.yaml b/configs/vllm_qwen3_8b.yaml
@@ -11,7 +11,7 @@
 # Usage:
 #   python -m torchspec.train_entry --config configs/vllm_qwen3_8b.yaml
 #
-# Note: Uses vLLM Worker Extension to hook into model forward pass for hidden states capture.
+# Note: Uses vLLM's extract_hidden_states speculative method with MooncakeHiddenStatesConnector.
 
 model:
   target_model_path: Qwen/Qwen3-8B
@@ -51,17 +51,14 @@ inference:
   vllm:
     tp_size: 2
     mem_fraction_static: 0.7
-    use_worker_extension: true
     extra_args:
-      max_num_batched_tokens: 32768
-      compilation_config:
-        max_cudagraph_capture_size: 8
+      max_num_batched_tokens: 8192
 
 mooncake:
   master_server_address: null
   metadata_server: null
   protocol: tcp
-  global_segment_size: 16GB
+  global_segment_size: 32GB
   local_buffer_size: 4GB
 
 output_dir: ./outputs/qwen3-8b-single-node

diff --git a/examples/data/sample_conversations.jsonl b/examples/data/sample_conversations.jsonl
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,7 +42,7 @@ dev = [
 ]
 
 vllm = [
-    "vllm>=0.16.0",
+    "vllm>=0.18.0",
 ]
 
 fa = [
-Original file line number
+Diff line change
@@ Expand Up / @@ -42,7 +42,7 @@ dev = [ @@
     ]
     vllm = [
-        "vllm>=0.16.0",
+        "vllm>=0.18.0",
     ]
     fa = [
@@ Expand Down @@