Luce-Org · javierpazo · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt
@@ -126,6 +126,8 @@ add_library(dflash27b STATIC
     src/laguna_target_graph.cpp
     src/laguna_daemon.cpp
     src/sampler.cpp
+    # Native MTP / NextN helpers
+    src/f16_convert.cu
 )
 # FlashPrefill custom CUDA kernels need BF16 WMMA (sm_80+). On Turing (sm_75)
 # the drafter uses ggml's flash_attn_ext instead. Guard added after SM check.
@@ -334,6 +336,29 @@ if(DFLASH27B_TESTS)
         endif()
     endif()
 
+    # Native MTP / NextN: contract test + functional smokes. The contract test
+    # uses synthetic tensors and runs in CI; the smokes need a real MTP GGUF.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_mtp_graph_contract.cpp")
+        add_executable(test_mtp_graph_contract test/test_mtp_graph_contract.cpp)
+        target_include_directories(test_mtp_graph_contract PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+        target_link_libraries(test_mtp_graph_contract PRIVATE dflash27b ggml ggml-cuda)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_mtp_graph.cpp")
+        add_executable(smoke_mtp_graph test/smoke_mtp_graph.cpp)
+        target_include_directories(smoke_mtp_graph PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+        target_link_libraries(smoke_mtp_graph PRIVATE dflash27b ggml ggml-cuda)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_mtp_handoff.cpp")
+        add_executable(smoke_target_mtp_handoff test/smoke_target_mtp_handoff.cpp)
+        target_include_directories(smoke_target_mtp_handoff PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+        target_link_libraries(smoke_target_mtp_handoff PRIVATE dflash27b ggml ggml-cuda)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_mtp_integrated_decode.cpp")
+        add_executable(smoke_mtp_integrated_decode test/smoke_mtp_integrated_decode.cpp)
+        target_include_directories(smoke_mtp_integrated_decode PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+        target_link_libraries(smoke_mtp_integrated_decode PRIVATE dflash27b ggml ggml-cuda)
+    endif()
+
     # internal.h includes <cuda_runtime.h> when GGML_USE_CUDA is set; link
     # CUDA::cudart so the toolkit headers are on the compile line (same as
     # test_dflash historically had alone).
@@ -350,6 +375,10 @@ if(DFLASH27B_TESTS)
         smoke_target_forward
         test_generate
         test_dflash
+        test_mtp_graph_contract
+        smoke_mtp_graph
+        smoke_target_mtp_handoff
+        smoke_mtp_integrated_decode
     )
     foreach(_t IN LISTS _dflash_internal_h_cuda_tests)
         if(TARGET ${_t})

diff --git a/dflash/docs/MTP_2026-05-11.md b/dflash/docs/MTP_2026-05-11.md
@@ -0,0 +1,111 @@
+# Native MTP (NextN) — runtime status, 2026-05-11
+
+This document describes the native multi-token prediction (MTP / NextN)
+runtime introduced into `dflash` in PR `feat(dflash): native Qwen3.6 MTP
+integrated decode`. It tracks the contract, what already works, and what
+remains for the next PR before MTP becomes a default-on decode mode.
+
+## What this PR ships
+
+- `dflash/src/f16_convert.cu` — small `f16/bf16 → f32` widen kernels used
+  by both the rollback path and the MTP token-embedding widen.
+- `dflash/src/internal.h` — new types:
+    - `TargetNextN`, `TargetMtpLayer`
+    - `TargetMtpCache` (KV cache for the NextN tail block only)
+    - `QwenMtpGraphInputs`, `QwenMtpGraphOutputs`
+    - `expose_pre_norm_hidden` on `QwenGraphInputs`
+    - `pre_norm_hidden` on `QwenGraphOutputs`
+    - `TargetWeights::mtp_layers`, `nextn_predict_layers`, `gguf_block_count`,
+      `tok_embd_gpu` (no fields removed; the trunk API is preserved).
+- `dflash/src/qwen35_target_graph.cpp` — four new functions:
+    - `create_target_mtp_cache` / `free_target_mtp_cache` / `reset_target_mtp_cache`
+    - `build_qwen35_mtp_graph` — RMSNorm(e) || RMSNorm(h) → `eh_proj` →
+      full-attention transformer block → SwiGLU FFN → shared head.
+  Also wires `expose_pre_norm_hidden` into `build_qwen35_graph`.
+- `dflash/src/gguf_target_loader.cpp` — reads `qwen35.nextn_predict_layers`,
+  splits the GGUF blocks into trunk + MTP tail, loads `blk.<i>.nextn.*`
+  tensors into `TargetWeights::mtp_layers`, and uploads `token_embd.weight`
+  to the GPU when the checkpoint carries MTP (`DFLASH27B_UPLOAD_TOK_EMBD`
+  env var overrides).
+- `dflash/test/test_mtp_graph_contract.cpp` — synthetic-tensor test that
+  asserts the MTP graph wires together correctly. No GPU model needed;
+  cheap to run in CI.
+- `dflash/test/smoke_mtp_graph.cpp` — loads a real MTP GGUF, builds the
+  NextN graph for a single token, and validates the output is finite.
+- `dflash/test/smoke_target_mtp_handoff.cpp` — loads a real MTP GGUF and
+  proves that the trunk pre-norm hidden tensor feeds directly into the
+  MTP block within the same `ggml_cgraph` (no CPU roundtrip required).
+- `dflash/test/smoke_mtp_integrated_decode.cpp` — full integrated decode
+  loop: target greedy + MTP greedy in one graph, with per-step accept /
+  correct counters. This is the functional baseline the upcoming PR's
+  speculative loop will be built on top of.
+
+## GGUF compatibility
+
+The loader follows the tensor naming convention introduced by llama.cpp's
+[MTP PR #22673](https://github.com/ggml-org/llama.cpp/pull/22673). It is
+compatible with the reference Qwen3.6-MTP GGUFs published on the Hub:
+
+- `am17an/Qwen3.6-27B-MTP-GGUF`
+- `am17an/Qwen3.6-35BA3B-MTP-GGUF` (MoE — see "MoE limitation" below)
+- `havenoammo/Qwen3.6-27B-MTP-UD-GGUF`
+- `havenoammo/Qwen3.6-35B-A3B-MTP-GGUF`
+- `froggeric/Qwen3.6-27B-MTP-GGUF`
+
+The expected tail-block tensor names are:
+
+```text
+blk.<n_trunk>.nextn.eh_proj.weight              [2 * hidden, hidden]
+blk.<n_trunk>.nextn.embed_tokens.weight          [hidden, vocab]    (optional)
+blk.<n_trunk>.nextn.enorm.weight                 [hidden]
+blk.<n_trunk>.nextn.hnorm.weight                 [hidden]
+blk.<n_trunk>.nextn.shared_head_head.weight      [hidden, vocab]    (optional)
+blk.<n_trunk>.nextn.shared_head_norm.weight      [hidden]           (optional)
+```
+
+When the optional shared-head tensors are absent the runtime falls back to
+the trunk's `output_norm` / `output` (lm_head), matching how am17an's
+GGUFs are typically packed.
+
+## MoE limitation
+
+`build_qwen35_mtp_graph` currently implements the dense-FFN path only. The
+35B-A3B MTP GGUFs require the MoE `TargetLayer` fields and the routed
+FFN path that howard0su is upstreaming in
+[PR #120 "Qwen3.5 MoE support"](https://github.com/Luce-Org/lucebox-hub/pull/120).
+A MoE-aware `build_qwen35_mtp_graph` is a one-line dispatch on top of
+this PR once #120 lands. Until then, loading a MoE-MTP GGUF + invoking
+the MTP graph returns a clear error rather than producing wrong output.
+
+## Why MTP is opt-in, not default-on
+
+Measured today against `DFlash + PFlash` on the same MTP GGUF with MTP
+disabled, on a single RTX 6000 Ada (sm_89), Qwen3.6-27B Q4_K_M target,
+`q4_0/q4_0` KV, FA_WINDOW=0, DDTree budget=16, draft feature mirror on:
+
+| n_gen | Same GGUF, MTP off (tok/s) | Same GGUF, MTP chain-2 (tok/s) | Δ |
+|---:|---:|---:|---:|
+| 64  | 57.58 | 54.72 | **−5.0%** |
+| 128 | 67.58 | 64.23 | **−5.0%** |
+| 256 | 60.40 | 82.18 | **+36.1%** |
+
+What changes between 64 and 256 tokens is that DDTree rounds drop from
+roughly 60 → 38 and average tokens committed per draft step rise from
+4.27 → 6.74, so the extra MTP forward starts paying for itself.
+
+This is real but workload-dependent acceleration, not a universal default.
+The next PR adds the speculative loop that turns this into a default-on
+mode for long generations; today's PR ships only the runtime contract and
+the tests that pin it.
+
+## Known follow-ups (next PR)
+
+1. Speculative decode loop wiring (`run_mtp_integrated_prompt`,
+   target-batched verify, fast rollback) inside `test_dflash`.
+2. Daemon-side `--mtp-integrated` CLI + metrics surface (`[mtp-daemon]`
+   line, `last_mtp` aggregated in `prefix_cache.py`).
+3. `mtp_baseline_gate.py` published as a reusable parity gate harness.
+4. CPU hidden-readback elimination — the current functional smoke still
+   round-trips token ids through CPU between MTP steps. Removing that is
+   the highest-value perf fix and is queued behind CUDA-graph capture.
+5. MoE MTP path after PR #120 merges.
diff --git a/dflash/docs/MTP_LINEAR_BENCH_2026-05-11.md b/dflash/docs/MTP_LINEAR_BENCH_2026-05-11.md
@@ -0,0 +1,79 @@
+# Linear MTP integrated decode — measured numbers, 2026-05-11
+
+Validation run for `feat(dflash): linear native MTP integrated decode CLI`
+(PR #154, stacked on PR #153 native MTP runtime).
+
+## Setup
+
+- **GPU**: NVIDIA RTX 6000 Ada Generation, sm_89, 48 GiB
+- **Build**: Release, `DFLASH27B_USER_CUDA_ARCHITECTURES=89`,
+  ninja + MSVC 19.44 + CUDA 12.8
+- **Target model**: `Qwen3.6-27B-MTP-Q4_K_M.gguf`
+  (`am17an/Qwen3.6-27B-MTP-GGUF` lineage)
+- **Mode**: `--mtp-integrated --mtp-baseline-check --mtp-draft-n-max=2 --dflash-mtp-policy=always`
+- **Context**: `--max-ctx=512`, `FA_WINDOW=2048` (default), KV `q8_0/q8_0` (default)
+- **Prompt**: synthetic 64-token sequence of distinct ids
+- **Method**: AR baseline (`run_target_ar_prompt`) vs integrated MTP loop
+  (`run_mtp_integrated_prompt` with `mtp_draft_n_max=2`); same GGUF, same
+  config; outputs compared token-by-token (`compare_ok` / `compare_fail`).
+
+## Results
+
+| n_gen | Baseline AR (tok/s) | MTP chain-2 (tok/s) | speed_ratio | MTP acceptance |
+|---:|---:|---:|---:|---:|
+| 64  | 9.48  | 12.04 | **1.270x** | 81.2% |
+| 128 | 14.48 | 15.33 | **1.059x** | 81.2% |
+| 256 | 15.11 | 17.38 | **1.151x** | 82.4% |
+
+All three runs report `[mtp-baseline] compare_ok tokens=N mismatches=0`,
+i.e. the integrated MTP loop produces the byte-identical greedy output to
+the AR baseline on this prompt.
+
+## Honest caveats
+
+- **This is vs the AR baseline on the same MTP GGUF**, not against
+  DFlash-classic + PFlash on a plain Qwen3.6-27B Q4_K_M. The AR baseline
+  is slower than DFlash-classic; a PR-publishable speedup claim against
+  DFlash-classic needs the `target_verify` graph-bucket cache work
+  ("Waves A-D" / P1 in `MTP_ACCELERATION_ROADMAP_FOR_NEXT_AI_2026-05-11.md`).
+- **Single-prompt result.** Workload sensitivity matters for MTP
+  acceptance; published prompt families (P3 of the roadmap, multi-prompt
+  gate) are pending.
+- **Linear path only.** No DDTree hybrid, no immediate-bonus, no batched
+  target verify, no bucketed graphs. This is the parity-correct floor
+  the next PR builds on.
+- **`--dflash-mtp-policy=auto`** is wired (P2 of the roadmap) but defaults
+  to `always` for now since the smoke shows MTP wins at every measured
+  `n_gen` against this baseline. The auto threshold (`min_n=192` default)
+  remains conservative for prompt families that haven't been measured.
+
+## Reproduce
+
+```bash
+# Build
+cmake -S dflash -B dflash/build -G Ninja -DCMAKE_BUILD_TYPE=Release \
+      -DDFLASH27B_USER_CUDA_ARCHITECTURES=89
+cmake --build dflash/build --target test_dflash
+
+# Synthetic prompt
+python -c "import struct; \
+ids = list(range(1000, 1064)); \
+open('prompt.bin','wb').write(b''.join(struct.pack('<i', i) for i in ids))"
+
+# Gate
+./dflash/build/test_dflash \
+  /path/to/Qwen3.6-27B-MTP-Q4_K_M.gguf \
+  --mtp-integrated --mtp-baseline-check --mtp-draft-n-max=2 \
+  --dflash-mtp-policy=always \
+  --max-ctx=512 --n-gen=256 --prompt-file=prompt.bin --target-gpu=1
+```
+
+Or via the Python harness:
+
+```bash
+python dflash/scripts/mtp_baseline_gate.py \
+  --target /path/to/Qwen3.6-27B-MTP-Q4_K_M.gguf \
+  --prompt-file prompt.bin --n-gen 256 \
+  --mtp-draft-n-max 2 --max-ctx 512 --gpu 1 \
+  --min-speed-ratio 1.05 --out artifacts/mtp_n256.json
+```