diff --git a/docs/specs/openapi-props.yaml b/docs/specs/openapi-props.yaml
index c2976265f..63daacba1 100644
--- a/docs/specs/openapi-props.yaml
+++ b/docs/specs/openapi-props.yaml
@@ -2,7 +2,7 @@ openapi: 3.1.0
 
 info:
   title: dflash_server /props endpoint
-  version: "2"
+  version: "4"
   summary: Capability and configuration introspection for dflash_server.
   description: |
     `GET /props` returns enough JSON for a dashboard, a deployment
@@ -13,7 +13,28 @@ info:
     The integer reported as `server.props_schema` (and as the
     trailing token of `build_info`) bumps when the response shape
     changes in a backward-incompatible way. The current schema is
-    `2`.
+    `4`.
+
+    Schema `4` (additive over `3`): new top-level `host` block —
+    host-identity facts (OS, kernel, WSL version, docker version,
+    NVIDIA driver, NVIDIA Container Toolkit version, CPU model,
+    nproc, RAM, per-GPU array with UUID/PCI/SM/VRAM/power,
+    CUDA_VISIBLE_DEVICES) captured by
+    `server/scripts/entrypoint.sh` from the `LUCEBOX_HOST_*` env
+    the host wrapper exports. `null` when
+    `/opt/lucebox-hub/HOST_INFO` is missing (bare-metal dev). See
+    `props-endpoint.md` §4.17.
+
+    Schema `3` (additive over `2`): new top-level `build` block —
+    a structured replacement for the single-string `build_info`
+    that carries `git_sha`, `image_tag`, and `build_time` baked
+    into the container at build time. New `model.target` and
+    `model.draft` sub-objects carry full GGUF identity (absolute
+    path, `size_bytes`, `sha256`, and a `gguf` header field set —
+    architecture, quant `file_type`, `block_count`,
+    `embedding_length`, `context_length`, `vocab_size`). The
+    pre-3 fields (`build_info`, `model_path`, `model.draft_path`,
+    `model_alias`) stay verbatim for back-compat.
 
     Schema `2` (breaking change vs. `1`): `model_card` is now the
     wholesale on-disk sidecar JSON (or `null` when family / hard
@@ -29,8 +50,10 @@ info:
 
 # The numeric value matches `server.props_schema` and the
 # `props_schema=<n>` token in `build_info`. Bumps on breaking
-# response-shape changes; additive changes keep the same value.
-x-props-schema: 2
+# response-shape changes; additive changes that introduce new
+# fields also bump (e.g. schema 2 → 3 and 3 → 4) so consumers
+# can negotiate the new fields.
+x-props-schema: 4
 
 servers:
   - url: http://localhost:8080
@@ -82,7 +105,37 @@ paths:
                         high: 32256
                         x-high: 56832
                         max: 81408
-                    build_info: "luce-dflash v0.0.0+cpp props_schema=2"
+                    build:
+                      server_name: "luce-dflash"
+                      server_version: "0.0.0+cpp"
+                      props_schema: 4
+                      git_sha: "6d12378abc45678901234567890123456789abcd"
+                      image_tag: "sha-6d12378-cuda12"
+                      image_digest: null
+                      build_time: "2026-05-28T13:43:57Z"
+                    build_info: "luce-dflash v0.0.0+cpp props_schema=4"
+                    host:
+                      os_pretty: "Ubuntu 22.04.3 LTS"
+                      kernel: "6.6.87.2-microsoft-standard-WSL2"
+                      wsl_version: "wsl2"
+                      docker_version: "29.1.3"
+                      nvidia_driver: "596.36"
+                      nvidia_ctk_version: "1.16.2"
+                      cpu_model: "Intel(R) Core(TM) Ultra 9 275HX"
+                      nproc: 24
+                      ram_gb: 64
+                      gpus:
+                        - index: 0
+                          uuid: "GPU-abc"
+                          pci_bus_id: "00000000:01:00.0"
+                          name: "NVIDIA GeForce RTX 5090 Laptop GPU"
+                          sm: "12.0"
+                          vram_gb: 24
+                          power_limit_w: 175
+                      cuda_visible_devices: "0"
+                      source: "lucebox.sh"
+                      collector: "lucebox.sh"
+                      collected_at: "2026-05-28T20:31:42Z"
                     capabilities:
                       reasoning_supported: true
                       speculative_supported: true
@@ -104,8 +157,37 @@ paths:
                       lifetime_hits: 0
                     model:
                       arch: "qwen35"
-                      draft_path: "/.../dflash-draft-3.6-q4_k_m.gguf"
+                      alias: "dflash"
+                      draft_path: "/.../dflash-draft-3.6-q8_0.gguf"
                       tokenizer_id: null
+                      target:
+                        path: "/.../Qwen3.6-27B-Q4_K_M.gguf"
+                        size_bytes: 17134510080
+                        sha256: "abc123def456...0a1b2c3d4e5f"
+                        gguf:
+                          general.architecture: "qwen35"
+                          general.name: "Qwen3.6-27B"
+                          general.file_type: 15
+                          general.file_type_name: "Q4_K_M"
+                          general.quantization_version: 2
+                          block_count: 64
+                          embedding_length: 5120
+                          context_length: 65536
+                          vocab_size: 152064
+                      draft:
+                        path: "/.../dflash-draft-3.6-q8_0.gguf"
+                        size_bytes: 425000000
+                        sha256: "deadbeef...0a1b2c3d4e5f"
+                        gguf:
+                          general.architecture: "qwen3"
+                          general.name: "Qwen3-0.6B-DFlash-draft"
+                          general.file_type: 7
+                          general.file_type_name: "Q8_0"
+                          general.quantization_version: 2
+                          block_count: 28
+                          embedding_length: 1024
+                          context_length: 32768
+                          vocab_size: 152064
                     model_alias: "dflash"
                     model_card:
                       name: "Qwen3.6 27B"
@@ -169,7 +251,7 @@ paths:
                         supports_top_p: true
                     server:
                       name: "luce-dflash"
-                      props_schema: 2
+                      props_schema: 4
                       version: "0.0.0+cpp"
                     speculative:
                       enabled: true
@@ -190,11 +272,13 @@ components:
       required:
         - api
         - budget_envelope
+        - build
         - build_info
         - capabilities
         - daemon
         - default_generation_settings
         - full_cache
+        - host
         - model
         - model_alias
         - model_card
@@ -214,13 +298,21 @@ components:
           $ref: "#/components/schemas/Api"
         budget_envelope:
           $ref: "#/components/schemas/BudgetEnvelope"
+        build:
+          $ref: "#/components/schemas/Build"
         build_info:
           type: string
           description: |
             Single-string identity: `<server_name> v<version>
             props_schema=<n>`. Matches the structured `server`
             object. Bumps `props_schema` on breaking changes.
-          example: "luce-dflash v0.0.0+cpp props_schema=2"
+
+            Deprecated in favor of the structured `build` block
+            (schema 3+), which also carries `git_sha`,
+            `image_tag`, and `build_time`. Retained for
+            back-compat — consumers that grep `build_info` keep
+            working.
+          example: "luce-dflash v0.0.0+cpp props_schema=4"
         capabilities:
           $ref: "#/components/schemas/Capabilities"
         daemon:
@@ -229,6 +321,17 @@ components:
           $ref: "#/components/schemas/DefaultGenerationSettings"
         full_cache:
           $ref: "#/components/schemas/FullCache"
+        host:
+          oneOf:
+            - $ref: "#/components/schemas/Host"
+            - type: "null"
+          description: |
+            Host-identity facts captured at container startup by
+            `server/scripts/entrypoint.sh` from the
+            `LUCEBOX_HOST_*` env the host wrapper exports. `null`
+            when `/opt/lucebox-hub/HOST_INFO` is missing (bare-
+            metal dev). Added in schema 4 (additive over 3); pre-4
+            consumers ignore the key.
         model:
           $ref: "#/components/schemas/Model"
         model_alias:
@@ -432,12 +535,148 @@ components:
           description: Cumulative hit count since server start.
           example: 0
 
+    Host:
+      description: |
+        Host-identity facts captured at container startup by
+        `server/scripts/entrypoint.sh` from the `LUCEBOX_HOST_*`
+        env the host wrapper (`lucebox.sh::probe_host`) exports.
+        Written to `/opt/lucebox-hub/HOST_INFO` (path override:
+        `$DFLASH_HOST_INFO_PATH` for tests) and read verbatim
+        into `ServerConfig.host_info`. See
+        `docs/specs/props-endpoint.md` §4.17 for the prose spec.
+
+        Surfaced so that every benchmark snapshot can self-classify
+        the rig it ran on. `luce-bench snapshot` writes this
+        verbatim into `host.json` and into each per-area
+        `<area>.json` so individual area files self-describe.
+      type: object
+      required:
+        - collected_at
+        - collector
+        - source
+      additionalProperties: true
+      properties:
+        os_pretty:
+          type: ["string", "null"]
+          description: |
+            `PRETTY_NAME` from `/etc/os-release`.
+          example: "Ubuntu 22.04.3 LTS"
+        kernel:
+          type: ["string", "null"]
+          description: |
+            `uname -r` on the host.
+          example: "6.6.87.2-microsoft-standard-WSL2"
+        wsl_version:
+          type: ["string", "null"]
+          enum: ["wsl1", "wsl2", null]
+          description: |
+            `"wsl2"` matches the modern `microsoft-standard-WSL2`
+            kernel string; `"wsl1"` is the legacy translation
+            layer; `null` is bare Linux / macOS.
+        docker_version:
+          type: ["string", "null"]
+          description: |
+            Docker server version from
+            `docker version --format '{{.Server.Version}}'`.
+          example: "29.1.3"
+        nvidia_driver:
+          type: ["string", "null"]
+          description: Driver version from `nvidia-smi`.
+          example: "596.36"
+        nvidia_ctk_version:
+          type: ["string", "null"]
+          description: |
+            NVIDIA Container Toolkit version
+            (`nvidia-ctk --version`). Distinct from
+            `docker_version` — the runtime that wires GPUs into
+            containers can lag behind the daemon.
+          example: "1.16.2"
+        cpu_model:
+          type: ["string", "null"]
+          description: First `"model name"` from `/proc/cpuinfo`.
+          example: "Intel(R) Core(TM) Ultra 9 275HX"
+        nproc:
+          type: ["integer", "null"]
+          minimum: 0
+          description: Logical CPU count.
+          example: 24
+        ram_gb:
+          type: ["integer", "null"]
+          minimum: 0
+          description: Total RAM in GB.
+          example: 64
+        gpus:
+          type: array
+          description: |
+            One entry per installed GPU; the array preserves
+            nvidia-smi's enumeration order. Possibly empty.
+          items:
+            type: object
+            additionalProperties: true
+            properties:
+              index:
+                type: integer
+                minimum: 0
+                example: 0
+              uuid:
+                type: string
+                example: "GPU-abc"
+              pci_bus_id:
+                type: string
+                example: "00000000:01:00.0"
+              name:
+                type: string
+                example: "NVIDIA GeForce RTX 5090 Laptop GPU"
+              sm:
+                type: string
+                description: Compute capability (e.g. "12.0").
+                example: "12.0"
+              vram_gb:
+                type: integer
+                minimum: 0
+                example: 24
+              power_limit_w:
+                type: integer
+                minimum: 0
+                description: |
+                  May differ from manufacturer spec when the
+                  operator has set a power cap.
+                example: 175
+        cuda_visible_devices:
+          type: ["string", "null"]
+          description: |
+            Mirrors the env var; `null` means "all GPUs visible".
+          example: "0"
+        source:
+          type: string
+          enum: ["lucebox.sh", "unknown"]
+          description: |
+            How the block was populated. `"lucebox.sh"` when the
+            host wrapper drove the run; `"unknown"` for the
+            entrypoint stub-only path.
+          example: "lucebox.sh"
+        collector:
+          type: string
+          description: |
+            The script that wrote HOST_INFO: usually
+            `"lucebox.sh"` when the host wrapper drove the run, or
+            `"entrypoint.sh"` on the stub-only path.
+          example: "lucebox.sh"
+        collected_at:
+          type: string
+          format: date-time
+          description: ISO 8601 UTC timestamp.
+          example: "2026-05-28T20:31:42Z"
+
     Model:
       description: Loaded model metadata.
       type: object
       required:
         - arch
+        - alias
+        - draft
         - draft_path
+        - target
         - tokenizer_id
       properties:
         arch:
@@ -446,18 +685,228 @@ components:
             Normalized `general.architecture` value from the loaded
             GGUF (e.g. `qwen35`, `qwen36`, `gemma4`, `laguna`).
           example: "qwen35"
+        alias:
+          type: string
+          description: |
+            Mirror of the top-level `model_alias` (schema 3+),
+            grouped under `model` alongside arch / target / draft.
+            The top-level `model_alias` stays for back-compat.
+          example: "dflash"
         draft_path:
           type: ["string", "null"]
           description: |
             Filesystem path of the loaded speculative-decode draft
-            GGUF; `null` when no draft is loaded.
-          example: "/.../dflash-draft-3.6-q4_k_m.gguf"
+            GGUF; `null` when no draft is loaded. Deprecated in
+            favor of `model.draft.path` (same value); retained for
+            back-compat with pre-schema-3 readers.
+          example: "/.../dflash-draft-3.6-q8_0.gguf"
         tokenizer_id:
           type: ["string", "null"]
           description: |
             Best-effort tokenizer family hint from GGUF metadata
             (e.g. `qwen3`). `null` when unknown.
           example: null
+        target:
+          oneOf:
+            - $ref: "#/components/schemas/GgufFile"
+            - type: "null"
+          description: |
+            GGUF identity for the loaded target weights (schema 3+).
+            `null` only when the file couldn't be inspected at
+            startup — typically a load failure that should have
+            aborted boot, so seeing `null` here is a strong signal
+            something is wrong.
+        draft:
+          oneOf:
+            - $ref: "#/components/schemas/GgufFile"
+            - type: "null"
+          description: |
+            GGUF identity for the loaded draft weights (schema 3+).
+            `null` when `--draft` was not passed — the normal
+            target-only configuration for `laguna` and the
+            `qwen3.6-moe` preset. Explicit null (not omitted) so
+            consumers can distinguish "no draft" from "missing
+            from this schema version."
+
+    GgufFile:
+      description: |
+        Identity payload for one loaded GGUF file (schema 3+).
+        Surfaced under `model.target` and `model.draft`. The triple
+        `path` + `size_bytes` + `sha256` is what "exactly what
+        weights is this server running" forensics need; the `gguf`
+        sub-object adds the header fields the loader parses anyway.
+
+        Header fields may be `null` when the GGUF doesn't carry the
+        corresponding key — drafter GGUFs in particular omit
+        `context_length` and `vocab_size` more often than full
+        target models do.
+      type: object
+      required:
+        - path
+        - size_bytes
+        - sha256
+        - gguf
+      properties:
+        path:
+          type: string
+          description: Absolute filesystem path of the loaded GGUF.
+          example: "/.../Qwen3.6-27B-Q4_K_M.gguf"
+        size_bytes:
+          type: ["integer", "null"]
+          format: int64
+          minimum: 0
+          description: File size in bytes; `null` if the stat() failed.
+          example: 17134510080
+        sha256:
+          type: ["string", "null"]
+          description: |
+            Lowercase hex sha256 of the GGUF file. Computed once at
+            server startup and cached to a `<path>.sha256` sidecar
+            so subsequent restarts skip the rehash. `null` when
+            hashing was disabled (`$DFLASH_SKIP_SHA256=1`) or the
+            sidecar was unreadable and the file couldn't be opened
+            for reading.
+          example: "abc123def456789...0a1b2c3d4e5f"
+        gguf:
+          type: object
+          description: |
+            Selected `general.*` and `<arch>.*` header fields read
+            from the GGUF. Each field is `null` when the file
+            doesn't carry the corresponding key.
+          additionalProperties: true
+          properties:
+            "general.architecture":
+              type: ["string", "null"]
+              description: Raw `general.architecture` value (e.g. `qwen35`).
+              example: "qwen35"
+            "general.name":
+              type: ["string", "null"]
+              description: Display name from the GGUF (`general.name`).
+              example: "Qwen3.6-27B"
+            "general.file_type":
+              type: ["integer", "null"]
+              minimum: 0
+              description: |
+                Raw `LLAMA_FTYPE_*` integer from the GGUF
+                (`general.file_type`). 15 = Q4_K_M, 17 = Q5_K_M,
+                30 = IQ4_XS, 32 = BF16, etc. See
+                `server/deps/llama.cpp/include/llama.h` for the
+                full table.
+              example: 15
+            "general.file_type_name":
+              type: ["string", "null"]
+              description: |
+                Decoded operator-friendly tag for `general.file_type`
+                (`Q4_K_M`, `IQ4_XS`, `BF16`, …). Empty string maps
+                to `null` when the int is outside the known table.
+              example: "Q4_K_M"
+            "general.quantization_version":
+              type: ["integer", "null"]
+              minimum: 0
+              description: |
+                Raw `general.quantization_version` from the GGUF.
+                Bumped on quant-format changes; 2 is the current
+                value for K-quants and IQ-quants in 2025-2026.
+              example: 2
+            block_count:
+              type: ["integer", "null"]
+              minimum: 0
+              description: "`<arch>.block_count` — number of transformer blocks."
+              example: 64
+            embedding_length:
+              type: ["integer", "null"]
+              minimum: 0
+              description: "`<arch>.embedding_length` — model hidden size."
+              example: 5120
+            context_length:
+              type: ["integer", "null"]
+              minimum: 0
+              description: |
+                `<arch>.context_length` — the maximum context the
+                weights themselves were trained for. May exceed the
+                server's runtime `n_ctx` cap.
+              example: 65536
+            vocab_size:
+              type: ["integer", "null"]
+              minimum: 0
+              description: |
+                `<arch>.vocab_size` or the length of
+                `tokenizer.ggml.tokens` (fallback). Useful for
+                catching target/draft tokenizer mismatches at a
+                glance.
+              example: 152064
+
+    Build:
+      description: |
+        Structured server + container identity (schema 3+). The
+        first three fields mirror the `server` block and the
+        single-string `build_info`; `git_sha`, `image_tag`, and
+        `build_time` carry the Docker image identity baked in via
+        `docker-bake.hcl` (`GIT_SHA`, `IMAGE_TAG`, `BUILD_TIME`).
+
+        On bare-metal / non-Docker builds (no
+        `/opt/lucebox-hub/IMAGE_INFO` file), `git_sha`,
+        `image_tag`, and `build_time` are all `null` — the keys
+        are still present for shape stability.
+      type: object
+      required:
+        - server_name
+        - server_version
+        - props_schema
+        - git_sha
+        - image_tag
+        - image_digest
+        - build_time
+      properties:
+        server_name:
+          type: string
+          description: Server identity string (= `server.name`).
+          example: "luce-dflash"
+        server_version:
+          type: string
+          description: Build version string (= `server.version`).
+          example: "0.0.0+cpp"
+        props_schema:
+          type: integer
+          minimum: 1
+          description: |
+            Integer schema version (= `server.props_schema`).
+            Repeated here so a single curl on `/props` `.build`
+            returns the schema version alongside the rest of the
+            identity.
+          example: 4
+        git_sha:
+          type: ["string", "null"]
+          description: |
+            Full git commit sha of the source tree the image was
+            built from. Set by CI from `${{ github.sha }}` via
+            docker-bake.hcl. `null` outside Docker.
+          example: "6d12378abc45678901234567890123456789abcd"
+        image_tag:
+          type: ["string", "null"]
+          description: |
+            Headline tag the image was published under (e.g.
+            `cuda12`, `sha-6d12378-cuda12`, `0.3.0-cuda12`). Set
+            by CI from `docker/metadata-action` `version` output.
+            `null` outside Docker.
+          example: "sha-6d12378-cuda12"
+        image_digest:
+          type: ["string", "null"]
+          description: |
+            Reserved for future use — the registry-side
+            content-addressable digest. Not currently populated by
+            the build pipeline (the running container doesn't
+            query its own image via the Docker socket). Always
+            `null` today.
+          example: null
+        build_time:
+          type: ["string", "null"]
+          format: date-time
+          description: |
+            ISO 8601 UTC timestamp the image was built at. Set by
+            CI via `date -u`, and by `scripts/build_image.sh` for
+            local builds. `null` outside Docker.
+          example: "2026-05-28T13:43:57Z"
 
     ModelCard:
       description: |
@@ -866,9 +1315,13 @@ components:
           minimum: 1
           description: |
             Integer schema version. Bumps when the response shape
-            changes in a backward-incompatible way (see §5 of
-            props-endpoint.md). Current value is `2`.
-          example: 2
+            changes (see §5 of props-endpoint.md). Current value
+            is `4`. Schema 4 is additive over 3 (new top-level
+            `host` block); schema 3 was additive over 2 (new
+            `build` block, new `model.target`/`model.draft`). The
+            bumps still happen so consumers can negotiate the new
+            fields.
+          example: 4
         version:
           type: string
           description: Build version string (semver + build tag).
diff --git a/docs/specs/props-endpoint.md b/docs/specs/props-endpoint.md
index e4238df3d..c367ba6aa 100644
--- a/docs/specs/props-endpoint.md
+++ b/docs/specs/props-endpoint.md
@@ -56,12 +56,14 @@ request will not delay a `/props` response.
 {
   "api":                          { "endpoints": [ … ] },
   "budget_envelope":              { … },
+  "build":                        { … },
   "build_info":                   "luce-dflash v<ver> props_schema=<n>",
   "capabilities":                 { … },
   "daemon":                       { "alive": true },
   "default_generation_settings":  { … },
   "full_cache":                   { … },
-  "model":                        { … },
+  "host":                         { … } | null,
+  "model":                        { "arch": "<string>", "alias": "<string>", "draft_path": "<string|null>", "tokenizer_id": "<string|null>", "target": { … }, "draft": { … } | null },
   "model_alias":                  "<string>",
   "model_card":                   { … } | null,
   "model_path":                   "<string>",
@@ -155,15 +157,55 @@ absolute-tier ceiling clamping (spec §3.5).
 actually do with a request; `model_card` (§4.10) is the source of
 truth for what the authored card says.
 
-### 4.3 `build_info`
+### 4.3 `build_info` (legacy) and `build` (schema 3+)
 
 ```
-"build_info": "luce-dflash v0.0.0+cpp props_schema=2"
+"build_info": "luce-dflash v0.0.0+cpp props_schema=4"
+"build": {
+  "server_name":    "luce-dflash",
+  "server_version": "0.0.0+cpp",
+  "props_schema":   4,
+  "git_sha":        "6d12378…",
+  "image_tag":      "sha-6d12378-cuda12",
+  "image_digest":   null,
+  "build_time":     "2026-05-28T13:43:57Z"
+}
 ```
 
-A single string carrying: server name, build version, and the
-**`props_schema` version**. Schema version bumps when the response
-shape changes in a non-backward-compatible way (see §5).
+`build_info` is the legacy single-string identity (server name,
+build version, `props_schema`). Schema version bumps when the
+response shape changes (see §5). Retained verbatim for back-compat
+— consumers that grep `build_info` keep working without changes.
+
+`build` (schema 3+) is the structured replacement and the
+recommended source of truth for "what binary is running":
+
+- `server_name` / `server_version` / `props_schema` mirror the
+  identity fields. Repeated here so a single `curl … | jq .build`
+  returns everything an operator needs.
+- `git_sha` — full git commit sha of the source tree the image
+  was built from. Set by CI from `${{ github.sha }}` via
+  `docker-bake.hcl`; set locally by `scripts/build_image.sh` from
+  `git rev-parse HEAD`. `null` on bare-metal builds (no
+  `/opt/lucebox-hub/IMAGE_INFO` file).
+- `image_tag` — headline tag the image was published under
+  (e.g. `cuda12`, `sha-6d12378-cuda12`, `0.3.0-cuda12`). Set by
+  CI from `docker/metadata-action`'s `version` output. `null`
+  outside Docker.
+- `image_digest` — reserved for future use. The
+  content-addressable registry digest would let an operator pin
+  `ghcr.io/.../lucebox-hub@sha256:…` after a pull; we don't query
+  the Docker socket from inside the container today, so this is
+  always `null`. Kept in the schema so adding it later is
+  additive.
+- `build_time` — ISO 8601 UTC timestamp the image was built at.
+  `null` outside Docker.
+
+`build.git_sha`, `build.image_tag`, and `build.build_time` come from
+`/opt/lucebox-hub/IMAGE_INFO`, which `Dockerfile` writes from the
+`GIT_SHA`, `IMAGE_TAG`, and `BUILD_TIME` build args. The path can
+be overridden with `$DFLASH_IMAGE_INFO_PATH` (used by unit tests
+to inject fixtures).
 
 ### 4.4 `capabilities`
 
@@ -255,15 +297,91 @@ for an introspection report; not safe for control-flow decisions.
 ```json
 "model": {
   "arch":         "qwen35",
+  "alias":        "dflash",
   "draft_path":   "/path/to/draft.gguf" | null,
-  "tokenizer_id": "qwen3" | null
+  "tokenizer_id": "qwen3" | null,
+  "target": {
+    "path":       "/path/to/Qwen3.6-27B-Q4_K_M.gguf",
+    "size_bytes": 17134510080,
+    "sha256":     "abc123…",
+    "gguf": {
+      "general.architecture":         "qwen35",
+      "general.name":                 "Qwen3.6-27B",
+      "general.file_type":            15,
+      "general.file_type_name":       "Q4_K_M",
+      "general.quantization_version": 2,
+      "block_count":                  64,
+      "embedding_length":             5120,
+      "context_length":               65536,
+      "vocab_size":                   152064
+    }
+  },
+  "draft": { … } | null
 }
 ```
 
 `arch` is the `general.architecture` value from the loaded GGUF,
-normalized. `draft_path` is the speculative-decode draft model
-path, or `null` when no draft is loaded. `tokenizer_id` is a
-best-effort tokenizer family hint from GGUF metadata.
+normalized. `tokenizer_id` is a best-effort tokenizer family hint
+from GGUF metadata.
+
+`alias` (schema 3+) mirrors the top-level `model_alias` for
+grouping under `model` alongside the rest of the model identity.
+The top-level `model_alias` stays for back-compat.
+
+`draft_path` (schema 1+, legacy) is the speculative-decode draft
+GGUF path, or `null` when no draft is loaded. New consumers should
+prefer `model.draft.path` — same value, but grouped with the rest
+of the draft identity.
+
+`target` (schema 3+) is the full identity of the loaded target
+weights. Always present and non-null when the server is up — a
+`null` `target` indicates a load failure that should have aborted
+boot, so it's a strong signal something is wrong.
+
+`draft` (schema 3+) is the same identity payload for the draft
+GGUF, or **explicit JSON null** when `--draft` was not passed.
+The normal target-only configurations are `laguna` and the
+`qwen3.6-moe` preset; explicit-null (not omitted) lets consumers
+distinguish "no draft loaded" from "field not in this schema
+version."
+
+#### `model.target` / `model.draft` field shape
+
+| field        | type                  | meaning |
+|---|---|---|
+| `path`       | `string`              | Absolute filesystem path of the loaded GGUF. |
+| `size_bytes` | `integer \| null`     | File size from `stat()`. `null` if the stat failed. |
+| `sha256`     | `string \| null`      | Lowercase hex sha256 (64 chars). Cached to a `<path>.sha256` sidecar so subsequent restarts skip the rehash. `null` when `$DFLASH_SKIP_SHA256=1` or the file couldn't be opened for reading. |
+| `gguf`       | `object`              | Selected `general.*` and `<arch>.*` header fields. Each field is `null` when the GGUF doesn't carry the corresponding key — drafter GGUFs in particular omit `context_length` and `vocab_size` more often than full target models do. |
+
+The `gguf` sub-object's keys map 1:1 to GGUF metadata keys:
+
+- `general.architecture` — raw architecture string (e.g. `qwen35`,
+  `qwen3`, `gemma4`, `laguna`).
+- `general.name` — display name from the GGUF.
+- `general.file_type` — raw `LLAMA_FTYPE_*` integer (see
+  `server/deps/llama.cpp/include/llama.h` for the full table).
+  15 = Q4_K_M, 17 = Q5_K_M, 30 = IQ4_XS, 32 = BF16, etc.
+- `general.file_type_name` — operator-friendly decoded tag for
+  `general.file_type` (e.g. `Q4_K_M`, `IQ4_XS`, `BF16`).
+- `general.quantization_version` — bumped on quant-format changes
+  (2 is the current value for K-quants and IQ-quants).
+- `block_count` — `<arch>.block_count` (number of transformer
+  blocks).
+- `embedding_length` — `<arch>.embedding_length` (model hidden
+  size).
+- `context_length` — `<arch>.context_length` (max context the
+  weights themselves were trained for; may exceed the server's
+  runtime `n_ctx` cap).
+- `vocab_size` — `<arch>.vocab_size` or the length of
+  `tokenizer.ggml.tokens` (fallback when the key isn't written).
+
+The sha256 is computed once at startup. For a multi-GB target
+GGUF this is ~30s on a fast NVMe; the result is written to a
+sidecar file `<path>.sha256` so subsequent restarts read it from
+disk instead of rehashing. Set `$DFLASH_SKIP_SHA256=1` to disable
+hashing entirely (faster cold start, but `sha256` will be `null`
+at /props).
 
 ### 4.9 `model_alias` and `model_path`
 
@@ -478,14 +596,126 @@ configuration drift between runs is possible.
 - `draft_device` — resolved draft-model device placement, or
   `null` when no draft model is loaded.
 
+### 4.17 `host` (schema 4+)
+
+```json
+"host": {
+  "os_pretty":         "Ubuntu 22.04.3 LTS",
+  "kernel":            "6.6.87.2-microsoft-standard-WSL2",
+  "wsl_version":       "wsl2",
+  "docker_version":    "29.1.3",
+  "nvidia_driver":     "596.36",
+  "nvidia_ctk_version":"1.16.2",
+  "cpu_model":         "Intel(R) Core(TM) Ultra 9 275HX",
+  "nproc":             24,
+  "ram_gb":            64,
+  "gpus": [
+    {
+      "index":         0,
+      "uuid":          "GPU-abc…",
+      "pci_bus_id":    "00000000:01:00.0",
+      "name":          "NVIDIA GeForce RTX 5090 Laptop GPU",
+      "sm":            "12.0",
+      "vram_gb":       24,
+      "power_limit_w": 175
+    }
+  ],
+  "cuda_visible_devices": "0",
+  "source":            "lucebox.sh",
+  "collector":         "lucebox.sh",
+  "collected_at":      "2026-05-28T20:31:42Z"
+}
+```
+
+Host-identity facts captured at container startup by
+`server/scripts/entrypoint.sh` from the `LUCEBOX_HOST_*` env vars the
+host wrapper (`lucebox.sh::probe_host`) exports. Written to
+`/opt/lucebox-hub/HOST_INFO` (path override: `$DFLASH_HOST_INFO_PATH`
+for tests) and read verbatim into `ServerConfig.host_info` by
+`server_main::read_host_info`.
+
+This block lets every benchmark snapshot self-classify the rig it
+ran on, even when the snapshot dir is pulled out of context months
+later. `luce-bench snapshot` writes it into `host.json` and into
+each per-area `<area>.json` so individual area files self-describe.
+
+`null` when `/opt/lucebox-hub/HOST_INFO` is missing or malformed —
+the normal case for bare-metal dev builds that bypass the container
+entrypoint entirely. Containers launched by `lucebox.sh` always get a
+populated block; containers launched directly via `docker run` get a
+stub `{"source": "unknown", "collector": "entrypoint.sh", ...}`, so
+the block is present whenever the image entrypoint has run.
+
+Fields:
+
+- `os_pretty` — string|null. `PRETTY_NAME` from
+  `/etc/os-release`. e.g. `"Ubuntu 22.04.3 LTS"`.
+- `kernel` — string|null. `uname -r` on the host.
+- `wsl_version` — `"wsl1"`, `"wsl2"`, or `null`. `"wsl2"` matches
+  the modern `microsoft-standard-WSL2` kernel string; `"wsl1"` is
+  the legacy translation layer; `null` is bare Linux / macOS.
+- `docker_version` — string|null. Docker server version from
+  `docker version --format '{{.Server.Version}}'`.
+- `nvidia_driver` — string|null. Driver version from `nvidia-smi`.
+- `nvidia_ctk_version` — string|null. NVIDIA Container Toolkit
+  version (`nvidia-ctk --version`). Distinct from `docker_version` —
+  the runtime that wires GPUs into containers can lag behind the
+  daemon.
+- `cpu_model` — string|null. First `"model name"` from
+  `/proc/cpuinfo`.
+- `nproc` — int|null. Logical CPU count.
+- `ram_gb` — int|null. Total RAM in GB.
+- `gpus` — array of objects (possibly empty). One entry per
+  installed GPU; the array preserves nvidia-smi's enumeration
+  order. Per-entry fields: `index` (int), `uuid` (string),
+  `pci_bus_id` (string), `name` (string), `sm` (string,
+  compute capability like `"12.0"`), `vram_gb` (int),
+  `power_limit_w` (int, may differ from manufacturer spec when
+  the operator has set a power cap).
+- `cuda_visible_devices` — string|null. Mirrors the env var; `null`
+  means "all GPUs visible".
+- `source` — string. One of `"lucebox.sh"`, `"unknown"`. Indicates
+  how the block was populated.
+- `collector` — string. The script that wrote HOST_INFO: usually
+  `"lucebox.sh"` when the host wrapper drove the run, or
+  `"entrypoint.sh"` on the stub-only path.
+- `collected_at` — ISO 8601 UTC timestamp string.
+
 ## 5. Schema versioning
 
-`build_info` includes `props_schema=<n>`. The integer `n` bumps
-when the response shape changes in a way that breaks existing
-clients. The current schema is `2`.
+`build_info` includes `props_schema=<n>`, mirrored in
+`server.props_schema` and (schema 3+) `build.props_schema`. The
+integer `n` bumps on every breaking change and may also bump for
+additive ones (see §5.1); consumers should ignore unknown fields.
+The current schema is `4`.
 
 ### 5.0 Changelog
 
+- **`4`** — Additive over `3`. New top-level `host` object —
+  host-identity facts (OS, kernel, WSL version, docker version,
+  NVIDIA driver, NVIDIA Container Toolkit version, CPU model, nproc,
+  RAM, per-GPU array with UUID/PCI/SM/VRAM/power,
+  CUDA_VISIBLE_DEVICES) captured by `server/scripts/entrypoint.sh`
+  from the `LUCEBOX_HOST_*` env the host wrapper exports. `null` when
+  `/opt/lucebox-hub/HOST_INFO` is missing (bare-metal dev). Pre-4
+  consumers ignore the new key; new consumers (luce-bench's
+  snapshot subcommand in particular) gate on the version to know
+  the key is guaranteed-present (possibly `null`), and fall back to
+  a client-side hostinfo probe against pre-4 servers.
+- **`3`** — Additive over `2`. New top-level `build` object — a
+  structured replacement for the single-string `build_info` that
+  carries `git_sha`, `image_tag`, and `build_time` baked into the
+  container at build time. New `model.target` and `model.draft`
+  sub-objects carry full GGUF identity (path, `size_bytes`,
+  `sha256`, and `gguf.*` header fields including
+  `general.file_type[_name]`, `block_count`, `embedding_length`,
+  `context_length`, `vocab_size`). New `model.alias` field
+  (mirror of top-level `model_alias`). The pre-3 top-level
+  `build_info`, `model_path`, `model_alias`, and
+  `model.draft_path` stay verbatim for back-compat. Schema is
+  still bumped (vs leaving at `2`) so consumers can negotiate
+  the new fields and lucebench's preflight can switch its
+  display format based on the version.
 - **`2`** — `model_card` is now the wholesale on-disk sidecar JSON
   (or `null` when family/hard fallback was used). Runtime-resolved
   budget knobs that used to live under `model_card`
@@ -496,15 +726,20 @@ clients. The current schema is `2`.
   lives at `budget_envelope.model_card_source`.
 - **`1`** — Initial schema.
 
-### 5.1 Non-breaking changes (no version bump)
-
-- Adding a new top-level section or a new field inside an existing
-  section.
-- Adding a new entry to `api.endpoints` or `reasoning.supported_efforts`.
-- Loosening field bounds (e.g. extending an enum's allowed values).
-
-Clients are required to ignore unknown fields. The schema version
-does not bump for additive changes.
+### 5.1 Non-breaking changes
+
+Pure additive changes — new top-level section, new field inside
+an existing section, new entry in `api.endpoints` or
+`reasoning.supported_efforts`, loosened field bounds — historically
+did not bump `props_schema`. Schema 3 was the first deliberate
+exception: it is additive (new `build`, `model.target`,
+`model.draft`, `model.alias`) but bumps the version so consumers
+(lucebench's preflight in particular) can opt in to the new display
+when the fields are guaranteed-present, and fall back when talking
+to an older server; schema 4 (new `host`) follows suit. The rule
+going forward: **bumps are allowed for additive changes too** —
+older clients keep working because they ignore unknown fields; new
+clients gate on the version to know they can rely on the new shape.
 
 ### 5.2 Breaking changes (bump `props_schema`)
 
@@ -546,7 +781,42 @@ version increments.
       "max":    81408
     }
   },
-  "build_info": "luce-dflash v0.0.0+cpp props_schema=2",
+  "build": {
+    "server_name":    "luce-dflash",
+    "server_version": "0.0.0+cpp",
+    "props_schema":   4,
+    "git_sha":        "6d12378abc45678901234567890123456789abcd",
+    "image_tag":      "sha-6d12378-cuda12",
+    "image_digest":   null,
+    "build_time":     "2026-05-28T13:43:57Z"
+  },
+  "build_info": "luce-dflash v0.0.0+cpp props_schema=4",
+  "host": {
+    "os_pretty":          "Ubuntu 22.04.3 LTS",
+    "kernel":             "6.6.87.2-microsoft-standard-WSL2",
+    "wsl_version":        "wsl2",
+    "docker_version":     "29.1.3",
+    "nvidia_driver":      "596.36",
+    "nvidia_ctk_version": "1.16.2",
+    "cpu_model":          "Intel(R) Core(TM) Ultra 9 275HX",
+    "nproc":              24,
+    "ram_gb":             64,
+    "gpus": [
+      {
+        "index":          0,
+        "uuid":           "GPU-abc",
+        "pci_bus_id":     "00000000:01:00.0",
+        "name":           "NVIDIA GeForce RTX 5090 Laptop GPU",
+        "sm":             "12.0",
+        "vram_gb":        24,
+        "power_limit_w":  175
+      }
+    ],
+    "cuda_visible_devices": "0",
+    "source":             "lucebox.sh",
+    "collector":          "lucebox.sh",
+    "collected_at":       "2026-05-28T20:31:42Z"
+  },
   "capabilities": {
     "reasoning_supported":   true,
     "speculative_supported": true,
@@ -570,8 +840,41 @@ version increments.
   },
   "model": {
     "arch":         "qwen35",
-    "draft_path":   "/.../dflash-draft-3.6-q4_k_m.gguf",
-    "tokenizer_id": "qwen3"
+    "alias":        "dflash",
+    "draft_path":   "/.../dflash-draft-3.6-q8_0.gguf",
+    "tokenizer_id": "qwen3",
+    "target": {
+      "path":       "/.../Qwen3.6-27B-Q4_K_M.gguf",
+      "size_bytes": 17134510080,
+      "sha256":     "abc123def456789012345678901234567890abcd0123456789abcdef01234567",
+      "gguf": {
+        "general.architecture":         "qwen35",
+        "general.name":                 "Qwen3.6-27B",
+        "general.file_type":            15,
+        "general.file_type_name":       "Q4_K_M",
+        "general.quantization_version": 2,
+        "block_count":                  64,
+        "embedding_length":             5120,
+        "context_length":               65536,
+        "vocab_size":                   152064
+      }
+    },
+    "draft": {
+      "path":       "/.../dflash-draft-3.6-q8_0.gguf",
+      "size_bytes": 425000000,
+      "sha256":     "deadbeef00112233445566778899aabbccddeeff00112233445566778899aabb",
+      "gguf": {
+        "general.architecture":         "qwen3",
+        "general.name":                 "Qwen3-0.6B-DFlash-draft",
+        "general.file_type":            7,
+        "general.file_type_name":       "Q8_0",
+        "general.quantization_version": 2,
+        "block_count":                  28,
+        "embedding_length":             1024,
+        "context_length":               32768,
+        "vocab_size":                   152064
+      }
+    }
   },
   "model_alias": "dflash",
   "model_card": {
diff --git a/docs/specs/thinking-budget.md b/docs/specs/thinking-budget.md
index 5ebc731be..4079e264b 100644
--- a/docs/specs/thinking-budget.md
+++ b/docs/specs/thinking-budget.md
@@ -125,7 +125,7 @@ Fields:
 | `verified_at` | ISO date the values were last checked against the source. |
 | `max_tokens` | The card's standard recommended combined cap. Drives `default_max_tokens`. |
 | `complex_problem_max_tokens` | Optional. The card's recommendation for hard reasoning / benchmark workloads. Drives the `x-high` and `max` effort tiers, which sit *above* `default_max_tokens` when this field is present — they are admissible as long as they fit under `max_ctx − hard_limit_reply_budget`. If omitted, both collapse to the `high` tier value. |
-| `hard_limit_reply_budget` | Optional. Tokens reserved post-`</think>` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with `</think>`). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `server/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. |
+| `hard_limit_reply_budget` | Optional. Tokens reserved post-`</think>` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with `</think>`). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. |
 | `sampling` | Recommended sampler params. Used as defaults when the request doesn't pin sampler values. |
 | `reasoning_effort_tiers` | Explicit phase-1 budgets per tier. Override any computed default. Whichever tiers are present win; missing tiers fall through to the computed defaults below. |
 
diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
index 1349109ad..6fecc3752 100644
--- a/server/src/server/chat_template.cpp
+++ b/server/src/server/chat_template.cpp
@@ -51,7 +51,7 @@ ChatFormat chat_format_for_arch(const std::string & arch) {
     return ChatFormat::QWEN3;
 }
 
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt,
@@ -59,6 +59,10 @@ std::string render_chat_template(
     const std::string & tools_json)
 {
     std::string result;
+    // `started_in_thinking` is derived deterministically from the template
+    // branch + render flags below. Set per format inside the switch so a
+    // future format addition can't silently miss the wiring.
+    bool started_in_thinking = false;
     bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";
 
     switch (format) {
@@ -141,6 +145,14 @@ std::string render_chat_template(
                 // even when the client opts in, defeating the thinking-budget
                 // mechanism entirely.
                 result += "<think>\n";
+                // The prompt suffix pre-opens `<think>` — the model's very
+                // first generated token is reasoning, never preceded by an
+                // explicit `<think>` opener in the stream. Callers must
+                // start the SSE state machine in REASONING mode and pass
+                // `started_in_thinking=true` to parse_reasoning() so that
+                // reasoning text routes to reasoning_content instead of
+                // leaking into content.
+                started_in_thinking = true;
             }
         }
         break;
@@ -224,6 +236,11 @@ std::string render_chat_template(
             result += "<assistant>\n";
             if (enable_thinking) {
                 result += "<think>";
+                // Same situation as Qwen3.6: Laguna XS.2's enable_thinking
+                // generation prompt ends with `<think>` so the model starts
+                // emitting reasoning tokens with no explicit opener in the
+                // stream. Route subsequent tokens to the reasoning channel.
+                started_in_thinking = true;
             } else {
                 // Empty think block — model jumps straight to answer.
                 result += "</think>";
@@ -311,11 +328,17 @@ std::string render_chat_template(
                 result += "<|channel>thought\n<channel|>";
             }
         }
+        // Gemma4 does NOT pre-open `<think>` from the prompt; its
+        // reasoning channel is opened by the model emitting `<|channel>`
+        // which http_server forwards into the SseEmitter as the text
+        // `<think>` — so the emitter's existing CONTENT→REASONING
+        // transition fires on that synthesized opener. started_in_thinking
+        // stays false (initial CONTENT mode is correct).
         break;
     }
     }
 
-    return result;
+    return PromptRenderResult{std::move(result), started_in_thinking};
 }
 
 // ─── Jinja path ─────────────────────────────────────────────────────────
@@ -353,7 +376,29 @@ static std::shared_ptr<jinja::program> get_or_parse(const std::string & template
 
 }  // namespace
 
-std::string render_chat_template_jinja(
+// Report whether the rendered prompt `s` ends with a `<think>` opener,
+// ignoring trailing ASCII whitespace (Qwen3.6 emits `<think>\n`), so the
+// caller can route subsequent stream tokens to the reasoning channel.
+// True ⇒ the prompt pre-opened the reasoning channel. The Jinja renderer
+// below additionally logs a warning when that happens even though
+// enable_thinking=false, so a model-card mismatch is visible at runtime.
+static bool prompt_ends_with_think_open(const std::string & s) {
+    static const std::string OPEN = "<think>";
+    // Walk back over trailing ASCII whitespace (space, LF, CR, tab).
+    size_t end = s.size();
+    while (end > 0) {
+        char c = s[end - 1];
+        if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
+            end--;
+        } else {
+            break;  // first non-whitespace byte counted from the end
+        }
+    }
+    if (end < OPEN.size()) return false;  // too short to contain the opener
+    return s.compare(end - OPEN.size(), OPEN.size(), OPEN) == 0;
+}
+
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,
@@ -407,14 +452,43 @@ std::string render_chat_template_jinja(
         throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
     }
 
+    std::string rendered;
     try {
         jinja::runtime rt(ctx);
         jinja::value results = rt.execute(*prog);
         auto parts = jinja::runtime::gather_string_parts(results);
-        return parts->as_string().str();
+        rendered = parts->as_string().str();
     } catch (const std::exception & e) {
         throw std::runtime_error(std::string("jinja runtime: ") + e.what());
     }
+
+    // Jinja path: we don't know which template family the caller passed
+    // in, so derive `started_in_thinking` by sniffing the rendered tail
+    // for a `<think>` opener. This catches the common Qwen3.6 / Laguna
+    // chat templates that end with `<think>\n` when enable_thinking is
+    // honored, plus any custom template that follows the same convention.
+    //
+    // The sniff is the source of truth — if the rendered prompt ends
+    // with `<think>`, the model's first generated token is reasoning
+    // regardless of the `enable_thinking` flag we passed in. A template
+    // that hard-codes `<think>` even with enable_thinking=false will
+    // still pre-open the channel, and we must route accordingly to
+    // avoid leaking reasoning into the content stream.
+    //
+    // Warn only on the mismatch case (sniff=true, enable_thinking=false)
+    // so a template/model-card disagreement surfaces in server logs
+    // without spamming the normal-success path.
+    bool started_in_thinking =
+        add_generation_prompt && prompt_ends_with_think_open(rendered);
+    if (started_in_thinking && !enable_thinking) {
+        std::fprintf(stderr,
+            "[WARN] render_chat_template_jinja: rendered prompt ends with "
+            "`<think>` opener despite enable_thinking=false — treating as "
+            "started_in_thinking=true. Check the template's enable_thinking "
+            "branch or the model card's reasoning configuration.\n");
+    }
+
+    return PromptRenderResult{std::move(rendered), started_in_thinking};
 }
 
 }  // namespace dflash::common
diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h
index ca7ef9db5..770e65a42 100644
--- a/server/src/server/chat_template.h
+++ b/server/src/server/chat_template.h
@@ -27,6 +27,23 @@ enum class ChatFormat {
     GEMMA4,    // <bos><|turn>role\n...<turn|>\n
 };
 
+// Provenance for a rendered prompt. `text` is the byte string that gets
+// tokenized; `started_in_thinking` records whether the prompt suffix
+// pre-opens a `<think>` block (or equivalent reasoning-channel marker)
+// that the model is expected to continue into.
+//
+// Callers route this into the SseEmitter's initial mode and into
+// parse_reasoning()'s `started_in_thinking` argument so reasoning text
+// emitted before any explicit `<think>` opener is still attributed to
+// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna
+// enable_thinking prompts (which pre-open `<think>` in the assistant
+// turn) cause the model to emit reasoning straight into the content
+// channel, leaving `reasoning_content` empty.
+struct PromptRenderResult {
+    std::string text;            // rendered prompt text, ready to tokenize
+    bool started_in_thinking;    // true when the prompt suffix pre-opens the reasoning channel (no default value)
+};
+
 // Render chat messages into the model-specific prompt string.
 // The result is plain text ready to be tokenized.
 //
@@ -40,7 +57,7 @@ enum class ChatFormat {
 // `tools_json` is an optional JSON string containing the tool definitions
 // array. When non-empty, the Qwen3/3.5 template injects a tool preamble
 // into the system message instructing the model how to emit <tool_call> tags.
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt = true,
@@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch);
 // Internally caches the most recently parsed program per thread (avoids
 // re-parsing the template on every request). Throws std::runtime_error on
 // lexer/parser/runtime failure (caller should surface a 500 response).
-std::string render_chat_template_jinja(
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index 62f1b91ff..ef49568b9 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -257,12 +257,31 @@ static bool curl_forward(int client_fd, const std::string & url,
 //
 // SERVER_NAME / SERVER_VERSION mirror the Python server's identity strings
 // so cross-server consumers (autotune, dashboards) see a stable
-// `build_info` shape. Bump PROPS_SCHEMA on breaking changes only:
-//   - field renamed
-//   - field removed
-//   - existing field's semantics change (units, nullability, type)
-// Do NOT bump for additive changes (new fields, new sections).
-static constexpr int  kPropsSchema  = 2;
+// `build_info` shape. Bump PROPS_SCHEMA when the response shape changes
+// — either:
+//   - breaking: field renamed, removed, or its semantics changed
+//     (units, nullability, type tightening)
+//   - additive (new fields / sections) when downstream consumers need
+//     to negotiate the new shape. Pre-bump consumers keep working
+//     because they ignore unknown fields; the bump signals "the new
+//     fields are guaranteed-present at this version or higher" so
+//     code like lucebench's preflight can opt in to the richer display.
+//
+// Schema 3 (additive vs 2): new top-level `build` block (structured
+// version of `build_info` with git_sha/image_tag/build_time), and new
+// `model.target` / `model.draft` GGUF-identity sub-objects carrying
+// size_bytes + sha256 + gguf header fields. The pre-3 top-level
+// `build_info`, `model_path`, `model_alias`, and `model.draft_path`
+// are preserved verbatim for back-compat.
+//
+// Schema 4 (additive vs 3): new top-level `host` block — verbatim
+// pass-through of /opt/lucebox-hub/HOST_INFO (written by
+// server/scripts/entrypoint.sh from the LUCEBOX_HOST_* env the host
+// wrapper probes). Null when HOST_INFO is missing (bare-metal dev or
+// manual docker run that bypasses entrypoint). luce-bench's snapshot
+// subcommand uses the version bump to gate on the new shape — pre-4
+// servers force a client-side fallback probe.
+static constexpr int  kPropsSchema  = 4;
 static constexpr char kServerName[] = "luce-dflash";
 #ifndef DFLASH_SERVER_VERSION
 #define DFLASH_SERVER_VERSION "0.0.0+cpp"
@@ -393,6 +412,8 @@ static std::string build_stall_tool_prefix(const json & tools,
     return prefix;
 }
 
+
+
 // Build the /props response body.
 //
 // Non-static so unit tests can call it directly (declared in http_server.h).
@@ -439,6 +460,34 @@ json build_props_body(const ServerConfig & config,
         {"props_schema", kPropsSchema},
     };
 
+    // Structured replacement for the single-string `build_info` (schema 3+).
+    // Reads image identity stashed by server_main from /opt/lucebox-hub/
+    // IMAGE_INFO when the binary is running inside a Docker image built by
+    // docker-bake.hcl. On bare-metal / dev builds, image_info is null, so
+    // git_sha / image_tag / build_time are emitted as null; the keys are
+    // always present for shape stability.
+    auto pull_string = [&](const char * field) -> json {
+        if (!config.image_info.is_object()) return nullptr;
+        auto it = config.image_info.find(field);
+        if (it == config.image_info.end()) return nullptr;
+        if (!it->is_string()) return nullptr;  // non-string values are reported as null
+        const std::string & s = it->get_ref<const std::string &>();
+        if (s.empty()) return nullptr;  // empty string is reported as null too
+        return s;
+    };
+    json build_block = {
+        {"server_name",    kServerName},
+        {"server_version", DFLASH_SERVER_VERSION},
+        {"props_schema",   kPropsSchema},
+        {"git_sha",        pull_string("git_sha")},
+        {"image_tag",      pull_string("image_tag")},
+        // image_digest is reserved and always null here: an image is only
+        // content-addressable after push, and the running container would
+        // have to query the Docker socket for it, which we don't do today.
+        {"image_digest",   nullptr},
+        {"build_time",     pull_string("build_time")},
+    };
+
     json pflash;
     if (!pflash_enabled) {
         pflash = {
@@ -502,12 +551,29 @@ json build_props_body(const ServerConfig & config,
         {"model_path",  config.model_path},
         {"build_info",  std::string(kServerName) + " v" DFLASH_SERVER_VERSION
                         " props_schema=" + std::to_string(kPropsSchema)},
+        {"build",       build_block},
         {"speculative_mode", speculative_mode},
         {"server", server},
         {"model", {
             {"arch",         config.arch},
+            // `alias` mirrors top-level `model_alias` for grouping under
+            // `model`. The top-level field stays for back-compat (clients
+            // already grep for `model_alias`); new consumers should prefer
+            // `model.alias` since that's where all the model identity
+            // (arch, target, draft, tokenizer_id) lives.
+            {"alias",        config.model_name},
+            // Back-compat: pre-schema-3 readers grep `model.draft_path`
+            // directly. New shape exposes the same path under
+            // `model.draft.path` along with size/sha256/header fields.
             {"draft_path",   config.draft_path.empty() ? json(nullptr) : json(config.draft_path)},
             {"tokenizer_id", config.tokenizer_id.empty() ? json(nullptr) : json(config.tokenizer_id)},
+            // Schema 3 additions. Always emitted; `target` is null if the
+            // GGUF couldn't be inspected at startup (rare — implies a load
+            // failure that should have aborted boot). `draft` is null when
+            // no draft GGUF is loaded (`--draft` not passed), which is the
+            // normal target-only configuration for laguna / qwen3.6-moe.
+            {"target", config.target_gguf.is_null() ? json(nullptr) : config.target_gguf},
+            {"draft",  config.draft_gguf.is_null()  ? json(nullptr) : config.draft_gguf},
         }},
         {"runtime", {
             {"backend",         config.runtime_backend.empty() ? "cuda" : config.runtime_backend},
@@ -595,6 +661,13 @@ json build_props_body(const ServerConfig & config,
         // The C++ daemon is linked in-process; if /props is responding,
         // the daemon is alive by construction.
         {"daemon", {{"alive", true}}},
+        // Host identity (schema 4+). Verbatim pass-through of
+        // /opt/lucebox-hub/HOST_INFO — see server_main::read_host_info
+        // and entrypoint.sh::write_host_info. Null when HOST_INFO is
+        // missing or malformed; null is the explicit "bare metal dev"
+        // signal that luce-bench's snapshot uses to trigger a
+        // client-side fallback probe.
+        {"host", config.host_info.is_null() ? json(nullptr) : config.host_info},
         {"api", {{"endpoints", kApiEndpoints}}},
         // Capability flags surfaced for clients that don't want to crack
         // open `reasoning` / `speculative` / etc. — matches the Python
@@ -697,6 +770,7 @@ std::vector<ChatMessage> normalize_chat_messages(
             cm.role = m.value("role", "user");
 
             bool replayed = false;
+            // OpenAI format: assistant message with tool_calls field.
             if (cm.role == "assistant" && m.contains("tool_calls") &&
                 m["tool_calls"].is_array() && !m["tool_calls"].empty()) {
                 std::vector<std::string> call_ids;
@@ -711,6 +785,43 @@ std::vector<ChatMessage> normalize_chat_messages(
                 }
             }
 
+            // Anthropic format: assistant message with tool_use content blocks.
+            // IDs in tool_use blocks match the IDs stored in tool_memory when
+            // this server emitted the tool calls. Look them up to get the raw
+            // model output (already formatted for the model's chat template).
+            if (!replayed && cm.role == "assistant" &&
+                m.contains("content") && m["content"].is_array()) {
+                std::vector<std::string> call_ids;
+                for (const auto & part : m["content"]) {
+                    if (part.value("type", "") == "tool_use") {
+                        std::string id = part.value("id", "");
+                        if (!id.empty()) call_ids.push_back(id);
+                    }
+                }
+                if (!call_ids.empty()) {
+                    std::string raw = tool_memory.lookup(call_ids);
+                    if (!raw.empty()) {
+                        cm.content = raw;
+                        replayed = true;
+                    } else {
+                        // tool_memory miss (cross-session replay): synthesize
+                        // from the block fields using the model's tool_call XML.
+                        for (const auto & part : m["content"]) {
+                            if (part.value("type", "") == "tool_use") {
+                                json input = part.contains("input")
+                                    ? part["input"] : json::object();
+                                cm.content += "<tool_call>\n";
+                                cm.content += render_tool_call_xml(
+                                    part.value("name", ""), input);
+                                cm.content += "</tool_call>\n";
+                            }
+                        }
+                        replayed = !cm.content.empty();
+                    }
+                }
+            }
+
+            bool has_tool_results = false;
             if (!replayed) {
                 if (m.contains("content") && m["content"].is_string()) {
                     cm.content = m["content"].get<std::string>();
@@ -720,16 +831,45 @@ std::vector<ChatMessage> normalize_chat_messages(
                         if (ptype == "text" || ptype == "input_text" ||
                             ptype == "output_text") {
                             cm.content += part.value("text", "");
+                        } else if (ptype == "tool_result") {
+                            // Anthropic format: tool result inside a user
+                            // message. Push as a tool-role message so the
+                            // chat template wraps it in <tool_response> tags.
+                            has_tool_results = true;
+                            std::string result_content;
+                            if (part.contains("content")) {
+                                if (part["content"].is_string()) {
+                                    result_content =
+                                        part["content"].get<std::string>();
+                                } else if (part["content"].is_array()) {
+                                    for (const auto & c : part["content"]) {
+                                        if (c.value("type", "") == "text") {
+                                            result_content +=
+                                                c.value("text", "");
+                                        }
+                                    }
+                                }
+                            }
+                            std::string result_id =
+                                part.value("tool_use_id", "");
+                            chat_msgs.push_back(
+                                {"tool", result_content, result_id});
                         }
                     }
                 }
             }
 
-            if (format == ApiFormat::RESPONSES &&
-                (cm.role == "system" || cm.role == "developer")) {
-                system_parts.push_back(cm.content);
-            } else {
-                chat_msgs.push_back(std::move(cm));
+            // Skip pushing an empty user container when all content was
+            // tool_result blocks (already pushed as individual tool messages).
+            bool skip = (cm.role == "user" && has_tool_results &&
+                         cm.content.empty());
+            if (!skip) {
+                if (format == ApiFormat::RESPONSES &&
+                    (cm.role == "system" || cm.role == "developer")) {
+                    system_parts.push_back(cm.content);
+                } else {
+                    chat_msgs.push_back(std::move(cm));
+                }
             }
         }
     } else if (messages.is_string()) {
@@ -1591,7 +1731,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             tools_json = req.tools.dump();
         }
 
-        std::string rendered;
+        PromptRenderResult render_result;
         if (!config_.chat_template_src.empty()) {
             // Jinja path: caller supplied a chat template file via
             // --chat-template-file. Override the hardcoded QWEN3/LAGUNA
@@ -1608,7 +1748,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 ? tokenizer_.raw_token(tokenizer_.eos_id())
                 : std::string();
             try {
-                rendered = render_chat_template_jinja(
+                render_result = render_chat_template_jinja(
                     config_.chat_template_src,
                     chat_msgs,
                     bos_str,
@@ -1622,11 +1762,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 return true;
             }
         } else {
-            rendered = render_chat_template(chat_msgs, chat_format_,
-                                            true, enable_thinking,
-                                            tools_json);
-        }
-        req.prompt_tokens = tokenizer_.encode(rendered);
+            render_result = render_chat_template(chat_msgs, chat_format_,
+                                                 true, enable_thinking,
+                                                 tools_json);
+        }
+        // Propagate prompt provenance so the SseEmitter's initial mode
+        // matches the template's pre-opened reasoning channel (Qwen3.6 /
+        // Laguna enable_thinking case). Without this, reasoning text
+        // leaks into the content channel and `reasoning_content` stays
+        // empty — see fix(server): route Qwen3.6/Laguna think-mode
+        // reasoning to reasoning_content channel.
+        req.started_in_thinking = render_result.started_in_thinking;
+        req.prompt_tokens = tokenizer_.encode(render_result.text);
 
         // count_tokens: short-circuit after tokenization. Skip generation
         // entirely — Anthropic's contract is just `{"input_tokens": N}`.
@@ -1770,11 +1917,20 @@ void HttpServer::worker_loop() {
             }
         }
 
-        // Create SSE emitter for streaming state machine.
+        // Create SSE emitter for streaming state machine. `initial_mode`
+        // tracks whether the chat-template prompt pre-opened a `<think>`
+        // block (Qwen3.6 / Laguna enable_thinking path). When true, the
+        // emitter starts in REASONING so the model's first generated
+        // token routes to reasoning_content even though no explicit
+        // `<think>` opener appears in the token stream.
+        const StreamMode initial_mode = req.started_in_thinking
+            ? StreamMode::REASONING
+            : StreamMode::CONTENT;
         SseEmitter emitter(req.format, req.response_id, req.model,
                            (int)req.prompt_tokens.size(), req.tools,
                            &tool_memory_,
-                           req.stop_sequences);
+                           req.stop_sequences,
+                           initial_mode);
 
         // Emit initial SSE events (skip when proxying).
         if (req.stream && config_.pflash_upstream_base.empty()) {
@@ -2235,6 +2391,7 @@ void HttpServer::worker_loop() {
         }
 #endif // DFLASH_HAS_CURL
 
+
         // Build generate request.
         //
         // Thinking-budget v2 (Level 2): when caller opts in via
@@ -2252,19 +2409,21 @@ void HttpServer::worker_loop() {
         const int effective_think_ceiling = (req.per_req_phase1_cap >= 0)
             ? req.per_req_phase1_cap
             : config_.think_max_tokens;
-        // The effective per-request reply budget is the operator's choice
-        // (CLI / sidecar / per-request override). The AR loop force-closes
-        // when `n_gen - generated <= eff_reply`, which means n_gen must
-        // include BOTH the think budget AND the reply reserve. Without the
-        // `+ eff_reply` term, force-close fires immediately when
-        // `eff_reply == effective_think_ceiling` (e.g. think_max=4096,
-        // hard_limit=4096 → remaining starts at 4096, condition fires
-        // before the model emits a single thinking token). Spec §4.4.
+        // When thinking is active, max_tokens is the *response* budget only —
+        // thinking tokens are additive. n_gen = think_ceiling + response_budget,
+        // where response_budget = min(max_tokens, hard_limit_reply_budget).
+        // This prevents immediate force-close on benchmarks whose max_tokens
+        // were sized for nothink responses (e.g. gsm8k=2048, agent_recorded=4096).
+        // Without this, n_gen = min(think+reply, max_tokens) would cap n_gen
+        // below the hard_limit threshold, firing force-close at step 0. Spec §4.4.
         const int eff_reply_for_n_gen = (req.per_req_reply_budget >= 0)
             ? req.per_req_reply_budget
             : config_.hard_limit_reply_budget;
+        const int response_budget = budget_active
+            ? std::min(req.max_output, eff_reply_for_n_gen)
+            : req.max_output;
         const int n_gen_cap = budget_active
-            ? std::min(effective_think_ceiling + eff_reply_for_n_gen, req.max_output)
+            ? effective_think_ceiling + response_budget
             : req.max_output;
 
         GenerateRequest gen_req;
@@ -2620,8 +2779,9 @@ void HttpServer::worker_loop() {
 
             const std::string & raw = tokenizer_.raw_token(token);
 
-            // Gemma4 thinking channel: map <|channel> → <think>, <channel|> → </think>\n
-            if (raw == "<|channel>") {
+            // Gemma4 thinking channel: map <|channel>* → <think>, <channel|> → </think>\n
+            // raw vocab token is "<|channel>thought", not just "<|channel>".
+            if (raw.rfind("<|channel>", 0) == 0) {
                 visible_output_seen = true;
                 broadcast_token("<think>");
                 if (req.stream) {
@@ -2811,15 +2971,18 @@ void HttpServer::worker_loop() {
             }
         }
 
-        // close_kind reflects the Level 2 BudgetHook outcome: "hard" when
-        // the backend's AR/spec decode injected the close-token sequence
-        // at the budget boundary, "natural" when the model self-closed
-        // (or the request never opted in). Emitted as part of
-        // finish_details for thinking-budget callers.
-        std::string close_kind =
-            (req.thinking_opt_in && result.budget_forced_close)
-                ? "hard"
-                : "natural";
+        // close_kind reflects the Level 2 BudgetHook outcome:
+        //   "natural" — the model emitted </think> on its own (or the
+        //               request never opted in to the envelope).
+        //   "hard"    — the budget edge was reached and the AR loop
+        //               forced </think> in. Original Level 2 behavior.
+        // Soft-close (Level 2.5) lives on a sibling branch; this PR
+        // reports the natural/hard split that landed first. Emitted as
+        // part of finish_details for thinking-budget callers.
+        std::string close_kind = "natural";
+        if (req.thinking_opt_in && result.budget_forced_close) {
+            close_kind = "hard";
+        }
 
         // Finalize.
         // Per-request wall-clock timings forwarded to the response's
@@ -2868,8 +3031,8 @@ void HttpServer::worker_loop() {
                     const std::string & raw = tokenizer_.raw_token(tok);
                     if (tok == tokenizer_.eos_id()) continue;
                     if (tok == tokenizer_.eos_chat_id()) continue;
-                    // Gemma4 channel → think mapping
-                    if (raw == "<|channel>") { emitter.emit_token("<think>"); continue; }
+                    // Gemma4 channel → think mapping; raw token is "<|channel>thought"
+                    if (raw.rfind("<|channel>", 0) == 0) { emitter.emit_token("<think>"); continue; }
                     if (raw == "<channel|>") { emitter.emit_token("</think>\n"); continue; }
                     // Qwen3.6 thinking tokens (id 248068 / 248069) — must
                     // forward as text so the emitter transitions
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 49fcafb6a..db5b48ff7 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -175,6 +175,39 @@ struct ServerConfig {
     // the Anthropic tool_use envelope, e.g. froggeric Qwen3.6 template.
     std::string chat_template_src;          // literal Jinja source (loaded from file)
     std::string chat_template_path;         // path it was loaded from (logged at startup)
+
+    // ── /props identity payloads (filled by server_main at startup) ──
+    //
+    // `target_gguf` / `draft_gguf`: JSON blobs produced by reading the GGUF
+    // header + sha256 for each loaded model. Surface verbatim under
+    // /props.model.target / /props.model.draft so an operator can pin the
+    // exact weights + quant + sha from a single curl. Empty/null when
+    // not loaded; `draft_gguf` is null when --draft was not passed.
+    // See docs/specs/props-endpoint.md §4.8 and build_props_body().
+    nlohmann::json target_gguf = nullptr;
+    nlohmann::json draft_gguf  = nullptr;
+
+    // `image_info`: container/image identity read from /opt/lucebox-hub/
+    // IMAGE_INFO at server start. Three lines: git_sha, image_tag,
+    // build_time (ISO 8601). Object with one string field per non-empty
+    // line, or null when the file is missing or blank (e.g. local
+    // non-Docker builds). Surfaced under /props.build. Path overridable
+    // via $DFLASH_IMAGE_INFO_PATH for tests.
+    nlohmann::json image_info = nullptr;
+
+    // `host_info`: host-identity facts read from /opt/lucebox-hub/HOST_INFO
+    // at server start. JSON object written by server/scripts/entrypoint.sh
+    // from the LUCEBOX_HOST_* env vars the host wrapper exports. Surfaced
+    // verbatim under /props.host so every benchmark snapshot can self-
+    // classify the rig it ran on (OS, kernel, WSL version, GPU list with
+    // per-GPU UUID/PCI/SM/VRAM/power, nvidia driver + CTK versions).
+    // Null when the file is missing or is not a JSON object — in
+    // practice only the bare-metal dev path that bypasses entrypoint
+    // entirely: a plain `docker run` without lucebox.sh still gets a
+    // stub {"source":"unknown",...} from entrypoint. Path overridable
+    // via $DFLASH_HOST_INFO_PATH for tests. See HOST_INFO doc at
+    // docs/specs/props-endpoint.md §4.10.
+    nlohmann::json host_info = nullptr;
 };
 
 // ─── Parsed request ─────────────────────────────────────────────────────
@@ -215,6 +248,12 @@ struct ParsedRequest {
     // Bandit: per-session adaptive keep_ratio opt-in
     std::string               session_id;
     DiskPrefixCachePolicy     disk_cache_policy;
+    // Set by the chat-template renderer when the rendered prompt suffix
+    // pre-opens a `<think>` block (Qwen3.6 / Laguna enable_thinking path).
+    // Drives the SseEmitter's initial mode so reasoning tokens emitted
+    // before any explicit `<think>` opener route to reasoning_content
+    // instead of leaking into content.
+    bool                      started_in_thinking = false;
 };
 
 // Build the /props response body. Exposed (non-static) so unit tests
diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp
index 2c7dc850f..b5ede8338 100644
--- a/server/src/server/server_main.cpp
+++ b/server/src/server/server_main.cpp
@@ -25,15 +25,20 @@
 #include "gguf.h"
 
 #include <algorithm>
+#include <nlohmann/json.hpp>
+
+
 #include <csignal>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <memory>
 #include <string>
 #include <vector>
 
 using namespace dflash::common;
+using nlohmann::json;
 
 // Global server pointer for signal handling.
 static HttpServer * g_server = nullptr;
@@ -45,6 +50,87 @@ static void signal_handler(int sig) {
     }
 }
 
+// Convert a GgufMetadata into the JSON object published under
+// /props.model.target / /props.model.draft (schema 3+). Empty strings and
+// negative integers become JSON null so consumers can tell "key absent
+// from the GGUF" from a real 0 (context_length / vocab_size especially).
+static json gguf_metadata_to_json(const GgufMetadata & m) {
+    const auto text = [](const std::string & s) -> json {
+        if (s.empty()) return nullptr;
+        return s;
+    };
+    const auto num = [](int32_t v) -> json {
+        if (v < 0) return nullptr;
+        return v;
+    };
+    json header = json::object();
+    header["general.architecture"]         = text(m.general_architecture);
+    header["general.name"]                 = text(m.general_name);
+    header["general.file_type"]            = num(m.file_type);
+    header["general.file_type_name"]       = text(m.file_type_name);
+    header["general.quantization_version"] = num(m.quantization_version);
+    header["block_count"]                  = num(m.block_count);
+    header["embedding_length"]             = num(m.embedding_length);
+    header["context_length"]               = num(m.context_length);
+    header["vocab_size"]                   = num(m.vocab_size);
+    json out = json::object();
+    out["path"]       = m.path;
+    out["size_bytes"] = m.size_bytes < 0 ? json(nullptr) : json(m.size_bytes);
+    out["sha256"]     = text(m.sha256);
+    out["gguf"]       = header;
+    return out;
+}
+
+// Read /opt/lucebox-hub/IMAGE_INFO (three lines: git_sha, image_tag,
+// build_time) into a JSON object surfaced under /props.build. Trailing
+// CR / space / tab is stripped from each line so a CRLF-encoded file
+// cannot leak "\r" into /props; a field whose line is empty is omitted.
+// Returns JSON null when the file is missing, unreadable, or blank.
+// Path override via $DFLASH_IMAGE_INFO_PATH (set by tests).
+static json read_image_info() {
+    const char * env_path = std::getenv("DFLASH_IMAGE_INFO_PATH");
+    const std::string path = (env_path && *env_path)
+        ? std::string(env_path)
+        : std::string("/opt/lucebox-hub/IMAGE_INFO");
+    std::ifstream f(path);
+    if (!f) return nullptr;
+    json out = json::object();
+    // One field per line, in file order.
+    for (const char * key : {"git_sha", "image_tag", "build_time"}) {
+        std::string line;
+        std::getline(f, line);  // `line` stays empty once EOF is hit
+        // Drop trailing CR / blanks (CRLF checkouts, stray padding).
+        const auto end = line.find_last_not_of(" \t\r");
+        line.erase(end == std::string::npos ? 0 : end + 1);
+        if (!line.empty()) out[key] = line;
+    }
+    // Blank file: report null so /props doesn't carry an empty blob.
+    if (out.empty()) return nullptr;
+    return out;
+}
+
+// Read /opt/lucebox-hub/HOST_INFO (JSON written by server/scripts/
+// entrypoint.sh from the LUCEBOX_HOST_* env vars the host wrapper
+// exports) into a JSON object surfaced verbatim under /props.host.
+// Returns JSON null when the file is missing, fails to parse, or is not
+// a JSON object, so bad operator input never aborts startup. Path
+// override via $DFLASH_HOST_INFO_PATH for unit tests.
+static json read_host_info() {
+    const char * env_path = std::getenv("DFLASH_HOST_INFO_PATH");
+    const std::string path = (env_path && *env_path)
+        ? std::string(env_path)
+        : std::string("/opt/lucebox-hub/HOST_INFO");
+    std::ifstream f(path);
+    if (!f) return nullptr;
+    try {
+        json out = json::parse(f);
+        if (!out.is_object()) return nullptr;
+        return out;
+    } catch (const json::exception &) {  // parse_error + out_of_range.406
+        return nullptr;
+    }
+}
+
 static bool parse_double_list(const char * value, std::vector<double> & out) {
     out.clear();
     if (!value || !*value) return false;
@@ -1057,6 +1143,94 @@ int main(int argc, char ** argv) {
     // expose the GGUF metadata key it was loaded from, so leave empty
     // and let /props report null. (Add a getter on Tokenizer later.)
 
+    // ── /props identity payloads ────────────────────────────────────────
+    //
+    // Target + draft GGUF identity for /props.model.target / .draft. The
+    // hash is the slow part (~30s per multi-GB file on NVMe); the GGUF
+    // header read is cheap. Cached in a sidecar `<path>.sha256` so
+    // subsequent restarts skip the rehash.
+    //
+    // `DFLASH_SKIP_SHA256=1` env disables hashing entirely — useful when
+    // benchmarking server cold-start latency or when the model dir is
+    // read-only (no place to write the sidecar). Leaves `sha256` as JSON
+    // null in /props; the other identity fields still populate.
+    const bool skip_sha = []() {
+        const char * v = std::getenv("DFLASH_SKIP_SHA256");
+        return v && *v && std::strcmp(v, "0") != 0;
+    }();
+    if (bargs.model_path && *bargs.model_path) {
+        std::fprintf(stderr,
+            "[server] inspecting target GGUF for /props%s\n",
+            skip_sha ? " (sha256 disabled by $DFLASH_SKIP_SHA256)" : "");
+        GgufMetadata tm = read_gguf_metadata(bargs.model_path, !skip_sha);
+        if (tm.ok) {
+            sconfig.target_gguf = gguf_metadata_to_json(tm);
+            std::fprintf(stderr,
+                "[server] target gguf: %s size=%lld sha=%s%s ftype=%s\n",
+                tm.path.c_str(), (long long)tm.size_bytes,
+                tm.sha256.empty() ? "(skipped)" : tm.sha256.substr(0, 12).c_str(),
+                tm.sha256.empty() ? "" : "...",
+                tm.file_type_name.empty() ? "?" : tm.file_type_name.c_str());
+        } else {
+            std::fprintf(stderr,
+                "[server] WARNING: could not read target GGUF metadata: %s\n",
+                bargs.model_path);
+        }
+    }
+    if (bargs.draft_path && *bargs.draft_path) {
+        std::fprintf(stderr,
+            "[server] inspecting draft GGUF for /props%s\n",
+            skip_sha ? " (sha256 disabled by $DFLASH_SKIP_SHA256)" : "");
+        GgufMetadata dm = read_gguf_metadata(bargs.draft_path, !skip_sha);
+        if (dm.ok) {
+            sconfig.draft_gguf = gguf_metadata_to_json(dm);
+            std::fprintf(stderr,
+                "[server] draft gguf: %s size=%lld sha=%s%s ftype=%s\n",
+                dm.path.c_str(), (long long)dm.size_bytes,
+                dm.sha256.empty() ? "(skipped)" : dm.sha256.substr(0, 12).c_str(),
+                dm.sha256.empty() ? "" : "...",
+                dm.file_type_name.empty() ? "?" : dm.file_type_name.c_str());
+        } else {
+            std::fprintf(stderr,
+                "[server] WARNING: could not read draft GGUF metadata: %s\n",
+                bargs.draft_path);
+        }
+    }
+    // nlohmann::json::value<std::string>() throws type_error when the
+    // key exists but isn't a string (e.g. `"kernel": null`). IMAGE_INFO
+    // and HOST_INFO are sourced from operator-provided env via
+    // entrypoint.sh, so we can't assume well-typed values — fall back
+    // to "(none)" on any missing-or-non-string field rather than
+    // crashing server startup.
+    auto json_str_or_none = [](const json & j, const char * key) -> const char * {
+        auto it = j.find(key);
+        if (it == j.end() || !it->is_string()) {
+            return "(none)";
+        }
+        return it->get_ref<const std::string &>().c_str();
+    };
+
+    // Container/image identity (Dockerfile bakes /opt/lucebox-hub/IMAGE_INFO).
+    sconfig.image_info = read_image_info();
+    if (!sconfig.image_info.is_null()) {
+        std::fprintf(stderr,
+            "[server] image_info: git_sha=%s image_tag=%s build_time=%s\n",
+            json_str_or_none(sconfig.image_info, "git_sha"),
+            json_str_or_none(sconfig.image_info, "image_tag"),
+            json_str_or_none(sconfig.image_info, "build_time"));
+    }
+    // Host identity (entrypoint.sh writes /opt/lucebox-hub/HOST_INFO from
+    // LUCEBOX_HOST_*). Surfaced verbatim under /props.host (schema 4+).
+    // Null on bare-metal dev builds that bypass the container entrypoint.
+    sconfig.host_info = read_host_info();
+    if (!sconfig.host_info.is_null()) {
+        std::fprintf(stderr,
+            "[server] host_info: source=%s os=%s kernel=%s\n",
+            json_str_or_none(sconfig.host_info, "source"),
+            json_str_or_none(sconfig.host_info, "os_pretty"),
+            json_str_or_none(sconfig.host_info, "kernel"));
+    }
+
     // Resolve the Level 2 force-close sequence. Two concepts, both sourced
     // from the model card sidecar (see model_card.h for semantics):
     //   - marker: bytes that signal end-of-thinking to *us* (parsers).
diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
index 604f11a73..e5e9f5a08 100644
--- a/server/src/server/sse_emitter.cpp
+++ b/server/src/server/sse_emitter.cpp
@@ -5,6 +5,7 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cctype>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
@@ -23,7 +24,63 @@ static bool has_request_tools(const json & tools) {
     return tools.is_array() && !tools.empty();
 }
 
-static bool find_tool_start(const std::string & text, size_t & pos) {
+// Cheap pre-filter run before the full parse_tool_calls regex sweep:
+// does `text` contain anything shaped like a `call:<verb>{` opener?
+// Coarse counterpart of re_call_verb_open() in tool_parser.cpp. One
+// left-to-right scan built on std::string::find — no heap allocation,
+// no regex compile — so text without a `call:` substring costs a
+// single failed find().
+//
+// Returns true when some `call:` occurrence is followed by a verb-start
+// char (`[A-Za-z0-9_]`), zero or more further verb chars
+// (`[A-Za-z0-9_.:-]`), optional whitespace, and then `{`. Brace balance
+// is deliberately NOT checked — parse_tool_calls owns that, and an
+// unterminated `call:foo{` only costs one extra regex scan.
+// A rejected candidate does not end the scan: the search resumes right
+// after that `call:` and keeps looking for later occurrences.
+//
+// Nothing is required of the character BEFORE `call:`, so a
+// SentencePiece-style `_call:foo{` is accepted too: find("call:")
+// simply matches inside the `_call:` window.
+static bool looks_like_plain_text_call(const std::string & text) {
+    // Verb-start set: alphanumerics plus underscore.
+    const auto is_verb_start = [](char c) {
+        return c == '_' || std::isalnum((unsigned char)c) != 0;
+    };
+    // Verb-continuation set additionally allows '.', ':' and '-'.
+    const auto is_verb_tail = [&](char c) {
+        return is_verb_start(c) || c == '.' || c == ':' || c == '-';
+    };
+    const size_t n = text.size();
+    for (size_t hit = text.find("call:"); hit != std::string::npos;
+         hit = text.find("call:", hit + 5)) {
+        size_t i = hit + 5;  // first char after "call:"
+        if (i >= n || !is_verb_start(text[i])) continue;
+        // Consume the verb (the start char is part of the tail set).
+        while (i < n && is_verb_tail(text[i])) ++i;
+        // Whitespace may separate verb and brace (mirrors `\s*\{`).
+        while (i < n && std::isspace((unsigned char)text[i])) ++i;
+        if (i < n && text[i] == '{') return true;
+    }
+    return false;
+}
+
+// `is_plain_text` (out) reports whether the matched opener was Pattern B
+// (plain-text `call:<verb>{`) vs Pattern A (XML envelope: `<tool_call>`,
+// `<function=`, `<tool_code>`). Callers use this to drive divergent
+// downstream behavior at emit_finish:
+//   - Pattern A: malformed parse → suppress buffer (XML envelopes are not
+//     user-facing text); .done events expose only the pre-call accumulated
+//     content.
+//   - Pattern B: malformed parse → flush buffer back to accumulated_content_
+//     so the literal `call:foo{...` span stays caller-visible; on success,
+//     the raw call text must also appear in the Responses-format
+//     finalization events (see emit_finish for the responses_streamed_text
+//     handling).
+static bool find_tool_start(const std::string & text, size_t & pos,
+                            bool & is_plain_text) {
+    is_plain_text = false;
+    // Pattern A: XML-like openers (<tool_call>, <function=, <tool_code>).
     size_t idx = text.find('<');
     while (idx != std::string::npos) {
         if (text.compare(idx, sizeof(TOOL_OPEN) - 1, TOOL_OPEN) == 0 ||
@@ -34,6 +91,42 @@ static bool find_tool_start(const std::string & text, size_t & pos) {
         }
         idx = text.find('<', idx + 1);
     }
+
+    // Pattern B: call:<verb>{ opener (Gemma4 plain-text emissions).
+    // Valid sentinels before "call:" mirror tool_parser.cpp Pattern 5:
+    //   start-of-text, whitespace, or one of ,;:()[]{}>
+    // Require at least one alpha char after the colon (the verb start)
+    // to avoid false-positives on English "I'll call: ..." prose.
+    // We do NOT require the closing '{' here — it may arrive in a later
+    // token.  The full parse in parse_tool_calls() at emit_finish() handles
+    // validation; entering TOOL_BUFFER too eagerly costs only a buffered
+    // flush, not incorrect output.
+    static const char CALL_PREFIX[] = "call:";
+    static constexpr size_t CALL_PREFIX_LEN = 5;  // strlen("call:")
+    size_t call_pos = 0;
+    while (call_pos < text.size()) {
+        size_t found = text.find(CALL_PREFIX, call_pos);
+        if (found == std::string::npos) break;
+
+        bool valid_sentinel = (found == 0);
+        if (!valid_sentinel && found > 0) {
+            char prev = text[found - 1];
+            valid_sentinel = (prev == '\n' || prev == '\r' || prev == ' ' || prev == '\t' ||
+                              prev == ',' || prev == ';' || prev == ':' ||
+                              prev == '(' || prev == '[' || prev == '{' ||
+                              prev == ')' || prev == ']' || prev == '}' || prev == '>');
+        }
+
+        if (valid_sentinel) {
+            size_t verb_start = found + CALL_PREFIX_LEN;
+            if (verb_start < text.size() && std::isalpha((unsigned char)text[verb_start])) {
+                pos = found;
+                is_plain_text = true;
+                return true;
+            }
+        }
+        call_pos = found + 1;
+    }
     return false;
 }
 
@@ -76,15 +169,16 @@ SseEmitter::SseEmitter(ApiFormat format,
                        int prompt_tokens,
                        const json & tools,
                        ToolMemory * tool_memory,
-                       const std::vector<std::string> & stop_sequences)
+                       const std::vector<std::string> & stop_sequences,
+                       StreamMode initial_mode)
     : format_(format)
     , request_id_(request_id)
     , model_name_(model_name)
     , prompt_tokens_(prompt_tokens)
     , tools_(tools)
     , tool_memory_(tool_memory)
-    , mode_(StreamMode::CONTENT)
-    , active_kind_("text")
+    , mode_(initial_mode)
+    , active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text")
     , stop_sequences_(stop_sequences)
     , created_at_(unix_timestamp())
     , msg_item_id_(gen_item_id())
@@ -93,6 +187,12 @@ SseEmitter::SseEmitter(ApiFormat format,
     for (const auto & s : stop_sequences_) {
         if (s.size() > stop_holdback_) stop_holdback_ = s.size();
     }
+    // NOTE on `checked_think_prefix_`: we deliberately leave the default
+    // (false) here even when initial_mode == REASONING. The emitter has a
+    // one-time guard in emit_token() that strips a redundantly-emitted
+    // leading `<think>` if the model emits one anyway (model-card /
+    // template-mismatch edge case). Pre-setting the flag to true would
+    // skip that strip and leak the duplicate opener into reasoning_text.
 }
 
 // ─── SSE formatting helpers ─────────────────────────────────────────────
@@ -381,8 +481,9 @@ std::vector<std::string> SseEmitter::emit_token(const std::string & raw_piece) {
         size_t think_idx = window_.find(THINK_OPEN);
         size_t think_close_idx = window_.find(THINK_CLOSE);
         size_t tool_idx = std::string::npos;
+        bool tool_is_plain_text = false;
         bool tool_hit = has_request_tools(tools_) &&
-                        find_tool_start(window_, tool_idx);
+                        find_tool_start(window_, tool_idx, tool_is_plain_text);
 
         struct Hit { size_t pos; int type; };  // type: 0=think, 1=think_close, 2=tool-ish
         std::vector<Hit> hits;
@@ -411,6 +512,7 @@ std::vector<std::string> SseEmitter::emit_token(const std::string & raw_piece) {
                 // Tool-call syntax. Keep the full tag/function text buffered
                 // until finish so the parser can validate it.
                 tool_buffer_ = window_.substr(h.pos);
+                tool_open_is_plain_text_ = tool_is_plain_text;
                 window_.clear();
                 mode_ = StreamMode::TOOL_BUFFER;
             }
@@ -500,6 +602,41 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
     }
     window_.clear();
 
+    // Snapshot of pre-strip text for the Responses finalization events.
+    //
+    // The Responses-format finalization events
+    // (response.output_text.done / content_part.done / completed) must
+    // reflect the full assistant text — including any plain-text
+    // `call:<verb>{...}` span — so a streaming client sees its accumulated
+    // buffer agree with the server's .done payload, and non-streaming
+    // builders that consume .completed get the raw assistant emission.
+    // Meanwhile, accumulated_text() (used by OpenAI Chat / Anthropic final
+    // shapes and non-streaming Responses builders that DO want stripped
+    // text to avoid text+tool_use duplication) continues to return the
+    // post-hoist stripped form.
+    //
+    // Cases:
+    //   - Pattern A (XML envelope, mode==TOOL_BUFFER): tool_buffer_ holds
+    //     protocol artifact text (`<tool_call>...`) that was never streamed
+    //     as a delta. The raw envelope is excluded from
+    //     responses_streamed_text — but any `cleaned_text` the parser
+    //     extracts (text outside the XML span) IS emitted as a content
+    //     delta below, so it must also be folded into the snapshot once
+    //     the parse succeeds (handled inline at the cleaned_text emit).
+    //   - Pattern B (plain-text `call:`, mode==TOOL_BUFFER): tool_buffer_
+    //     holds the raw `call:<verb>{...}` span plus any post-call trailing
+    //     text. Both belong in the visible text snapshot per the PR #329
+    //     review (tests #1126 et al). The snapshot already includes the
+    //     full raw buffer, which is a superset of `cleaned_text`, so we
+    //     don't double-count when the cleaned_text emit happens.
+    //   - mode==CONTENT plain-text hoist branch below: accumulated_content_
+    //     already contains the full pre-strip text; the snapshot taken
+    //     here freezes it before the strip mutates it.
+    std::string responses_streamed_text = accumulated_content_;
+    if (mode_ == StreamMode::TOOL_BUFFER && tool_open_is_plain_text_) {
+        responses_streamed_text += tool_buffer_;
+    }
+
     // Parse tool calls from buffer
     std::string fr = "stop";
     if (mode_ == StreamMode::TOOL_BUFFER && !tool_buffer_.empty()) {
@@ -518,6 +655,17 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
             if (!parsed.cleaned_text.empty()) {
                 accumulated_content_ += parsed.cleaned_text;
                 emit_content_delta(out, parsed.cleaned_text);
+                // Pattern A: snapshot was taken before parsing and only
+                // included pre-buffer accumulated_content_; the
+                // cleaned_text we just streamed must also appear in the
+                // .done/.completed payloads or the client's accumulated
+                // delta buffer will disagree with the server's final
+                // text. Pattern B's snapshot already contains the full
+                // raw buffer (which is a superset of cleaned_text), so
+                // we skip the append there to avoid double-counting.
+                if (!tool_open_is_plain_text_) {
+                    responses_streamed_text += parsed.cleaned_text;
+                }
             }
 
             fr = "tool_calls";
@@ -600,14 +748,170 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
                 break;
             default: break;
             }
+        } else if (tool_open_is_plain_text_) {
+            // Pattern B (plain-text `call:<verb>{...`) failed to parse —
+            // most commonly an unbalanced `{` (the model's args were
+            // truncated, or the verb name is real but the JSON body
+            // never closed). Unlike Pattern A's XML envelopes, the
+            // buffered span here is plain user-facing text. Flushing
+            // it back to accumulated_content_ (and re-emitting as a
+            // content delta) preserves the malformed span as
+            // caller-visible signal that the model produced garbage —
+            // dropping it silently would hide the failure mode.
+            // accumulated_text() then reports the original `call:`
+            // text exactly as the model emitted it.
+            accumulated_content_ += tool_buffer_;
+            emit_content_delta(out, tool_buffer_);
+            tool_buffer_.clear();
         } else {
-            // Tool syntax was detected but no valid call parsed. Do not leak
-            // malformed/incomplete XML back to the user as assistant text.
+            // Pattern A (XML envelope) parse failure. Do not leak
+            // malformed/incomplete `<tool_call>` / `<function=` /
+            // `<tool_code>` markup back to the user as assistant text
+            // — XML envelopes are protocol artifacts, not prose. See
+            // test_emitter_does_not_leak_malformed_tool_xml.
             std::fprintf(stderr,
                 "[server] tool_call parse failed; suppressing buffered tool text "
                 "request_id=%s format=%d bytes=%zu\n",
                 request_id_.c_str(), (int)format_, tool_buffer_.size());
         }
+    } else if (mode_ == StreamMode::CONTENT &&
+               !accumulated_content_.empty() &&
+               has_request_tools(tools_) &&
+               looks_like_plain_text_call(accumulated_content_)) {
+        // CONTENT-mode plain-text tool-call hoist. Gemma4 (and similar
+        // models with no XML tool-call template) emits invocations as
+        // literal text like `call:get_weather{location: "SF"}` or
+        // `_call:get_weather{...}` (SentencePiece artifact). The emitter
+        // stays in CONTENT mode for the whole stream because no
+        // `<tool_call>` / `<function=` / `<tool_code>` opener ever
+        // arrives. Without this branch the response stops with
+        // finish_reason="stop" / stop_reason="end_turn" and no tool_use
+        // block is emitted, breaking forge/agent_recorded scenarios
+        // that depend on structured tool_calls.
+        //
+        // The branch runs parse_tool_calls over accumulated_content_,
+        // hoists any ToolCalls (the allowlist filter `tool_allowed` is
+        // already enforced inside parse_tool_calls' add_call lambda,
+        // so unauthorized verbs never enter parsed.tool_calls), and
+        // replaces accumulated_content_ with cleaned_text so the final
+        // response carries the prose-only text (no duplicate `call:`
+        // span). Streaming clients have already received the raw call
+        // text as content deltas — they get a post-hoc tool_use block
+        // appended at finalize. Text + tool_use is a legal stream in
+        // both OpenAI and Anthropic specs.
+        //
+        // Gated on has_request_tools(tools_) to mirror the
+        // TOOL_BUFFER-entry condition in emit_token() — if the request
+        // didn't declare tools we keep `call:foo{}` as visible content
+        // (see test_emitter_no_tools_keeps_tool_like_text for the
+        // equivalent XML-shape behavior).
+        auto parsed = parse_tool_calls(accumulated_content_, tools_);
+        if (!parsed.tool_calls.empty()) {
+            tool_calls_ = std::move(parsed.tool_calls);
+
+            // Remember for tool memory (mirrors TOOL_BUFFER branch).
+            if (tool_memory_) {
+                std::vector<std::string> ids;
+                for (const auto & tc : tool_calls_) ids.push_back(tc.id);
+                tool_memory_->remember(ids, accumulated_raw_);
+            }
+
+            // Strip matched call spans from the visible content so the
+            // non-streaming final-message shape doesn't duplicate them
+            // as both text AND tool_use. Mirrors
+            // _strip_plain_text_tool_calls in
+            // luce-bench/.../forge.py. Streaming clients already saw
+            // the pre-strip text in earlier deltas; this only affects
+            // the final accumulated_text() consumed by the response
+            // builders in http_server.cpp.
+            //
+            // For the Responses format we capture a separate snapshot
+            // (responses_streamed_text, see top of emit_finish) before
+            // this strip so the streaming finalization events
+            // (.output_text.done / .content_part.done / .completed)
+            // continue to agree with the raw .delta events the client
+            // already received. Without the snapshot the .done payload
+            // would carry the stripped text and a streaming client's
+            // accumulated buffer would disagree with the server's
+            // claimed "done" text.
+            accumulated_content_ = parsed.cleaned_text;
+
+            fr = "tool_calls";
+
+            // Format-specific tool call events — same shape as the
+            // TOOL_BUFFER branch above. Kept inlined (rather than
+            // refactored into a helper) to keep this commit's diff
+            // minimal and side-by-side reviewable against the
+            // upstream block.
+            switch (format_) {
+            case ApiFormat::OPENAI_CHAT: {
+                json tc_list = json::array();
+                for (size_t i = 0; i < tool_calls_.size(); i++) {
+                    tc_list.push_back({
+                        {"index", (int)i},
+                        {"id", tool_calls_[i].id},
+                        {"type", "function"},
+                        {"function", {
+                            {"name", tool_calls_[i].name},
+                            {"arguments", tool_calls_[i].arguments}
+                        }}
+                    });
+                }
+                out.push_back(format_openai_delta({{"tool_calls", tc_list}}));
+                break;
+            }
+            case ApiFormat::ANTHROPIC: {
+                if (!active_kind_.empty()) {
+                    out.push_back(sse_event("content_block_stop",
+                        json({{"type", "content_block_stop"}, {"index", block_index_}}).dump()));
+                    active_kind_.clear();
+                }
+                for (const auto & tc : tool_calls_) {
+                    block_index_++;
+                    json tu_block = {
+                        {"type",  "tool_use"},
+                        {"id",    tc.id},
+                        {"name",  tc.name},
+                        {"input", json::object()}
+                    };
+                    out.push_back(sse_event("content_block_start",
+                        json({{"type", "content_block_start"},
+                              {"index", block_index_},
+                              {"content_block", tu_block}}).dump()));
+                    if (!tc.arguments.empty()) {
+                        out.push_back(sse_event("content_block_delta",
+                            json({{"type",  "content_block_delta"},
+                                  {"index", block_index_},
+                                  {"delta", {{"type",         "input_json_delta"},
+                                             {"partial_json", tc.arguments}}}}).dump()));
+                    }
+                    out.push_back(sse_event("content_block_stop",
+                        json({{"type", "content_block_stop"},
+                              {"index", block_index_}}).dump()));
+                }
+                break;
+            }
+            case ApiFormat::RESPONSES:
+                for (const auto & tc : tool_calls_) {
+                    out.push_back(format_responses_event(
+                        "response.function_call_arguments.delta", {
+                            {"item_id", tc.id}, {"output_index", 0},
+                            {"delta", tc.arguments}
+                        }));
+                    out.push_back(format_responses_event(
+                        "response.function_call_arguments.done", {
+                            {"item_id", tc.id}, {"output_index", 0},
+                            {"arguments", tc.arguments}, {"name", tc.name}
+                        }));
+                }
+                break;
+            default: break;
+            }
+        }
+        // If the looks_like_plain_text_call pre-check matched but
+        // parse_tool_calls returned no calls (all filtered by tool_allowed,
+        // or all args malformed), `fr` stays "stop" and accumulated_content_
+        // is left intact, so the caller sees the original text unchanged.
     }
 
     // Format-specific final events
@@ -670,16 +974,21 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
     }
 
     case ApiFormat::RESPONSES: {
+        // Use the pre-strip snapshot for the streaming finalization
+        // events (.done / .completed) so they agree with the
+        // .delta events that preceded them. See
+        // responses_streamed_text init at the top of emit_finish for
+        // rationale.
         // output_text.done
         out.push_back(format_responses_event("response.output_text.done", {
             {"item_id", msg_item_id_}, {"output_index", 0},
-            {"content_index", 0}, {"text", accumulated_content_}
+            {"content_index", 0}, {"text", responses_streamed_text}
         }));
         // content_part.done
         out.push_back(format_responses_event("response.content_part.done", {
             {"item_id", msg_item_id_}, {"output_index", 0},
             {"content_index", 0},
-            {"part", {{"type", "output_text"}, {"text", accumulated_content_},
+            {"part", {{"type", "output_text"}, {"text", responses_streamed_text},
                       {"annotations", json::array()}}}
         }));
 
@@ -698,7 +1007,7 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
                 {"type", "message"}, {"id", msg_item_id_},
                 {"status", "completed"}, {"role", "assistant"},
                 {"content", json::array({{
-                    {"type", "output_text"}, {"text", accumulated_content_},
+                    {"type", "output_text"}, {"text", responses_streamed_text},
                     {"annotations", json::array()}
                 }})}
             });
@@ -726,7 +1035,7 @@ std::vector<std::string> SseEmitter::emit_finish(int completion_tokens,
             {"created_at", created_at_}, {"status", "completed"},
             {"model", model_name_},
             {"output", final_output},
-            {"output_text", accumulated_content_},
+            {"output_text", responses_streamed_text},
             {"usage", resp_usage}
         };
         out.push_back(format_responses_event("response.completed", {{"response", shell}}));
diff --git a/server/src/server/sse_emitter.h b/server/src/server/sse_emitter.h
index 4710b8d45..79c711e02 100644
--- a/server/src/server/sse_emitter.h
+++ b/server/src/server/sse_emitter.h
@@ -54,13 +54,27 @@ nlohmann::json build_timings_json(const GenTimings & t, int completion_tokens);
 // Manages SSE streaming for a single request.
 class SseEmitter {
 public:
+    // `initial_mode` defaults to CONTENT for backward compatibility. Pass
+    // StreamMode::REASONING when the chat-template prompt suffix pre-opens
+    // a `<think>` block (Qwen3.6 / Laguna enable_thinking path): the
+    // model's first generated token is reasoning, never preceded by an
+    // explicit `<think>` opener in the stream. Without this hint the
+    // emitter would route reasoning text to the content channel and
+    // reasoning_content would stay empty.
+    //
+    // Note: the leading-`<think>` strip guard (`checked_think_prefix_`)
+    // remains active when we start in REASONING mode — if the model
+    // *does* emit a redundant `<think>` opener anyway, the guard still
+    // strips it. Pre-setting checked_think_prefix_=true here would let a
+    // duplicate `<think>` leak into reasoning_text in that edge case.
     SseEmitter(ApiFormat format,
                const std::string & request_id,
                const std::string & model_name,
                int prompt_tokens,
                const json & tools,
                ToolMemory * tool_memory,
-               const std::vector<std::string> & stop_sequences = {});
+               const std::vector<std::string> & stop_sequences = {},
+               StreamMode initial_mode = StreamMode::CONTENT);
 
     // Emit the initial SSE events (role delta, message_start, etc.)
     // Returns the formatted SSE strings to send.
@@ -145,6 +159,21 @@ class SseEmitter {
     StreamMode   mode_;
     std::string  window_;           // holdback buffer
     std::string  tool_buffer_;      // accumulated tool text
+    // True when TOOL_BUFFER was entered via Pattern B (plain-text
+    // `call:<verb>{` opener) rather than Pattern A (XML envelope:
+    // `<tool_call>` / `<function=` / `<tool_code>`). Set at the
+    // CONTENT→TOOL_BUFFER transition in emit_token(). Drives two
+    // divergent behaviors at emit_finish():
+    //   1. malformed-parse branch: Pattern A drops the buffer
+    //      (XML envelopes are not user-facing prose); Pattern B
+    //      flushes the buffer back to accumulated_content_ so the
+    //      literal `call:foo{...` span stays caller-visible.
+    //   2. Responses-format finalization events (.output_text.done /
+    //      .content_part.done / .completed): Pattern B includes the
+    //      raw call span in the streamed-text snapshot used for
+    //      these events, while accumulated_text() continues to
+    //      return the stripped (post-hoist) text.
+    bool         tool_open_is_plain_text_ = false;
     std::string  accumulated_content_;
     std::string  accumulated_raw_;  // all raw text for tool memory
     std::string  reasoning_text_;
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index 1ddbf2d1f..07db573e3 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -1045,6 +1045,268 @@ static void test_emitter_anthropic_thinking_blocks() {
     TEST_ASSERT(!em.accumulated_text().empty());
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// CONTENT-mode plain-text `call:<verb>{...}` tool-call hoist
+//
+// Regression coverage for the finalize-pass branch that runs
+// parse_tool_calls over accumulated_content_ when the stream stayed
+// in CONTENT mode (Gemma4-style `call:foo{...}` plain text — no XML
+// envelope to trip TOOL_BUFFER). Without this branch the emitter
+// returns finish_reason="stop" / stop_reason="end_turn" and never
+// emits a tool_use block, breaking forge / agent_recorded.
+// ═══════════════════════════════════════════════════════════════════════
+
+static void test_emitter_content_mode_plain_text_call_parsed() {
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    // Feed enough text past the holdback so the prose flushes into
+    // accumulated_content_ before finalize. There is no XML opener here;
+    // NOTE(review): emit_token() may still buffer the `call:` span via the
+    // plain-text TOOL_BUFFER entry — confirm. Finalize parses and strips it.
+    em.emit_token("I'll fetch the forecast for you right now: ");
+    em.emit_token("call:get_weather{\"location\": \"SF\"}");
+    em.emit_token(" — let me know what you'd like next.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    if (!em.tool_calls().empty()) {
+        TEST_ASSERT(em.tool_calls()[0].name == "get_weather");
+        auto args = json::parse(em.tool_calls()[0].arguments);
+        TEST_ASSERT(args["location"] == "SF");
+    }
+    // finish_reason should now be "tool_calls" (drives Anthropic
+    // stop_reason="tool_use" downstream).
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_no_tools_skips_plain_text_call() {
+    // Empty tools array: branch is gated on has_request_tools(tools_),
+    // so the call: text remains as visible content.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT);
+    em.emit_start();
+    em.emit_token("I'll fetch the forecast for you right now: ");
+    em.emit_token("call:get_weather{\"location\": \"SF\"}");
+    em.emit_token(" — let me know what you'd like next.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().empty());
+    TEST_ASSERT(em.finish_reason() == "stop");
+    TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos);
+}
+
+static void test_emitter_content_mode_underscore_prefix_call_parsed() {
+    // Regression for the `_call:foo{}` SentencePiece artifact (commit
+    // 004a81b). Parser Pattern 5 sentinel set includes `_`, so the
+    // verb is captured even with the leading underscore. The emitter
+    // wiring must surface it the same way as the bare `call:` form.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("Sure thing, here is the call you asked for: ");
+    em.emit_token("_call:get_weather{\"location\": \"NYC\"}");
+    em.emit_token(" — happy to refine if needed.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    if (!em.tool_calls().empty()) {
+        TEST_ASSERT(em.tool_calls()[0].name == "get_weather");
+        auto args = json::parse(em.tool_calls()[0].arguments);
+        TEST_ASSERT(args["location"] == "NYC");
+    }
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_no_call_substring_skips_parser() {
+    // Pure prose with no `call:` substring: the pre-check
+    // looks_like_plain_text_call short-circuits before parse_tool_calls
+    // runs. Accumulated text is preserved; finish_reason stays "stop".
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("Sorry, I cannot help with that ");
+    em.emit_token("specific question today. Please consult a local guide ");
+    em.emit_token("for the most accurate information.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().empty());
+    TEST_ASSERT(em.finish_reason() == "stop");
+    TEST_ASSERT(em.accumulated_text().find("Sorry") != std::string::npos);
+}
+
+static void test_emitter_content_mode_mixed_calls_multiple() {
+    // Multiple back-to-back calls in the same response. Parser
+    // Pattern 5 sentinel set includes `}` so consecutive invocations
+    // are captured. Verify the emitter hoists both in emission order.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("start. ");
+    em.emit_token("call:get_weather{\"location\": \"A\"} ");
+    em.emit_token("middle. ");
+    em.emit_token("call:get_weather{\"location\": \"B\"} ");
+    em.emit_token("end of the message.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 2);
+    if (em.tool_calls().size() == 2) {
+        auto args0 = json::parse(em.tool_calls()[0].arguments);
+        auto args1 = json::parse(em.tool_calls()[1].arguments);
+        TEST_ASSERT(args0["location"] == "A");
+        TEST_ASSERT(args1["location"] == "B");
+    }
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+    // Codex Q3 residue guard: the stripped accumulated text must NOT
+    // contain `call:` any more.
+    TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos);
+}
+
+static void test_emitter_content_mode_malformed_call_dropped() {
+    // Unbalanced `{`: balanced_braces_end inside parse_tool_calls
+    // returns npos and the match is dropped. Emitter must not crash
+    // and the malformed text remains in accumulated_text (no tool
+    // hoist, no silent strip — caller-visible signal that the model
+    // produced garbage).
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("Here is the call you wanted with malformed args: ");
+    em.emit_token("call:get_weather{location: \"unclosed");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().empty());
+    TEST_ASSERT(em.finish_reason() == "stop");
+    // Malformed call span is left visible (no strip on parse failure).
+    TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos);
+}
+
+static void test_emitter_content_mode_does_not_double_fire_on_tool_call_xml() {
+    // Regression guard: a `<tool_call>` XML envelope must continue to
+    // route through the TOOL_BUFFER path (transition fires inside
+    // emit_token). The new CONTENT-mode branch sits in an `else if`
+    // tied to `mode_ == CONTENT` at emit_finish entry, so it cannot
+    // fire when TOOL_BUFFER handled the call. Verify exactly 1
+    // ToolCall is emitted, not 2.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("<tool_call>\n"
+                  "<function=get_weather>\n"
+                  "<parameter=location>SF</parameter>\n"
+                  "</function>\n"
+                  "</tool_call>");
+    em.emit_finish(20);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_strips_call_span_from_accumulated_text() {
+    // Codex Q3 "residue" hazard guard. After a successful hoist,
+    // accumulated_text() must not contain the `call:` substring (the
+    // matched span is replaced by cleaned_text). Without this guard
+    // the OpenAI Chat / Anthropic / Responses final-message shapes
+    // would echo the call as both literal text AND a tool_use block,
+    // producing UI double-display.
+    auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools());
+    em.emit_start();
+    em.emit_token("prefix prose here. ");
+    em.emit_token("call:get_weather{\"location\": \"SF\"}");
+    em.emit_token(" suffix prose continues to flush the holdback.");
+    em.emit_finish(20);
+
+    TEST_ASSERT(!em.tool_calls().empty());
+    TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("prefix prose") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("suffix prose") != std::string::npos);
+}
+
+static void test_emitter_content_mode_anthropic_emits_tool_use_block() {
+    // Verify the Anthropic format-specific events fire from the new
+    // branch (content_block_stop on the open text block, then
+    // content_block_start tool_use + input_json_delta + content_block_stop).
+    json tools = json::array();
+    tools.push_back({
+        {"name", "get_weather"},
+        {"description", "weather"},
+        {"input_schema", {{"type", "object"},
+                          {"properties", {{"city", {{"type", "string"}}}}}}}
+    });
+    SseEmitter em(ApiFormat::ANTHROPIC, "req_id", "test-model", 10,
+                  tools, nullptr);
+    em.emit_start();
+    em.emit_token("Let me fetch the data you need from the service: ");
+    em.emit_token("call:get_weather{\"city\": \"Tokyo\"}");
+    em.emit_token(" — back in a moment.");
+    auto finish = em.emit_finish(20);
+    std::string s = concat(finish);
+
+    TEST_ASSERT(!em.tool_calls().empty());
+    TEST_ASSERT(s.find("\"type\":\"tool_use\"")          != std::string::npos);
+    TEST_ASSERT(s.find("\"name\":\"get_weather\"")     != std::string::npos);
+    TEST_ASSERT(s.find("\"type\":\"input_json_delta\"") != std::string::npos);
+    TEST_ASSERT(s.find("Tokyo")                          != std::string::npos);
+    TEST_ASSERT(s.find("\"stop_reason\":\"tool_use\"")  != std::string::npos);
+}
+
+static void test_emitter_content_mode_digit_start_verb_parsed() {
+    // Cubic PR #329 review: the looks_like_plain_text_call() pre-check
+    // must accept verbs starting with a digit because the parser's
+    // re_call_verb_open() regex allows them ([A-Za-z0-9_.:\\-]+).
+    // A model emitting `call:2nd_pass{...}` with a digit-led verb
+    // should still trigger the parser sweep.
+    json tools = json::array();
+    tools.push_back({
+        {"name", "2nd_pass"},
+        {"description", "second pass"},
+        {"input_schema", {{"type", "object"},
+                          {"properties", {{"reason", {{"type", "string"}}}}}}}
+    });
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req_id", "test-model", 10,
+                  tools, nullptr);
+    em.emit_start();
+    em.emit_token("call:2nd_pass{reason: \"verify\"}");
+    em.emit_finish(5);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    if (!em.tool_calls().empty()) {
+        TEST_ASSERT(em.tool_calls()[0].name == "2nd_pass");
+    }
+    TEST_ASSERT(em.finish_reason() == "tool_calls");
+}
+
+static void test_emitter_content_mode_responses_done_uses_pre_strip_text() {
+    // Cubic PR #329 review: the Responses-format finalization events
+    // (.output_text.done / .content_part.done / .completed) must
+    // reflect the text that was streamed in earlier .delta events,
+    // not the post-strip text. Otherwise a streaming client's
+    // accumulated buffer (built from .delta events) disagrees with
+    // the server's claimed .done payload.
+    //
+    // accumulated_text() (consumed by non-streaming response builders)
+    // still returns the stripped version so the non-streaming response
+    // shape doesn't carry both text AND tool_use for the same span.
+    json tools = json::array();
+    tools.push_back({
+        {"name", "get_weather"},
+        {"description", "weather"},
+        {"input_schema", {{"type", "object"},
+                          {"properties", {{"city", {{"type", "string"}}}}}}}
+    });
+    SseEmitter em(ApiFormat::RESPONSES, "req_id", "test-model", 10,
+                  tools, nullptr);
+    em.emit_start();
+    em.emit_token("Looking up: ");
+    em.emit_token("call:get_weather{\"city\": \"Tokyo\"}");
+    em.emit_token(" done.");
+    auto finish = em.emit_finish(20);
+    std::string s = concat(finish);
+
+    TEST_ASSERT(em.tool_calls().size() == 1);
+    // Streaming .done events must include the raw call text (matching
+    // what the .delta events already sent).
+    TEST_ASSERT(s.find("response.output_text.done") != std::string::npos);
+    TEST_ASSERT(s.find("call:get_weather") != std::string::npos);
+    // Non-streaming accessor returns the stripped text.
+    TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("Looking up:") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("done.") != std::string::npos);
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // Stop sequences tests
 // ═══════════════════════════════════════════════════════════════════════
@@ -1232,7 +1494,7 @@ static void test_pflash_config_defaults() {
     ServerConfig cfg;
     TEST_ASSERT(cfg.pflash_mode == ServerConfig::PflashMode::OFF);
     TEST_ASSERT(cfg.pflash_threshold == 32000);
-    TEST_ASSERT(cfg.pflash_keep_ratio > 0.04f && cfg.pflash_keep_ratio < 0.06f);
+    TEST_ASSERT(cfg.pflash_keep_ratio > 0.09f && cfg.pflash_keep_ratio < 0.11f);
     TEST_ASSERT(cfg.pflash_drafter_path.empty());
     TEST_ASSERT(!cfg.pflash_skip_park);
     TEST_ASSERT(cfg.draft_residency == DraftResidencyPolicy::Auto);
@@ -1541,11 +1803,11 @@ static void test_jinja_render_basic() {
         {"system", "you are helpful", ""},
         {"user",   "hi",              ""},
     };
-    std::string out = render_chat_template_jinja(
+    auto out = render_chat_template_jinja(
         MINI_JINJA_TEMPLATE, msgs,
         /*bos=*/"<s>", /*eos=*/"</s>",
         /*add_gen=*/true, /*think=*/false,
-        /*tools=*/"");
+        /*tools=*/"").text;
     TEST_ASSERT(out.find("<|system|>you are helpful") != std::string::npos);
     TEST_ASSERT(out.find("<|user|>hi")               != std::string::npos);
     TEST_ASSERT(out.find("<|assistant|>")            != std::string::npos);
@@ -1553,9 +1815,9 @@ static void test_jinja_render_basic() {
 
 static void test_jinja_render_no_gen_prompt() {
     std::vector<ChatMessage> msgs = {{"user", "ping", ""}};
-    std::string out = render_chat_template_jinja(
+    auto out = render_chat_template_jinja(
         MINI_JINJA_TEMPLATE, msgs, "", "",
-        /*add_gen=*/false, /*think=*/false, "");
+        /*add_gen=*/false, /*think=*/false, "").text;
     TEST_ASSERT(out.find("<|user|>ping") != std::string::npos);
     TEST_ASSERT(out.find("<|assistant|>") == std::string::npos);
 }
@@ -1567,8 +1829,8 @@ static void test_jinja_render_tools_injected() {
         "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}";
     std::vector<ChatMessage> msgs = {{"user", "?", ""}};
     std::string tools = R"([{"name":"my_tool","description":"test"}])";
-    std::string out = render_chat_template_jinja(
-        TPL, msgs, "", "", false, false, tools);
+    auto out = render_chat_template_jinja(
+        TPL, msgs, "", "", false, false, tools).text;
     TEST_ASSERT(out.find("TOOLS_PRESENT:my_tool") != std::string::npos);
 }
 
@@ -1577,8 +1839,8 @@ static void test_jinja_render_empty_tools_skipped() {
     static const char TPL[] =
         "{%- if tools -%}TOOLS_PRESENT{%- else -%}NO_TOOLS{%- endif -%}";
     std::vector<ChatMessage> msgs = {{"user", "?", ""}};
-    std::string out = render_chat_template_jinja(
-        TPL, msgs, "", "", false, false, "[]");
+    auto out = render_chat_template_jinja(
+        TPL, msgs, "", "", false, false, "[]").text;
     TEST_ASSERT(out.find("NO_TOOLS")        != std::string::npos);
     TEST_ASSERT(out.find("TOOLS_PRESENT")   == std::string::npos);
 }
@@ -1587,8 +1849,8 @@ static void test_jinja_render_bos_eos_threaded() {
     // {{ bos_token }} and {{ eos_token }} must reach the template.
     static const char TPL[] = "{{ bos_token }}HI{{ eos_token }}";
     std::vector<ChatMessage> msgs;
-    std::string out = render_chat_template_jinja(
-        TPL, msgs, "<BOS>", "<EOS>", false, false, "");
+    auto out = render_chat_template_jinja(
+        TPL, msgs, "<BOS>", "<EOS>", false, false, "").text;
     TEST_ASSERT(out == "<BOS>HI<EOS>");
 }
 
@@ -1616,6 +1878,345 @@ static void test_jinja_render_bad_tools_json_throws() {
     TEST_ASSERT(threw);
 }
 
+// ─── started_in_thinking provenance ─────────────────────────────────────
+//
+// Regression suite for the Qwen3.6 / Laguna think-mode channel-routing
+// bug: the rendered prompt suffix pre-opens `<think>` so the model
+// starts emitting reasoning tokens with no explicit opener. Callers
+// route PromptRenderResult.started_in_thinking → SseEmitter initial
+// mode so reasoning text lands in reasoning_content, not content.
+
+static void test_chat_template_qwen3_enable_thinking_pre_opens() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT(result.started_in_thinking);  // prompt leaves <think> open
+    // Sanity: rendered suffix ends with `<think>\n` per the Qwen3.6
+    // chat_template.jinja's enable_thinking branch.
+    TEST_ASSERT(result.text.size() >= 8);  // 8 = length of the expected suffix; keeps size() - 8 from wrapping
+    TEST_ASSERT(result.text.compare(result.text.size() - 8, 8, "<think>\n") == 0);
+}
+
+static void test_chat_template_qwen3_disable_thinking_does_not_pre_open() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/false,
+                                       /*tools=*/"");
+    TEST_ASSERT(!result.started_in_thinking);
+    // The disabled branch emits `<think>\n\n</think>\n\n` — closes
+    // immediately, so the reasoning channel is NOT left open.
+    TEST_ASSERT(result.text.find("</think>") != std::string::npos);  // closer is part of the rendered prompt
+}
+
+static void test_chat_template_qwen3_no_gen_prompt_does_not_pre_open() {
+    // Without add_generation_prompt the assistant turn isn't appended
+    // and there's nothing to pre-open.
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/false,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT(!result.started_in_thinking);  // enable_thinking=true alone is not enough
+}
+
+static void test_chat_template_laguna_enable_thinking_pre_opens() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::LAGUNA,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT(result.started_in_thinking);
+    TEST_ASSERT(result.text.size() >= 7);  // 7 = length of the expected suffix; keeps size() - 7 from wrapping
+    TEST_ASSERT(result.text.compare(result.text.size() - 7, 7, "<think>") == 0);  // bare opener, no trailing newline expected here
+}
+
+static void test_chat_template_laguna_disable_thinking_does_not_pre_open() {
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto result = render_chat_template(msgs, ChatFormat::LAGUNA,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/false,
+                                       /*tools=*/"");
+    TEST_ASSERT(!result.started_in_thinking);  // add_gen=true alone must not pre-open
+}
+
+static void test_chat_template_gemma4_does_not_pre_open() {
+    // Gemma4's reasoning channel is opened by the model's `<|channel>`
+    // token (which http_server forwards into the emitter as `<think>`).
+    // The prompt itself never pre-opens `<think>` regardless of
+    // enable_thinking, so started_in_thinking must stay false.
+    std::vector<ChatMessage> msgs = {{"user", "hi", ""}};
+    auto enabled = render_chat_template(msgs, ChatFormat::GEMMA4,
+                                        /*add_gen=*/true,
+                                        /*enable_thinking=*/true,
+                                        /*tools=*/"");
+    TEST_ASSERT(!enabled.started_in_thinking);  // enable_thinking=true case
+    auto disabled = render_chat_template(msgs, ChatFormat::GEMMA4,
+                                         /*add_gen=*/true,
+                                         /*enable_thinking=*/false,
+                                         /*tools=*/"");
+    TEST_ASSERT(!disabled.started_in_thinking);  // enable_thinking=false case
+}
+
+// Jinja path: suffix-sniff detection. The renderer should set
+// started_in_thinking=true when the rendered prompt ends with `<think>`
+// (optionally followed by whitespace) AND add_generation_prompt is set.
+static void test_jinja_render_suffix_sniff_sets_started_in_thinking() {
+    static const char TPL[] =
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+        "<|assistant|>{%- if enable_thinking -%}<think>\n{%- endif -%}"
+        "{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "?", ""}};
+    auto r = render_chat_template_jinja(
+        TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, "");  // both template gates on
+    TEST_ASSERT(r.started_in_thinking);  // template appended the <think> opener last
+}
+
+static void test_jinja_render_suffix_sniff_negative() {
+    // Rendered prompt ends with `<|assistant|>`, not `<think>` →
+    // started_in_thinking=false even with enable_thinking=true.
+    static const char TPL[] =
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"
+        "{%- if add_generation_prompt -%}<|assistant|>{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "?", ""}};
+    auto r = render_chat_template_jinja(
+        TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, "");
+    TEST_ASSERT(!r.started_in_thinking);
+}
+
+// Jinja path: the sniff is the source of truth for started_in_thinking,
+// not the enable_thinking flag. If a template hard-codes `<think>` despite
+// enable_thinking=false (custom template, model-card mismatch, etc.) we
+// still need to route the model's first tokens to the reasoning channel
+// or they'll leak into content. The renderer logs a [WARN] in that case
+// (verified manually; we don't capture stderr in this test).
+static void test_jinja_render_suffix_sniff_overrides_enable_thinking_flag() {
+    // Template hardcodes `<think>` regardless of enable_thinking.
+    static const char TPL[] =
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"
+        "{%- if add_generation_prompt -%}<|assistant|><think>\n{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "?", ""}};
+    auto r = render_chat_template_jinja(
+        TPL, msgs, "", "", /*add_gen=*/true, /*think=*/false, "");  // flag off on purpose
+    // Even though enable_thinking=false, the rendered prompt ends with
+    // `<think>` so started_in_thinking must be true to avoid routing
+    // reasoning tokens into the content channel.
+    TEST_ASSERT(r.started_in_thinking);
+}
+
+// Jinja path: sniff still requires add_generation_prompt — without it the
+// rendered prompt is a transcript, not a continuation, and any embedded
+// `<think>` in past turns shouldn't claim the channel is pre-opened.
+static void test_jinja_render_suffix_sniff_requires_add_generation_prompt() {
+    static const char TPL[] =
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"
+        "{%- if add_generation_prompt -%}<|assistant|><think>\n{%- endif -%}";
+    std::vector<ChatMessage> msgs = {{"user", "? <think>", ""}};  // transcript itself ends in <think>, so the sniff alone would match
+    auto r = render_chat_template_jinja(
+        TPL, msgs, "", "", /*add_gen=*/false, /*think=*/true, "");
+    TEST_ASSERT(!r.started_in_thinking);  // add_gen=false: a transcript, not a continuation
+}
+
+// ─── SseEmitter initial_mode=REASONING ──────────────────────────────────
+//
+// Regression: when constructed with initial_mode=REASONING (the
+// Qwen3.6/Laguna enable_thinking path), the emitter must route the
+// model's first generated tokens to reasoning_content until a natural
+// `</think>` is seen, even though no explicit `<think>` opener appears
+// in the stream.
+
+static void test_emitter_initial_mode_reasoning_routes_to_reasoning_content() {
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req-1", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);  // initial mode under test
+    em.emit_start();
+
+    // Model emits reasoning tokens directly with no leading `<think>`
+    // (because the prompt suffix already opened the channel), then
+    // closes with `</think>` and emits the answer.
+    em.emit_token("alpha ");
+    em.emit_token("beta ");
+    em.emit_token("</think>\n\nAnswer: 4");  // closer and answer arrive in one token
+    em.emit_finish(4);
+
+    TEST_ASSERT(em.reasoning_text().find("alpha")  != std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("beta")   != std::string::npos);
+    // No spurious <think> tag leaked into reasoning or content.
+    TEST_ASSERT(em.reasoning_text().find("<think>")  == std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("</think>") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("<think>")  == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("</think>") == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("Answer: 4") != std::string::npos);  // post-closer text lands in content
+}
+
+static void test_emitter_initial_mode_reasoning_unclosed_stays_reasoning() {
+    // No </think> close — everything stays in reasoning_content, content
+    // stays empty. Matches parse_reasoning(started_in_thinking=true)
+    // behavior for the non-streaming path.
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req-2", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);  // initial mode under test
+    em.emit_start();
+    em.emit_token("still thinking");
+    em.emit_token(" more thinking");
+    em.emit_finish(3);  // finish without ever emitting </think>
+
+    TEST_ASSERT(em.reasoning_text().find("still thinking") != std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("more thinking")  != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().empty());  // nothing reached the content channel
+}
+
+static void test_emitter_initial_mode_reasoning_strips_redundant_think_opener() {
+    // Edge case: prompt pre-opened <think>, but the model also emits a
+    // leading <think> anyway (template/model-card mismatch). The
+    // emitter's strip guard (checked_think_prefix_) must still trip
+    // because we deliberately leave it at its default (false) in the
+    // constructor — otherwise the duplicate opener would leak into
+    // reasoning_text.
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "req-3", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);
+    em.emit_start();
+    em.emit_token("<think>actual reasoning</think>answer");  // opener, reasoning, closer and answer in one token
+    em.emit_finish(3);
+
+    TEST_ASSERT(em.reasoning_text().find("<think>") == std::string::npos);  // duplicate opener was dropped
+    TEST_ASSERT(em.reasoning_text().find("actual reasoning") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("answer") != std::string::npos);
+}
+
+static void test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking() {
+    // Anthropic format: when starting in REASONING mode, the very first
+    // content_block_start must be `thinking`, not `text`. Otherwise the
+    // emitter would open a text block, then have to stop+restart it as
+    // thinking on the first reasoning delta — wasteful and visible to
+    // SDK clients as a spurious empty text block.
+    SseEmitter em(ApiFormat::ANTHROPIC, "req-4", "test-model", 10,
+                  json::array(), nullptr,
+                  /*stops=*/{},
+                  StreamMode::REASONING);
+    auto start = em.emit_start();
+    std::string all;
+    for (const auto & c : start) all += c;  // flatten the start chunks into one searchable string
+    // First content block must be a thinking block. nlohmann::json sorts
+    // keys alphabetically on dump(), so the inner block serializes as
+    // `{"thinking":"","type":"thinking"}` (NOT type-first). Assert on
+    // that exact `"thinking":"","type":"thinking"` key pair, which only
+    // appears in the thinking-kind serialization.
+    TEST_ASSERT(all.find("\"thinking\":\"\",\"type\":\"thinking\"")
+                != std::string::npos);
+    // And the initial text-block opener must NOT appear (regression: if
+    // active_kind_ defaulted to "text", emit_start would have emitted
+    // `{"text":"","type":"text"}` here instead).
+    TEST_ASSERT(all.find("\"text\":\"\",\"type\":\"text\"")
+                == std::string::npos);
+}
+
+// ─── Integration: render_chat_template → SseEmitter wiring ──────────────
+//
+// The original bug was an integration gap: render_chat_template correctly
+// reported started_in_thinking=true, but no caller routed it into the
+// SseEmitter's initial_mode, so reasoning text leaked into content and
+// reasoning_content stayed empty. Each end of the wire has its own unit
+// tests above; these chain the two ends so a future refactor that drops
+// the propagation cannot pass without an assertion failure here.
+//
+// The body mirrors the production wiring in
+// server/src/server/http_server.cpp (the `started_in_thinking →
+// initial_mode → SseEmitter` chain). Keep these in sync if that wiring
+// moves.
+
+static void test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning() {
+    std::vector<ChatMessage> msgs = {{"user", "What is 2+2?", ""}};
+    auto render = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT_MSG(render.started_in_thinking,
+        "renderer end of wire: QWEN3 enable_thinking must pre-open <think>");
+
+    const StreamMode initial_mode = render.started_in_thinking  // the propagation step this test guards
+        ? StreamMode::REASONING : StreamMode::CONTENT;
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-q", "test-model", 10,
+                  json::array(), nullptr, /*stops=*/{}, initial_mode);
+    em.emit_start();
+    em.emit_token("Let me compute. ");  // reasoning arrives with no <think> opener
+    em.emit_token("2+2 equals 4.");
+    em.emit_token("</think>\n\nThe answer is 4.");
+    em.emit_finish(5);
+
+    TEST_ASSERT_MSG(!em.reasoning_text().empty(),
+        "wiring broken: reasoning_content empty despite started_in_thinking=true");
+    TEST_ASSERT(em.reasoning_text().find("Let me compute")    != std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("<think>")           == std::string::npos);
+    TEST_ASSERT(em.reasoning_text().find("</think>")          == std::string::npos);
+    TEST_ASSERT_MSG(em.accumulated_text().find("Let me compute") == std::string::npos,
+        "wiring broken: reasoning text leaked into content channel");
+    TEST_ASSERT(em.accumulated_text().find("The answer is 4") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("<think>")         == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("</think>")        == std::string::npos);
+}
+
+static void test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning() {
+    std::vector<ChatMessage> msgs = {{"user", "Solve 7*8.", ""}};
+    auto render = render_chat_template(msgs, ChatFormat::LAGUNA,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/true,
+                                       /*tools=*/"");
+    TEST_ASSERT_MSG(render.started_in_thinking,
+        "renderer end of wire: LAGUNA enable_thinking must pre-open <think>");
+
+    const StreamMode initial_mode = render.started_in_thinking  // the propagation step this test guards
+        ? StreamMode::REASONING : StreamMode::CONTENT;
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-l", "test-model", 10,
+                  json::array(), nullptr, /*stops=*/{}, initial_mode);
+    em.emit_start();
+    em.emit_token("Working through it: ");  // reasoning arrives with no <think> opener
+    em.emit_token("7*8 = 56.");
+    em.emit_token("</think>\n\n56.");
+    em.emit_finish(4);
+
+    TEST_ASSERT_MSG(!em.reasoning_text().empty(),
+        "wiring broken: reasoning_content empty despite started_in_thinking=true");
+    TEST_ASSERT(em.reasoning_text().find("Working through it") != std::string::npos);
+    TEST_ASSERT_MSG(em.accumulated_text().find("Working through it") == std::string::npos,
+        "wiring broken: reasoning text leaked into content channel");
+    TEST_ASSERT(em.accumulated_text().find("56.") != std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("<think>")  == std::string::npos);
+    TEST_ASSERT(em.accumulated_text().find("</think>") == std::string::npos);
+}
+
+static void test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content() {
+    // Inverse direction: when enable_thinking=false the renderer must not
+    // pre-open and the emitter must start in CONTENT, so the model's
+    // tokens land in content from the first byte. Guards against the
+    // opposite regression of unconditionally starting in REASONING.
+    std::vector<ChatMessage> msgs = {{"user", "Hi.", ""}};
+    auto render = render_chat_template(msgs, ChatFormat::QWEN3,
+                                       /*add_gen=*/true,
+                                       /*enable_thinking=*/false,
+                                       /*tools=*/"");
+    TEST_ASSERT(!render.started_in_thinking);
+
+    const StreamMode initial_mode = render.started_in_thinking  // resolves to CONTENT given the assert above
+        ? StreamMode::REASONING : StreamMode::CONTENT;
+    SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-n", "test-model", 10,
+                  json::array(), nullptr, /*stops=*/{}, initial_mode);
+    em.emit_start();
+    em.emit_token("Hello there.");
+    em.emit_finish(2);
+
+    TEST_ASSERT(em.reasoning_text().empty());  // nothing was routed to reasoning
+    TEST_ASSERT(em.accumulated_text().find("Hello there") != std::string::npos);
+}
+
+
 static void test_normalize_responses_tool_followup_messages() {
     ToolMemory tool_memory;
     const std::string call_id = "call_exec_001";
@@ -1795,7 +2396,6 @@ struct MockLayerSplitAdapter : LayerSplitAdapter {
     std::vector<int32_t> emitted_tokens;
     bool dflash_enabled = false;
     bool dflash_called = false;
-    bool sampling_enabled = false;
     int shutdown_calls = 0;
     ModelBackend::CompressRequest last_compress_req;
     int prefill_chunk = 0;
@@ -1832,7 +2432,6 @@ struct MockLayerSplitAdapter : LayerSplitAdapter {
         return true;
     }
     bool can_dflash_decode() const override { return dflash_enabled; }
-    bool supports_cpu_sampling() const override { return sampling_enabled; }
     bool decode_dflash(const std::vector<int32_t> & prompt, int base_pos,
                        int last_tok, int n_gen, std::vector<int32_t> & out_tokens,
                        const DaemonIO & io, float & accept_rate_out) override {
@@ -3229,8 +3828,11 @@ static void test_props_budget_envelope_shape() {
     TEST_ASSERT(body["model_card"]["max_tokens"].get<int>() == 32768);
     TEST_ASSERT(be["default_max_tokens"].get<int>() == 16000);
 
-    // Sanity: props_schema bumped to 2 (breaking change).
-    TEST_ASSERT(body["server"]["props_schema"].get<int>() == 2);
+    // Sanity: props_schema bumped to 4 (schema 4 added the top-level
+    // `host` block over schema 3; schema 3 over 2 added `build` and
+    // `model.target`/`model.draft`. All additive but the bump
+    // propagates so consumers can negotiate.)
+    TEST_ASSERT(body["server"]["props_schema"].get<int>() == 4);
 }
 
 // ─── /props.runtime captures full config (§4.16) ──────────────────────
@@ -3280,6 +3882,257 @@ static void test_props_runtime_shape() {
     TEST_ASSERT(body["runtime"]["draft_device"].is_null());
 }
 
+// ─── /props.build block (schema 3) ────────────────────────────────────
+// The new structured replacement for the single-string `build_info`.
+// Always emitted; git_sha/image_tag/image_digest/build_time are null when
+// not running in a Docker image (no /opt/lucebox-hub/IMAGE_INFO baked in).
+static void test_props_build_block_shape_no_image_info() {
+    ServerConfig cfg = make_props_config_with_sidecar(json{
+        {"name", "Qwen3.6 27B"},
+        {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"},
+        {"verified_at", "2026-05-23"},
+        {"max_tokens", 32768},
+    });
+    // image_info default = null → image_* fields stay null.
+    Tokenizer    tok;
+    PrefixCache  pc(0, tok);
+    ToolMemory   tm;
+    json body = build_props_body(cfg, pc, tm);
+
+    TEST_ASSERT(body.contains("build"));
+    const json & b = body["build"];
+    // Stable identity always populated.
+    TEST_ASSERT(b["server_name"].get<std::string>() == "luce-dflash");
+    TEST_ASSERT(b["server_version"].is_string());  // only the type is pinned, not the value
+    TEST_ASSERT(b["props_schema"].get<int>() == 4);
+    // Image-baked fields null in the no-IMAGE_INFO case.
+    TEST_ASSERT(b["git_sha"].is_null());
+    TEST_ASSERT(b["image_tag"].is_null());
+    TEST_ASSERT(b["image_digest"].is_null());
+    TEST_ASSERT(b["build_time"].is_null());
+
+    // Legacy build_info still present for back-compat readers.
+    TEST_ASSERT(body.contains("build_info"));
+    TEST_ASSERT(body["build_info"].get<std::string>().find("props_schema=4")  // substring match on the legacy string
+                != std::string::npos);
+}
+
+// ─── /props.host (schema 4) ───────────────────────────────────────────
+// Verbatim pass-through of the JSON written by entrypoint.sh to
+// /opt/lucebox-hub/HOST_INFO. Surfaces /props.host so luce-bench's
+// snapshot subcommand can capture the rig identity alongside the run.
+// `null` when ServerConfig.host_info was not populated (bare-metal
+// dev builds that bypass the container entrypoint).
+static void test_props_host_block_present_when_populated() {
+    ServerConfig cfg = make_props_config_with_sidecar(json{
+        {"name", "Qwen3.6 27B"},
+        {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"},
+        {"verified_at", "2026-05-23"},
+        {"max_tokens", 32768},
+    });
+    cfg.host_info = json::object({  // sample host block fed straight into the config
+        {"os_pretty",          "Ubuntu 22.04.3 LTS"},
+        {"kernel",             "6.6.87.2-microsoft-standard-WSL2"},
+        {"wsl_version",        "wsl2"},
+        {"docker_version",     "29.1.3"},
+        {"nvidia_driver",      "596.36"},
+        {"nvidia_ctk_version", "1.16.2"},
+        {"cpu_model",          "Intel(R) Core(TM) Ultra 9 275HX"},
+        {"nproc",              24},
+        {"ram_gb",             64},
+        {"gpus", json::array({
+            json::object({
+                {"index",         0},
+                {"uuid",          "GPU-abc"},
+                {"pci_bus_id",    "00000000:01:00.0"},
+                {"name",          "NVIDIA GeForce RTX 5090 Laptop GPU"},
+                {"sm",            "12.0"},
+                {"vram_gb",       24},
+                {"power_limit_w", 175},
+            }),
+        })},
+        {"cuda_visible_devices", "0"},
+        {"source",               "lucebox.sh"},
+        {"collector",            "lucebox.sh"},
+        {"collected_at",         "2026-05-28T20:31:42Z"},
+    });
+    Tokenizer    tok;
+    PrefixCache  pc(0, tok);
+    ToolMemory   tm;
+    json body = build_props_body(cfg, pc, tm);
+
+    TEST_ASSERT(body.contains("host"));
+    TEST_ASSERT(!body["host"].is_null());
+    const json & h = body["host"];
+    TEST_ASSERT(h["os_pretty"].get<std::string>()    == "Ubuntu 22.04.3 LTS");  // spot-checks only; not every key is compared
+    TEST_ASSERT(h["wsl_version"].get<std::string>()  == "wsl2");
+    TEST_ASSERT(h["nvidia_ctk_version"].get<std::string>() == "1.16.2");
+    TEST_ASSERT(h["source"].get<std::string>()       == "lucebox.sh");
+    TEST_ASSERT(h["gpus"].is_array());
+    TEST_ASSERT(h["gpus"].size() == 1);  // one GPU entry was supplied above
+    TEST_ASSERT(h["gpus"][0]["name"].get<std::string>()
+                == "NVIDIA GeForce RTX 5090 Laptop GPU");
+    TEST_ASSERT(h["gpus"][0]["vram_gb"].get<int>()   == 24);
+}
+
+static void test_props_host_block_null_when_missing() {
+    // ServerConfig.host_info default = null → /props.host emits JSON null.
+    ServerConfig cfg = make_props_config_with_sidecar(json{
+        {"name", "Qwen3.6 27B"},
+        {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"},
+        {"verified_at", "2026-05-23"},
+        {"max_tokens", 32768},
+    });
+    // cfg.host_info stays at its default nullptr.
+    Tokenizer    tok;
+    PrefixCache  pc(0, tok);
+    ToolMemory   tm;
+    json body = build_props_body(cfg, pc, tm);
+
+    TEST_ASSERT(body.contains("host"));  // key is present, not omitted
+    TEST_ASSERT(body["host"].is_null());  // and its value is JSON null
+    // /props.server.props_schema reflects the schema-4 bump regardless.
+    TEST_ASSERT(body["server"]["props_schema"].get<int>() == 4);
+}
+
+static void test_props_build_block_with_image_info() {
+    ServerConfig cfg = make_props_config_with_sidecar(json{
+        {"name", "Qwen3.6 27B"},
+        {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"},
+        {"verified_at", "2026-05-23"},
+        {"max_tokens", 32768},
+    });
+    cfg.image_info = json::object({  // note: no image_digest key is supplied
+        {"git_sha",    "6d12378"},
+        {"image_tag",  "sha-6d12378-cuda12"},
+        {"build_time", "2026-05-28T13:43:57Z"},
+    });
+    Tokenizer    tok;
+    PrefixCache  pc(0, tok);
+    ToolMemory   tm;
+    json body = build_props_body(cfg, pc, tm);
+
+    const json & b = body["build"];
+    TEST_ASSERT(b["git_sha"].get<std::string>()    == "6d12378");  // each supplied key must come back unchanged
+    TEST_ASSERT(b["image_tag"].get<std::string>()  == "sha-6d12378-cuda12");
+    TEST_ASSERT(b["build_time"].get<std::string>() == "2026-05-28T13:43:57Z");
+    // image_digest is reserved for external population; still null.
+    TEST_ASSERT(b["image_digest"].is_null());
+}
+
+// ─── /props.model.target + /props.model.draft (schema 3) ──────────────
+// Verbatim GGUF identity surfaced under model.target / model.draft.
+// `draft` is null when no draft GGUF is loaded; the legacy
+// `model.draft_path` string stays alongside for back-compat readers.
+static void test_props_model_target_draft_shape() {
+    ServerConfig cfg = make_props_config_with_sidecar(json{
+        {"name", "Qwen3.6 27B"},
+        {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"},
+        {"verified_at", "2026-05-23"},
+        {"max_tokens", 32768},
+    });
+    cfg.draft_path = "/opt/models/dflash-draft-3.6-q4_k_m.gguf";
+    cfg.target_gguf = json::object({
+        {"path",       "/opt/models/Qwen3.6-27B-Q4_K_M.gguf"},
+        {"size_bytes", int64_t(17134510080)},  // exceeds 32-bit range, hence int64_t
+        {"sha256",     "abc123def456" + std::string(52, '0')},  // 12 + 52 = 64 characters
+        {"gguf", {
+            {"general.architecture",         "qwen35"},
+            {"general.name",                 "Qwen3.6-27B"},
+            {"general.file_type",            15},
+            {"general.file_type_name",       "Q4_K_M"},
+            {"general.quantization_version", 2},
+            {"block_count",                  64},
+            {"embedding_length",             5120},
+            {"context_length",               65536},
+            {"vocab_size",                   152064},
+        }},
+    });
+    cfg.draft_gguf = json::object({
+        {"path",       "/opt/models/dflash-draft-3.6-q4_k_m.gguf"},
+        {"size_bytes", int64_t(425000000)},
+        {"sha256",     "deadbeef" + std::string(56, '0')},  // 8 + 56 = 64 characters
+        {"gguf", {
+            {"general.architecture",         "qwen3"},
+            {"general.name",                 "Qwen3-0.6B-DFlash-draft"},
+            {"general.file_type",            15},
+            {"general.file_type_name",       "Q4_K_M"},
+            {"general.quantization_version", 2},
+            {"block_count",                  28},
+            {"embedding_length",             1024},
+            {"context_length",               32768},
+            {"vocab_size",                   152064},
+        }},
+    });
+
+    Tokenizer    tok;
+    PrefixCache  pc(0, tok);
+    ToolMemory   tm;
+    json body = build_props_body(cfg, pc, tm);
+
+    const json & m = body["model"];
+
+    // arch + back-compat fields preserved.
+    TEST_ASSERT(m["arch"].get<std::string>() == "qwen35");
+    TEST_ASSERT(m["alias"].get<std::string>() == cfg.model_name);
+    TEST_ASSERT(m["draft_path"].get<std::string>() ==
+                "/opt/models/dflash-draft-3.6-q4_k_m.gguf");
+
+    // target: required, never null when GGUF is loaded.
+    TEST_ASSERT(!m["target"].is_null());
+    const json & tgt = m["target"];
+    TEST_ASSERT(tgt["path"].get<std::string>() ==
+                "/opt/models/Qwen3.6-27B-Q4_K_M.gguf");
+    TEST_ASSERT(tgt["size_bytes"].get<int64_t>() == int64_t(17134510080));
+    TEST_ASSERT(tgt["sha256"].get<std::string>().size() == 64);  // length check only, not the digest value
+    TEST_ASSERT(tgt["gguf"]["general.architecture"].get<std::string>() == "qwen35");
+    TEST_ASSERT(tgt["gguf"]["general.file_type_name"].get<std::string>() == "Q4_K_M");
+    TEST_ASSERT(tgt["gguf"]["context_length"].get<int>() == 65536);
+    TEST_ASSERT(tgt["gguf"]["vocab_size"].get<int>() == 152064);
+
+    // draft: required key, populated when --draft was passed.
+    TEST_ASSERT(!m["draft"].is_null());
+    TEST_ASSERT(m["draft"]["path"].get<std::string>() ==
+                "/opt/models/dflash-draft-3.6-q4_k_m.gguf");
+    TEST_ASSERT(m["draft"]["gguf"]["general.architecture"].get<std::string>() == "qwen3");
+}
+
+static void test_props_model_draft_null_when_target_only() {
+    // laguna / qwen3.6-moe configs run target-only: model.draft is JSON
+    // null (NOT omitted), so consumers can distinguish "feature absent"
+    // from "field not in this schema version".
+    ServerConfig cfg = make_props_config_with_sidecar(json{
+        {"name", "qwen3.6-moe-test"},
+        {"source", "https://huggingface.co/test"},
+        {"verified_at", "2026-05-23"},
+        {"max_tokens", 32768},
+    });
+    cfg.draft_path = "";              // no --draft
+    cfg.target_gguf = json::object({
+        {"path",       "/opt/models/qwen3.6-moe.gguf"},
+        {"size_bytes", int64_t(18000000000)},
+        {"sha256",     nullptr},  // digest deliberately left as JSON null
+        {"gguf", {
+            {"general.architecture", "qwen35moe"},
+            {"general.name",         "Qwen3.6-35B-A3B"},
+        }},
+    });
+    // draft_gguf left at default (null).
+
+    Tokenizer    tok;
+    PrefixCache  pc(0, tok);
+    ToolMemory   tm;
+    json body = build_props_body(cfg, pc, tm);
+
+    TEST_ASSERT(body["model"].contains("draft"));  // key present...
+    TEST_ASSERT(body["model"]["draft"].is_null());  // ...with a JSON null value
+    TEST_ASSERT(body["model"]["draft_path"].is_null());  // legacy field too
+    // target still populated.
+    TEST_ASSERT(!body["model"]["target"].is_null());
+    TEST_ASSERT(body["model"]["target"]["gguf"]["general.architecture"]
+                    .get<std::string>() == "qwen35moe");
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // usage.timings — per-request prefill / decode wall-clock breakdown
 // surfaced under usage.timings (spec §6.3). Tests cover all three
@@ -3903,6 +4756,23 @@ int main() {
     RUN_TEST(test_emitter_streaming_openai_has_done);
     RUN_TEST(test_emitter_nonstreaming_accumulates);
     RUN_TEST(test_emitter_anthropic_thinking_blocks);
+    RUN_TEST(test_emitter_initial_mode_reasoning_routes_to_reasoning_content);
+    RUN_TEST(test_emitter_initial_mode_reasoning_unclosed_stays_reasoning);
+    RUN_TEST(test_emitter_initial_mode_reasoning_strips_redundant_think_opener);
+    RUN_TEST(test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking);
+
+    std::fprintf(stderr, "\n── CONTENT-mode plain-text call:<verb>{} ──\n");
+    RUN_TEST(test_emitter_content_mode_plain_text_call_parsed);
+    RUN_TEST(test_emitter_content_mode_no_tools_skips_plain_text_call);
+    RUN_TEST(test_emitter_content_mode_underscore_prefix_call_parsed);
+    RUN_TEST(test_emitter_content_mode_no_call_substring_skips_parser);
+    RUN_TEST(test_emitter_content_mode_mixed_calls_multiple);
+    RUN_TEST(test_emitter_content_mode_malformed_call_dropped);
+    RUN_TEST(test_emitter_content_mode_does_not_double_fire_on_tool_call_xml);
+    RUN_TEST(test_emitter_content_mode_strips_call_span_from_accumulated_text);
+    RUN_TEST(test_emitter_content_mode_anthropic_emits_tool_use_block);
+    RUN_TEST(test_emitter_content_mode_digit_start_verb_parsed);
+    RUN_TEST(test_emitter_content_mode_responses_done_uses_pre_strip_text);
 
     std::fprintf(stderr, "\n── Stop sequences ──\n");
     RUN_TEST(test_stop_sequence_basic);
@@ -3965,6 +4835,19 @@ int main() {
     RUN_TEST(test_jinja_render_bos_eos_threaded);
     RUN_TEST(test_jinja_render_empty_template_throws);
     RUN_TEST(test_jinja_render_bad_tools_json_throws);
+    RUN_TEST(test_chat_template_qwen3_enable_thinking_pre_opens);
+    RUN_TEST(test_chat_template_qwen3_disable_thinking_does_not_pre_open);
+    RUN_TEST(test_chat_template_qwen3_no_gen_prompt_does_not_pre_open);
+    RUN_TEST(test_chat_template_laguna_enable_thinking_pre_opens);
+    RUN_TEST(test_chat_template_laguna_disable_thinking_does_not_pre_open);
+    RUN_TEST(test_chat_template_gemma4_does_not_pre_open);
+    RUN_TEST(test_jinja_render_suffix_sniff_sets_started_in_thinking);
+    RUN_TEST(test_jinja_render_suffix_sniff_negative);
+    RUN_TEST(test_jinja_render_suffix_sniff_overrides_enable_thinking_flag);
+    RUN_TEST(test_jinja_render_suffix_sniff_requires_add_generation_prompt);
+    RUN_TEST(test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning);
+    RUN_TEST(test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning);
+    RUN_TEST(test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content);
     RUN_TEST(test_normalize_responses_tool_followup_messages);
 
     std::fprintf(stderr, "\n── Placement config ──\n");
@@ -3977,7 +4860,6 @@ int main() {
     RUN_TEST(test_backend_precision_hip_arch_policy);
     RUN_TEST(test_backend_precision_activation_type_combine);
     RUN_TEST(test_layer_split_backend_inline_snapshot_and_restore_delta);
-    RUN_TEST(test_layer_split_backend_sampling_capability_gate);
     RUN_TEST(test_layer_split_backend_chunks_prefill_by_adapter_limit);
     RUN_TEST(test_layer_split_compress_nopark_uses_default_drafter_path);
     RUN_TEST(test_layer_split_compress_rejects_bad_keep_ratio);
@@ -4032,6 +4914,12 @@ int main() {
     RUN_TEST(test_props_model_card_null_on_family_fallback);
     RUN_TEST(test_props_budget_envelope_shape);
     RUN_TEST(test_props_runtime_shape);
+    RUN_TEST(test_props_build_block_shape_no_image_info);
+    RUN_TEST(test_props_build_block_with_image_info);
+    RUN_TEST(test_props_model_target_draft_shape);
+    RUN_TEST(test_props_model_draft_null_when_target_only);
+    RUN_TEST(test_props_host_block_present_when_populated);
+    RUN_TEST(test_props_host_block_null_when_missing);
 
     std::fprintf(stderr, "\n── usage.timings ──\n");
     RUN_TEST(test_usage_timings_openai_chat_streaming);
diff --git a/share/model_cards/_schema.json b/share/model_cards/_schema.json
index 3fc204cb4..d83e70061 100644
--- a/share/model_cards/_schema.json
+++ b/share/model_cards/_schema.json
@@ -65,6 +65,17 @@
         "repetition_penalty": { "type": "number" }
       }
     },
+    "thinking_control": {
+      "type": "object",
+      "description": "Optional. Client-side prompt-level thinking-control tokens. Read by luce-bench when running against providers that ignore the API-side flags (chat_template_kwargs.enable_thinking / thinking / reasoning_effort) — for the Qwen3.x family this is e.g. '/think' and '/no_think'. The token is appended to the last user turn before serialization; see luce-bench/src/lucebench/_thinking.py. Only `user_turn_suffix` is supported in v1.",
+      "additionalProperties": false,
+      "required": ["think_prompt_token", "nothink_prompt_token", "injection_point"],
+      "properties": {
+        "think_prompt_token":   { "type": "string", "minLength": 1 },
+        "nothink_prompt_token": { "type": "string", "minLength": 1 },
+        "injection_point":      { "type": "string", "enum": ["user_turn_suffix"] }
+      }
+    },
     "reasoning_effort_tiers": {
       "type": "object",
       "description": "Optional. Explicit per-tier phase-1 budgets. Overrides any computed default. Use when ratio-based defaults don't fit the model.",
diff --git a/share/model_cards/laguna-xs.2.json b/share/model_cards/laguna-xs.2.json
index bc0dda85a..a4fb86f14 100644
--- a/share/model_cards/laguna-xs.2.json
+++ b/share/model_cards/laguna-xs.2.json
@@ -3,11 +3,15 @@
   "source": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF",
   "verified_at": "2026-05-24",
   "download_urls": {
-    "Q4_K_M": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-Q4_K_M.gguf",
-    "bf16":   "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-bf16.gguf"
+    "Q4_K_M-target":    "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-Q4_K_M.gguf",
+    "bf16-target":      "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-bf16.gguf",
+    "DFlash-speculator":"https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash/resolve/main/model.safetensors",
+    "DFlash-speculator-config":"https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash/resolve/main/config.json"
   },
-  "notes": "Non-reasoning MoE code model (3B active / 33B total). Card does not specify generation params or a complex-problem mode. context: native 4096, PFlash-extended to 131072. Sampling defaults below are code-model-typical (not from card). general.name has not been verified against a loaded GGUF — confirm and rename file if needed.",
-  "max_tokens": 4096,
+  "notes": "Reasoning MoE code model (3B active / 33B total). The README and the published Poolside speculator confirm a `<think>` reasoning channel and an `enable_thinking=true` chat-template flag. Native 131072 context. DFlash speculator is published as poolside/Laguna-XS.2-speculator.dflash (safetensors, 5-layer Qwen3-flavored draft: head_dim=128, hidden=2048, q_norm + k_norm). Our existing safetensors loader consumes it directly with dynamic dim inference; bench measured +60% decode rate vs no-draft (125 vs 78 tok/s at temp=0) on bragi. Sampling defaults are code-model conservative; the upstream card does not pin them.",
+  "max_tokens": 16384,
+  "hard_limit_reply_budget": 4096,
+  "thinking_terminator_hint": "Considering the limited time by the user, I have to give the solution based on the thinking directly now.\n</think>\n\n",
   "sampling": {
     "temperature": 0.6,
     "top_p": 0.95,
diff --git a/share/model_cards/qwen3.6-27b.json b/share/model_cards/qwen3.6-27b.json
index 94094dddf..5448be8b3 100644
--- a/share/model_cards/qwen3.6-27b.json
+++ b/share/model_cards/qwen3.6-27b.json
@@ -20,5 +20,10 @@
     "high": 32256,
     "x-high": 56832,
     "max": 81408
+  },
+  "thinking_control": {
+    "think_prompt_token": "/think",
+    "nothink_prompt_token": "/no_think",
+    "injection_point": "user_turn_suffix"
   }
 }
\ No newline at end of file