diff --git a/docs/specs/openapi-props.yaml b/docs/specs/openapi-props.yaml index c2976265f..63daacba1 100644 --- a/docs/specs/openapi-props.yaml +++ b/docs/specs/openapi-props.yaml @@ -2,7 +2,7 @@ openapi: 3.1.0 info: title: dflash_server /props endpoint - version: "2" + version: "4" summary: Capability and configuration introspection for dflash_server. description: | `GET /props` returns enough JSON for a dashboard, a deployment @@ -13,7 +13,28 @@ info: The integer reported as `server.props_schema` (and as the trailing token of `build_info`) bumps when the response shape changes in a backward-incompatible way. The current schema is - `2`. + `4`. + + Schema `4` (additive over `3`): new top-level `host` block — + host-identity facts (OS, kernel, WSL version, docker version, + NVIDIA driver, NVIDIA Container Toolkit version, CPU model, + nproc, RAM, per-GPU array with UUID/PCI/SM/VRAM/power, + CUDA_VISIBLE_DEVICES) captured by + `server/scripts/entrypoint.sh` from the `LUCEBOX_HOST_*` env + the host wrapper exports. `null` when + `/opt/lucebox-hub/HOST_INFO` is missing (bare-metal dev). See + `props-endpoint.md` §4.17. + + Schema `3` (additive over `2`): new top-level `build` block — + a structured replacement for the single-string `build_info` + that carries `git_sha`, `image_tag`, and `build_time` baked + into the container at build time. New `model.target` and + `model.draft` sub-objects carry full GGUF identity (absolute + path, `size_bytes`, `sha256`, and a `gguf` header field set — + architecture, quant `file_type`, `block_count`, + `embedding_length`, `context_length`, `vocab_size`). The + pre-3 fields (`build_info`, `model_path`, `model.draft_path`, + `model_alias`) stay verbatim for back-compat. Schema `2` (breaking change vs. `1`): `model_card` is now the wholesale on-disk sidecar JSON (or `null` when family / hard @@ -29,8 +50,10 @@ info: # The numeric value matches `server.props_schema` and the # `props_schema=` token in `build_info`. 
Bumps on breaking -# response-shape changes; additive changes keep the same value. -x-props-schema: 2 +# response-shape changes; additive changes may also bump +# (e.g. schema 2 → 3 is additive but still bumps so consumers +# can negotiate the new fields). +x-props-schema: 4 servers: - url: http://localhost:8080 @@ -82,7 +105,37 @@ paths: high: 32256 x-high: 56832 max: 81408 - build_info: "luce-dflash v0.0.0+cpp props_schema=2" + build: + server_name: "luce-dflash" + server_version: "0.0.0+cpp" + props_schema: 4 + git_sha: "6d12378abc456789012345678901234567890abcd" + image_tag: "sha-6d12378-cuda12" + image_digest: null + build_time: "2026-05-28T13:43:57Z" + build_info: "luce-dflash v0.0.0+cpp props_schema=4" + host: + os_pretty: "Ubuntu 22.04.3 LTS" + kernel: "6.6.87.2-microsoft-standard-WSL2" + wsl_version: "wsl2" + docker_version: "29.1.3" + nvidia_driver: "596.36" + nvidia_ctk_version: "1.16.2" + cpu_model: "Intel(R) Core(TM) Ultra 9 275HX" + nproc: 24 + ram_gb: 64 + gpus: + - index: 0 + uuid: "GPU-abc" + pci_bus_id: "00000000:01:00.0" + name: "NVIDIA GeForce RTX 5090 Laptop GPU" + sm: "12.0" + vram_gb: 24 + power_limit_w: 175 + cuda_visible_devices: "0" + source: "lucebox.sh" + collector: "lucebox.sh" + collected_at: "2026-05-28T20:31:42Z" capabilities: reasoning_supported: true speculative_supported: true @@ -104,8 +157,37 @@ paths: lifetime_hits: 0 model: arch: "qwen35" - draft_path: "/.../dflash-draft-3.6-q4_k_m.gguf" + alias: "dflash" + draft_path: "/.../dflash-draft-3.6-q8_0.gguf" tokenizer_id: null + target: + path: "/.../Qwen3.6-27B-Q4_K_M.gguf" + size_bytes: 17134510080 + sha256: "abc123def456...0a1b2c3d4e5f" + gguf: + general.architecture: "qwen35" + general.name: "Qwen3.6-27B" + general.file_type: 15 + general.file_type_name: "Q4_K_M" + general.quantization_version: 2 + block_count: 64 + embedding_length: 5120 + context_length: 65536 + vocab_size: 152064 + draft: + path: "/.../dflash-draft-3.6-q8_0.gguf" + size_bytes: 425000000 + sha256: "deadbeef...0a1b2c3d4e5f" + gguf: + 
general.architecture: "qwen3" + general.name: "Qwen3-0.6B-DFlash-draft" + general.file_type: 7 + general.file_type_name: "Q8_0" + general.quantization_version: 2 + block_count: 28 + embedding_length: 1024 + context_length: 32768 + vocab_size: 152064 model_alias: "dflash" model_card: name: "Qwen3.6 27B" @@ -169,7 +251,7 @@ paths: supports_top_p: true server: name: "luce-dflash" - props_schema: 2 + props_schema: 4 version: "0.0.0+cpp" speculative: enabled: true @@ -190,11 +272,13 @@ components: required: - api - budget_envelope + - build - build_info - capabilities - daemon - default_generation_settings - full_cache + - host - model - model_alias - model_card @@ -214,13 +298,21 @@ components: $ref: "#/components/schemas/Api" budget_envelope: $ref: "#/components/schemas/BudgetEnvelope" + build: + $ref: "#/components/schemas/Build" build_info: type: string description: | Single-string identity: ` v props_schema=`. Matches the structured `server` object. Bumps `props_schema` on breaking changes. - example: "luce-dflash v0.0.0+cpp props_schema=2" + + Deprecated in favor of the structured `build` block + (schema 3+), which also carries `git_sha`, + `image_tag`, and `build_time`. Retained for + back-compat — consumers that grep `build_info` keep + working. + example: "luce-dflash v0.0.0+cpp props_schema=4" capabilities: $ref: "#/components/schemas/Capabilities" daemon: @@ -229,6 +321,17 @@ components: $ref: "#/components/schemas/DefaultGenerationSettings" full_cache: $ref: "#/components/schemas/FullCache" + host: + oneOf: + - $ref: "#/components/schemas/Host" + - type: "null" + description: | + Host-identity facts captured at container startup by + `server/scripts/entrypoint.sh` from the + `LUCEBOX_HOST_*` env the host wrapper exports. `null` + when `/opt/lucebox-hub/HOST_INFO` is missing (bare- + metal dev). Added in schema 4 (additive over 3); pre-4 + consumers ignore the key. 
model: $ref: "#/components/schemas/Model" model_alias: @@ -432,12 +535,148 @@ components: description: Cumulative hit count since server start. example: 0 + Host: + description: | + Host-identity facts captured at container startup by + `server/scripts/entrypoint.sh` from the `LUCEBOX_HOST_*` + env the host wrapper (`lucebox.sh::probe_host`) exports. + Written to `/opt/lucebox-hub/HOST_INFO` (path override: + `$DFLASH_HOST_INFO_PATH` for tests) and read verbatim + into `ServerConfig.host_info`. See + `docs/specs/props-endpoint.md` §4.17 for the prose spec. + + Surfaces so every benchmark snapshot can self-classify + the rig it ran on. `luce-bench snapshot` writes this + verbatim into `host.json` and into each per-area + `.json` so individual area files self-describe. + type: object + required: + - collected_at + - collector + - source + additionalProperties: true + properties: + os_pretty: + type: ["string", "null"] + description: | + `PRETTY_NAME` from `/etc/os-release`. + example: "Ubuntu 22.04.3 LTS" + kernel: + type: ["string", "null"] + description: | + `uname -r` on the host. + example: "6.6.87.2-microsoft-standard-WSL2" + wsl_version: + type: ["string", "null"] + enum: ["wsl1", "wsl2", null] + description: | + `"wsl2"` matches the modern `microsoft-standard-WSL2` + kernel string; `"wsl1"` is the legacy translation + layer; `null` is bare Linux / macOS. + docker_version: + type: ["string", "null"] + description: | + Docker server version from + `docker version --format '{{.Server.Version}}'`. + example: "29.1.3" + nvidia_driver: + type: ["string", "null"] + description: Driver version from `nvidia-smi`. + example: "596.36" + nvidia_ctk_version: + type: ["string", "null"] + description: | + NVIDIA Container Toolkit version + (`nvidia-ctk --version`). Distinct from + `docker_version` — the runtime that wires GPUs into + containers can lag behind the daemon. 
+ example: "1.16.2" + cpu_model: + type: ["string", "null"] + description: First `"model name"` from `/proc/cpuinfo`. + example: "Intel(R) Core(TM) Ultra 9 275HX" + nproc: + type: ["integer", "null"] + minimum: 0 + description: Logical CPU count. + example: 24 + ram_gb: + type: ["integer", "null"] + minimum: 0 + description: Total RAM in GB. + example: 64 + gpus: + type: array + description: | + One entry per installed GPU; the array preserves + nvidia-smi's enumeration order. Possibly empty. + items: + type: object + additionalProperties: true + properties: + index: + type: integer + minimum: 0 + example: 0 + uuid: + type: string + example: "GPU-abc" + pci_bus_id: + type: string + example: "00000000:01:00.0" + name: + type: string + example: "NVIDIA GeForce RTX 5090 Laptop GPU" + sm: + type: string + description: Compute capability (e.g. "12.0"). + example: "12.0" + vram_gb: + type: integer + minimum: 0 + example: 24 + power_limit_w: + type: integer + minimum: 0 + description: | + May differ from manufacturer spec when the + operator has set a power cap. + example: 175 + cuda_visible_devices: + type: ["string", "null"] + description: | + Mirrors the env var; `null` means "all GPUs visible". + example: "0" + source: + type: string + enum: ["lucebox.sh", "unknown"] + description: | + How the block was populated. `"lucebox.sh"` when the + host wrapper drove the run; `"unknown"` for the + entrypoint stub-only path. + example: "lucebox.sh" + collector: + type: string + description: | + The script that wrote HOST_INFO: usually + `"lucebox.sh"` when the host wrapper drove the run, or + `"entrypoint.sh"` on the stub-only path. + example: "lucebox.sh" + collected_at: + type: string + format: date-time + description: ISO 8601 UTC timestamp. + example: "2026-05-28T20:31:42Z" + Model: description: Loaded model metadata. 
type: object required: - arch + - alias + - draft - draft_path + - target - tokenizer_id properties: arch: @@ -446,18 +685,228 @@ components: Normalized `general.architecture` value from the loaded GGUF (e.g. `qwen35`, `qwen36`, `gemma4`, `laguna`). example: "qwen35" + alias: + type: string + description: | + Mirror of the top-level `model_alias` (schema 3+), + grouped under `model` alongside arch / target / draft. + The top-level `model_alias` stays for back-compat. + example: "dflash" draft_path: type: ["string", "null"] description: | Filesystem path of the loaded speculative-decode draft - GGUF; `null` when no draft is loaded. - example: "/.../dflash-draft-3.6-q4_k_m.gguf" + GGUF; `null` when no draft is loaded. Deprecated in + favor of `model.draft.path` (same value); retained for + back-compat with pre-schema-3 readers. + example: "/.../dflash-draft-3.6-q8_0.gguf" tokenizer_id: type: ["string", "null"] description: | Best-effort tokenizer family hint from GGUF metadata (e.g. `qwen3`). `null` when unknown. example: null + target: + oneOf: + - $ref: "#/components/schemas/GgufFile" + - type: "null" + description: | + GGUF identity for the loaded target weights (schema 3+). + `null` only when the file couldn't be inspected at + startup — typically a load failure that should have + aborted boot, so seeing `null` here is a strong signal + something is wrong. + draft: + oneOf: + - $ref: "#/components/schemas/GgufFile" + - type: "null" + description: | + GGUF identity for the loaded draft weights (schema 3+). + `null` when `--draft` was not passed — the normal + target-only configuration for `laguna` and the + `qwen3.6-moe` preset. Explicit null (not omitted) so + consumers can distinguish "no draft" from "missing + from this schema version." + + GgufFile: + description: | + Identity payload for one loaded GGUF file (schema 3+). + Surfaced under `model.target` and `model.draft`. 
The triple + `path` + `size_bytes` + `sha256` is what "exactly what + weights is this server running" forensics need; the `gguf` + sub-object adds the header fields the loader parses anyway. + + Header fields may be `null` when the GGUF doesn't carry the + corresponding key — drafter GGUFs in particular omit + `context_length` and `vocab_size` more often than full + target models do. + type: object + required: + - path + - size_bytes + - sha256 + - gguf + properties: + path: + type: string + description: Absolute filesystem path of the loaded GGUF. + example: "/.../Qwen3.6-27B-Q4_K_M.gguf" + size_bytes: + type: ["integer", "null"] + format: int64 + minimum: 0 + description: File size in bytes; `null` if the stat() failed. + example: 17134510080 + sha256: + type: ["string", "null"] + description: | + Lowercase hex sha256 of the GGUF file. Computed once at + server startup and cached to a `.sha256` sidecar + so subsequent restarts skip the rehash. `null` when + hashing was disabled (`$DFLASH_SKIP_SHA256=1`) or the + sidecar was unreadable and the file couldn't be opened + for reading. + example: "abc123def456789...0a1b2c3d4e5f" + gguf: + type: object + description: | + Selected `general.*` and `.*` header fields read + from the GGUF. Each field is `null` when the file + doesn't carry the corresponding key. + additionalProperties: true + properties: + "general.architecture": + type: ["string", "null"] + description: Raw `general.architecture` value (e.g. `qwen35`). + example: "qwen35" + "general.name": + type: ["string", "null"] + description: Display name from the GGUF (`general.name`). + example: "Qwen3.6-27B" + "general.file_type": + type: ["integer", "null"] + minimum: 0 + description: | + Raw `LLAMA_FTYPE_*` integer from the GGUF + (`general.file_type`). 15 = Q4_K_M, 17 = Q5_K_M, + 30 = IQ4_XS, 32 = BF16, etc. See + `server/deps/llama.cpp/include/llama.h` for the + full table. 
+ example: 15 + "general.file_type_name": + type: ["string", "null"] + description: | + Decoded operator-friendly tag for `general.file_type` + (`Q4_K_M`, `IQ4_XS`, `BF16`, …). Empty string maps + to `null` when the int is outside the known table. + example: "Q4_K_M" + "general.quantization_version": + type: ["integer", "null"] + minimum: 0 + description: | + Raw `general.quantization_version` from the GGUF. + Bumped on quant-format changes; 2 is the current + value for K-quants and IQ-quants in 2025-2026. + example: 2 + block_count: + type: ["integer", "null"] + minimum: 0 + description: "`.block_count` — number of transformer blocks." + example: 64 + embedding_length: + type: ["integer", "null"] + minimum: 0 + description: "`.embedding_length` — model hidden size." + example: 5120 + context_length: + type: ["integer", "null"] + minimum: 0 + description: | + `.context_length` — the maximum context the + weights themselves were trained for. May exceed the + server's runtime `n_ctx` cap. + example: 65536 + vocab_size: + type: ["integer", "null"] + minimum: 0 + description: | + `.vocab_size` or the length of + `tokenizer.ggml.tokens` (fallback). Useful for + catching target/draft tokenizer mismatches at a + glance. + example: 152064 + + Build: + description: | + Structured server + container identity (schema 3+). The + first three fields mirror the `server` block and the + single-string `build_info`; the next three carry the + Docker image identity baked in at build time via + `docker-bake.hcl` (`GIT_SHA`, `IMAGE_TAG`, `BUILD_TIME`). + + On bare-metal / non-Docker builds (no + `/opt/lucebox-hub/IMAGE_INFO` file), `git_sha`, + `image_tag`, and `build_time` are all `null` — the keys + are still present for shape stability. + type: object + required: + - server_name + - server_version + - props_schema + - git_sha + - image_tag + - image_digest + - build_time + properties: + server_name: + type: string + description: Server identity string (= `server.name`). 
+ example: "luce-dflash" + server_version: + type: string + description: Build version string (= `server.version`). + example: "0.0.0+cpp" + props_schema: + type: integer + minimum: 1 + description: | + Integer schema version (= `server.props_schema`). + Repeated here so a single curl on `/props` `.build` + returns the schema version alongside the rest of the + identity. + example: 4 + git_sha: + type: ["string", "null"] + description: | + Full git commit sha of the source tree the image was + built from. Set by CI from `${{ github.sha }}` via + docker-bake.hcl. `null` outside Docker. + example: "6d12378abc456789012345678901234567890abcd" + image_tag: + type: ["string", "null"] + description: | + Headline tag the image was published under (e.g. + `cuda12`, `sha-6d12378-cuda12`, `0.3.0-cuda12`). Set + by CI from `docker/metadata-action` `version` output. + `null` outside Docker. + example: "sha-6d12378-cuda12" + image_digest: + type: ["string", "null"] + description: | + Reserved for future use — the registry-side + content-addressable digest. Not currently populated by + the build pipeline (the running container doesn't + query its own image via the Docker socket). Always + `null` today. + example: null + build_time: + type: ["string", "null"] + format: date-time + description: | + ISO 8601 UTC timestamp the image was built at. Set by + CI via `date -u`; for local builds via + `scripts/build_image.sh`. `null` outside Docker. + example: "2026-05-28T13:43:57Z" ModelCard: description: | @@ -866,9 +1315,13 @@ components: minimum: 1 description: | Integer schema version. Bumps when the response shape - changes in a backward-incompatible way (see §5 of - props-endpoint.md). Current value is `2`. - example: 2 + changes (see §5 of props-endpoint.md). Current value + is `4`. Schema 4 is additive over 3 (new top-level + `host` block); schema 3 was additive over 2 (new + `build` block, new `model.target`/`model.draft`). 
The + bumps still happen so consumers can negotiate the new + fields. + example: 4 version: type: string description: Build version string (semver + build tag). diff --git a/docs/specs/props-endpoint.md b/docs/specs/props-endpoint.md index e4238df3d..c367ba6aa 100644 --- a/docs/specs/props-endpoint.md +++ b/docs/specs/props-endpoint.md @@ -56,12 +56,14 @@ request will not delay a `/props` response. { "api": { "endpoints": [ … ] }, "budget_envelope": { … }, + "build": { … }, "build_info": "luce-dflash v props_schema=", "capabilities": { … }, "daemon": { "alive": true }, "default_generation_settings": { … }, "full_cache": { … }, - "model": { … }, + "host": { … } | null, + "model": { "arch": "", "alias": "", "draft_path": "", "tokenizer_id": "", "target": { … }, "draft": { … } | null }, "model_alias": "", "model_card": { … } | null, "model_path": "", @@ -155,15 +157,55 @@ absolute-tier ceiling clamping (spec §3.5). actually do with a request; `model_card` (§4.10) is the source of truth for what the authored card says. -### 4.3 `build_info` +### 4.3 `build_info` (legacy) and `build` (schema 3+) ``` -"build_info": "luce-dflash v0.0.0+cpp props_schema=2" +"build_info": "luce-dflash v0.0.0+cpp props_schema=4" +"build": { + "server_name": "luce-dflash", + "server_version": "0.0.0+cpp", + "props_schema": 4, + "git_sha": "6d12378…", + "image_tag": "sha-6d12378-cuda12", + "image_digest": null, + "build_time": "2026-05-28T13:43:57Z" +} ``` -A single string carrying: server name, build version, and the -**`props_schema` version**. Schema version bumps when the response -shape changes in a non-backward-compatible way (see §5). +`build_info` is the legacy single-string identity (server name, +build version, `props_schema`). Schema version bumps when the +response shape changes (see §5). Retained verbatim for back-compat +— consumers that grep `build_info` keep working without changes. 
+ +`build` (schema 3+) is the structured replacement and the +recommended source of truth for "what binary is running": + +- `server_name` / `server_version` / `props_schema` mirror the + identity fields. Repeated here so a single `curl … | jq .build` + returns everything an operator needs. +- `git_sha` — full git commit sha of the source tree the image + was built from. Set by CI from `${{ github.sha }}` via + `docker-bake.hcl`; set locally by `scripts/build_image.sh` from + `git rev-parse HEAD`. `null` on bare-metal builds (no + `/opt/lucebox-hub/IMAGE_INFO` file). +- `image_tag` — headline tag the image was published under + (e.g. `cuda12`, `sha-6d12378-cuda12`, `0.3.0-cuda12`). Set by + CI from `docker/metadata-action`'s `version` output. `null` + outside Docker. +- `image_digest` — reserved for future use. The + content-addressable registry digest would let an operator pin + `ghcr.io/.../lucebox-hub@sha256:…` after a pull; we don't query + the Docker socket from inside the container today, so this is + always `null`. Kept in the schema so adding it later is + additive. +- `build_time` — ISO 8601 UTC timestamp the image was built at. + `null` outside Docker. + +The `git_sha`, `image_tag`, and `build_time` fields are populated from +`/opt/lucebox-hub/IMAGE_INFO`, which `Dockerfile` writes from the +`GIT_SHA`, `IMAGE_TAG`, and `BUILD_TIME` build args. The path can +be overridden with `$DFLASH_IMAGE_INFO_PATH` (used by unit tests +to inject fixtures). ### 4.4 `capabilities` @@ -255,15 +297,91 @@ for an introspection report; not safe for control-flow decisions. 
```json "model": { "arch": "qwen35", + "alias": "dflash", "draft_path": "/path/to/draft.gguf" | null, - "tokenizer_id": "qwen3" | null + "tokenizer_id": "qwen3" | null, + "target": { + "path": "/path/to/Qwen3.6-27B-Q4_K_M.gguf", + "size_bytes": 17134510080, + "sha256": "abc123…", + "gguf": { + "general.architecture": "qwen35", + "general.name": "Qwen3.6-27B", + "general.file_type": 15, + "general.file_type_name": "Q4_K_M", + "general.quantization_version": 2, + "block_count": 64, + "embedding_length": 5120, + "context_length": 65536, + "vocab_size": 152064 + } + }, + "draft": { … } | null } ``` `arch` is the `general.architecture` value from the loaded GGUF, -normalized. `draft_path` is the speculative-decode draft model -path, or `null` when no draft is loaded. `tokenizer_id` is a -best-effort tokenizer family hint from GGUF metadata. +normalized. `tokenizer_id` is a best-effort tokenizer family hint +from GGUF metadata. + +`alias` (schema 3+) mirrors the top-level `model_alias` for +grouping under `model` alongside the rest of the model identity. +The top-level `model_alias` stays for back-compat. + +`draft_path` (schema 1+, legacy) is the speculative-decode draft +GGUF path, or `null` when no draft is loaded. New consumers should +prefer `model.draft.path` — same value, but grouped with the rest +of the draft identity. + +`target` (schema 3+) is the full identity of the loaded target +weights. Always present and non-null when the server is up — a +`null` `target` indicates a load failure that should have aborted +boot, so it's a strong signal something is wrong. + +`draft` (schema 3+) is the same identity payload for the draft +GGUF, or **explicit JSON null** when `--draft` was not passed. +The normal target-only configurations are `laguna` and the +`qwen3.6-moe` preset; explicit-null (not omitted) lets consumers +distinguish "no draft loaded" from "field not in this schema +version." 
+ +#### `model.target` / `model.draft` field shape + +| field | type | meaning | +|---|---|---| +| `path` | `string` | Absolute filesystem path of the loaded GGUF. | +| `size_bytes` | `integer \| null` | File size from `stat()`. `null` if the stat failed. | +| `sha256` | `string \| null` | Lowercase hex sha256 (64 chars). Cached to a `.sha256` sidecar so subsequent restarts skip the rehash. `null` when `$DFLASH_SKIP_SHA256=1` or the file couldn't be opened for reading. | +| `gguf` | `object` | Selected `general.*` and `<arch>.*` header fields. Each field is `null` when the GGUF doesn't carry the corresponding key — drafter GGUFs in particular omit `context_length` and `vocab_size` more often than full target models do. | + +The `gguf` sub-object's keys map 1:1 to GGUF metadata keys: + +- `general.architecture` — raw architecture string (e.g. `qwen35`, + `qwen3`, `gemma4`, `laguna`). +- `general.name` — display name from the GGUF. +- `general.file_type` — raw `LLAMA_FTYPE_*` integer (see + `server/deps/llama.cpp/include/llama.h` for the full table). + 15 = Q4_K_M, 17 = Q5_K_M, 30 = IQ4_XS, 32 = BF16, etc. +- `general.file_type_name` — operator-friendly decoded tag for + `general.file_type` (e.g. `Q4_K_M`, `IQ4_XS`, `BF16`). +- `general.quantization_version` — bumped on quant-format changes + (2 is the current value for K-quants and IQ-quants). +- `block_count` — `<arch>.block_count` (number of transformer + blocks). +- `embedding_length` — `<arch>.embedding_length` (model hidden + size). +- `context_length` — `<arch>.context_length` (max context the + weights themselves were trained for; may exceed the server's + runtime `n_ctx` cap). +- `vocab_size` — `<arch>.vocab_size` or the length of + `tokenizer.ggml.tokens` (fallback when the key isn't written). + +The sha256 is computed once at startup. 
For a multi-GB target +GGUF this is ~30s on a fast NVMe; the result is written to a +sidecar file `.sha256` so subsequent restarts read it from +disk instead of rehashing. 
Set `$DFLASH_SKIP_SHA256=1` to disable +hashing entirely (faster cold start, but `sha256` will be `null` +at /props). ### 4.9 `model_alias` and `model_path` @@ -478,14 +596,126 @@ configuration drift between runs is possible. - `draft_device` — resolved draft-model device placement, or `null` when no draft model is loaded. +### 4.17 `host` (schema 4+) + +```json +"host": { + "os_pretty": "Ubuntu 22.04.3 LTS", + "kernel": "6.6.87.2-microsoft-standard-WSL2", + "wsl_version": "wsl2", + "docker_version": "29.1.3", + "nvidia_driver": "596.36", + "nvidia_ctk_version":"1.16.2", + "cpu_model": "Intel(R) Core(TM) Ultra 9 275HX", + "nproc": 24, + "ram_gb": 64, + "gpus": [ + { + "index": 0, + "uuid": "GPU-abc…", + "pci_bus_id": "00000000:01:00.0", + "name": "NVIDIA GeForce RTX 5090 Laptop GPU", + "sm": "12.0", + "vram_gb": 24, + "power_limit_w": 175 + } + ], + "cuda_visible_devices": "0", + "source": "lucebox.sh", + "collector": "lucebox.sh", + "collected_at": "2026-05-28T20:31:42Z" +} +``` + +Host-identity facts captured at container startup by +`server/scripts/entrypoint.sh` from the `LUCEBOX_HOST_*` env vars the +host wrapper (`lucebox.sh::probe_host`) exports. Written to +`/opt/lucebox-hub/HOST_INFO` (path override: `$DFLASH_HOST_INFO_PATH` +for tests) and read verbatim into `ServerConfig.host_info` by +`server_main::read_host_info`. + +Surfaces so every benchmark snapshot can self-classify the rig it +ran on, even when the snapshot dir is pulled out of context months +later. `luce-bench snapshot` writes this into `host.json` and into +each per-area `.json` so individual area files self-describe. + +`null` when `/opt/lucebox-hub/HOST_INFO` is missing or malformed — +the normal case for bare-metal dev builds that bypass the container +entrypoint entirely. 
Containers launched by `lucebox.sh` always get a +populated block; containers launched directly via `docker run` get a +stub `{"source": "unknown", "collector": "entrypoint.sh", ...}` so +the block is always present in container deployments. + +Fields: + +- `os_pretty` — string|null. `PRETTY_NAME` from + `/etc/os-release`. e.g. `"Ubuntu 22.04.3 LTS"`. +- `kernel` — string|null. `uname -r` on the host. +- `wsl_version` — `"wsl1"`, `"wsl2"`, or `null`. `"wsl2"` matches + the modern `microsoft-standard-WSL2` kernel string; `"wsl1"` is + the legacy translation layer; `null` is bare Linux / macOS. +- `docker_version` — string|null. Docker server version from + `docker version --format '{{.Server.Version}}'`. +- `nvidia_driver` — string|null. Driver version from `nvidia-smi`. +- `nvidia_ctk_version` — string|null. NVIDIA Container Toolkit + version (`nvidia-ctk --version`). Distinct from `docker_version` — + the runtime that wires GPUs into containers can lag behind the + daemon. +- `cpu_model` — string|null. First `"model name"` from + `/proc/cpuinfo`. +- `nproc` — int|null. Logical CPU count. +- `ram_gb` — int|null. Total RAM in GB. +- `gpus` — array of objects (possibly empty). One entry per + installed GPU; the array preserves nvidia-smi's enumeration + order. Per-entry fields: `index` (int), `uuid` (string), + `pci_bus_id` (string), `name` (string), `sm` (string, + compute capability like `"12.0"`), `vram_gb` (int), + `power_limit_w` (int, may differ from manufacturer spec when + the operator has set a power cap). +- `cuda_visible_devices` — string|null. Mirrors the env var; `null` + means "all GPUs visible". +- `source` — string. One of `"lucebox.sh"`, `"unknown"`. Indicates + how the block was populated. +- `collector` — string. The script that wrote HOST_INFO: usually + `"lucebox.sh"` when the host wrapper drove the run, or + `"entrypoint.sh"` on the stub-only path. +- `collected_at` — ISO 8601 UTC timestamp string. + ## 5. 
Schema versioning -`build_info` includes `props_schema=<n>`. The integer `n` bumps -when the response shape changes in a way that breaks existing -clients. The current schema is `2`. +`build_info` includes `props_schema=<n>`, mirrored in +`server.props_schema` and (schema 3+) `build.props_schema`. The +integer `n` bumps when fields are added or changed; consumers +should treat unknown fields as ignorable. The current schema is +`4`. ### 5.0 Changelog +- **`4`** — Additive over `3`. New top-level `host` object — host- + identity facts (OS, kernel, WSL version, docker version, NVIDIA + driver, NVIDIA Container Toolkit version, CPU model, nproc, RAM, + per-GPU array with UUID/PCI/SM/VRAM/power, CUDA_VISIBLE_DEVICES) + captured by `server/scripts/entrypoint.sh` from the + `LUCEBOX_HOST_*` env the host wrapper exports. `null` when + `/opt/lucebox-hub/HOST_INFO` is missing (bare-metal dev). Pre-4 + consumers ignore the new key; new consumers (luce-bench's + snapshot subcommand in particular) gate on the version to know + the block is guaranteed-present, and fall back to a client-side + hostinfo probe against pre-4 servers. +- **`3`** — Additive over `2`. New top-level `build` object — a + structured replacement for the single-string `build_info` that + carries `git_sha`, `image_tag`, and `build_time` baked into the + container at build time. New `model.target` and `model.draft` + sub-objects carry full GGUF identity (path, `size_bytes`, + `sha256`, and `gguf.*` header fields including + `general.file_type[_name]`, `block_count`, `embedding_length`, + `context_length`, `vocab_size`). New `model.alias` field + (mirror of top-level `model_alias`). The pre-3 top-level + `build_info`, `model_path`, `model_alias`, and + `model.draft_path` stay verbatim for back-compat. Schema is + still bumped (vs leaving at `2`) so consumers can negotiate + the new fields and lucebench's preflight can switch its + display format based on the version. 
- **`2`** — `model_card` is now the wholesale on-disk sidecar JSON (or `null` when family/hard fallback was used). Runtime-resolved budget knobs that used to live under `model_card` @@ -496,15 +726,20 @@ clients. The current schema is `2`. lives at `budget_envelope.model_card_source`. - **`1`** — Initial schema. -### 5.1 Non-breaking changes (no version bump) - -- Adding a new top-level section or a new field inside an existing - section. -- Adding a new entry to `api.endpoints` or `reasoning.supported_efforts`. -- Loosening field bounds (e.g. extending an enum's allowed values). - -Clients are required to ignore unknown fields. The schema version -does not bump for additive changes. +### 5.1 Non-breaking changes + +Pure additive changes — new top-level section, new field inside +an existing section, new entry in `api.endpoints` or +`reasoning.supported_efforts`, loosened field bounds — historically +did not bump `props_schema`. Schema 3 is a deliberate exception: +it's additive (new `build`, `model.target`, `model.draft`, +`model.alias`) but bumps the version so consumers (lucebench's +preflight in particular) can opt in to the new display when the +fields are guaranteed-present, and fall back when talking to an +older server. The rule going forward: **bumps are allowed for +additive changes too** — pre-3 clients keep working because they +ignore unknown fields; new clients gate on the version to know +they can rely on the new shape. ### 5.2 Breaking changes (bump `props_schema`) @@ -546,7 +781,42 @@ version increments. 
"max": 81408 } }, - "build_info": "luce-dflash v0.0.0+cpp props_schema=2", + "build": { + "server_name": "luce-dflash", + "server_version": "0.0.0+cpp", + "props_schema": 4, + "git_sha": "6d12378abc456789012345678901234567890abcd", + "image_tag": "sha-6d12378-cuda12", + "image_digest": null, + "build_time": "2026-05-28T13:43:57Z" + }, + "build_info": "luce-dflash v0.0.0+cpp props_schema=4", + "host": { + "os_pretty": "Ubuntu 22.04.3 LTS", + "kernel": "6.6.87.2-microsoft-standard-WSL2", + "wsl_version": "wsl2", + "docker_version": "29.1.3", + "nvidia_driver": "596.36", + "nvidia_ctk_version": "1.16.2", + "cpu_model": "Intel(R) Core(TM) Ultra 9 275HX", + "nproc": 24, + "ram_gb": 64, + "gpus": [ + { + "index": 0, + "uuid": "GPU-abc", + "pci_bus_id": "00000000:01:00.0", + "name": "NVIDIA GeForce RTX 5090 Laptop GPU", + "sm": "12.0", + "vram_gb": 24, + "power_limit_w": 175 + } + ], + "cuda_visible_devices": "0", + "source": "lucebox.sh", + "collector": "lucebox.sh", + "collected_at": "2026-05-28T20:31:42Z" + }, "capabilities": { "reasoning_supported": true, "speculative_supported": true, @@ -570,8 +840,41 @@ version increments. 
}, "model": { "arch": "qwen35", - "draft_path": "/.../dflash-draft-3.6-q4_k_m.gguf", - "tokenizer_id": "qwen3" + "alias": "dflash", + "draft_path": "/.../dflash-draft-3.6-q8_0.gguf", + "tokenizer_id": "qwen3", + "target": { + "path": "/.../Qwen3.6-27B-Q4_K_M.gguf", + "size_bytes": 17134510080, + "sha256": "abc123def456789012345678901234567890abcd0123456789abcdef01234567", + "gguf": { + "general.architecture": "qwen35", + "general.name": "Qwen3.6-27B", + "general.file_type": 15, + "general.file_type_name": "Q4_K_M", + "general.quantization_version": 2, + "block_count": 64, + "embedding_length": 5120, + "context_length": 65536, + "vocab_size": 152064 + } + }, + "draft": { + "path": "/.../dflash-draft-3.6-q8_0.gguf", + "size_bytes": 425000000, + "sha256": "deadbeef00112233445566778899aabbccddeeff00112233445566778899aabb", + "gguf": { + "general.architecture": "qwen3", + "general.name": "Qwen3-0.6B-DFlash-draft", + "general.file_type": 7, + "general.file_type_name": "Q8_0", + "general.quantization_version": 2, + "block_count": 28, + "embedding_length": 1024, + "context_length": 32768, + "vocab_size": 152064 + } + } }, "model_alias": "dflash", "model_card": { diff --git a/docs/specs/thinking-budget.md b/docs/specs/thinking-budget.md index 5ebc731be..4079e264b 100644 --- a/docs/specs/thinking-budget.md +++ b/docs/specs/thinking-budget.md @@ -125,7 +125,7 @@ Fields: | `verified_at` | ISO date the values were last checked against the source. | | `max_tokens` | The card's standard recommended combined cap. Drives `default_max_tokens`. | | `complex_problem_max_tokens` | Optional. The card's recommendation for hard reasoning / benchmark workloads. Drives the `x-high` and `max` effort tiers, which sit *above* `default_max_tokens` when this field is present — they are admissible as long as they fit under `max_ctx − hard_limit_reply_budget`. If omitted, both collapse to the `high` tier value. | -| `hard_limit_reply_budget` | Optional. 
Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `server/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. | +| `hard_limit_reply_budget` | Optional. Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. | | `sampling` | Recommended sampler params. Used as defaults when the request doesn't pin sampler values. | | `reasoning_effort_tiers` | Explicit phase-1 budgets per tier. Override any computed default. 
Whichever tiers are present win; missing tiers fall through to the computed defaults below. | diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp index 1349109ad..6fecc3752 100644 --- a/server/src/server/chat_template.cpp +++ b/server/src/server/chat_template.cpp @@ -51,7 +51,7 @@ ChatFormat chat_format_for_arch(const std::string & arch) { return ChatFormat::QWEN3; } -std::string render_chat_template( +PromptRenderResult render_chat_template( const std::vector & messages, ChatFormat format, bool add_generation_prompt, @@ -59,6 +59,10 @@ std::string render_chat_template( const std::string & tools_json) { std::string result; + // `started_in_thinking` is derived deterministically from the template + // branch + render flags below. Set per format inside the switch so a + // future format addition can't silently miss the wiring. + bool started_in_thinking = false; bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null"; switch (format) { @@ -141,6 +145,14 @@ std::string render_chat_template( // even when the client opts in, defeating the thinking-budget // mechanism entirely. result += "\n"; + // The prompt suffix pre-opens `` — the model's very + // first generated token is reasoning, never preceded by an + // explicit `` opener in the stream. Callers must + // start the SSE state machine in REASONING mode and pass + // `started_in_thinking=true` to parse_reasoning() so that + // reasoning text routes to reasoning_content instead of + // leaking into content. + started_in_thinking = true; } } break; @@ -224,6 +236,11 @@ std::string render_chat_template( result += "\n"; if (enable_thinking) { result += ""; + // Same situation as Qwen3.6: Laguna XS.2's enable_thinking + // generation prompt ends with `` so the model starts + // emitting reasoning tokens with no explicit opener in the + // stream. Route subsequent tokens to the reasoning channel. 
+ started_in_thinking = true; } else { // Empty think block — model jumps straight to answer. result += ""; @@ -311,11 +328,17 @@ std::string render_chat_template( result += "<|channel>thought\n"; } } + // Gemma4 does NOT pre-open `` from the prompt; its + // reasoning channel is opened by the model emitting `<|channel>` + // which http_server forwards into the SseEmitter as the text + // `` — so the emitter's existing CONTENT→REASONING + // transition fires on that synthesized opener. started_in_thinking + // stays false (initial CONTENT mode is correct). break; } } - return result; + return PromptRenderResult{std::move(result), started_in_thinking}; } // ─── Jinja path ───────────────────────────────────────────────────────── @@ -353,7 +376,29 @@ static std::shared_ptr get_or_parse(const std::string & template } // namespace -std::string render_chat_template_jinja( +// Sniff a rendered prompt for a trailing `` opener so the caller +// can route subsequent stream tokens to the reasoning channel. Accepts +// optional whitespace after the opener (Qwen3.6 emits `\n`). +// True positive ⇒ caller should treat the prompt as having pre-opened +// the reasoning channel (and the renderer warns loudly so a model-card +// mismatch is visible at runtime). +static bool prompt_ends_with_think_open(const std::string & s) { + static const std::string OPEN = ""; + // Walk back over trailing ASCII whitespace. 
+ size_t end = s.size(); + while (end > 0) { + char c = s[end - 1]; + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { + end--; + } else { + break; + } + } + if (end < OPEN.size()) return false; + return s.compare(end - OPEN.size(), OPEN.size(), OPEN) == 0; +} + +PromptRenderResult render_chat_template_jinja( const std::string & template_src, const std::vector & messages, const std::string & bos_token, @@ -407,14 +452,43 @@ std::string render_chat_template_jinja( throw std::runtime_error(std::string("jinja global_from_json: ") + e.what()); } + std::string rendered; try { jinja::runtime rt(ctx); jinja::value results = rt.execute(*prog); auto parts = jinja::runtime::gather_string_parts(results); - return parts->as_string().str(); + rendered = parts->as_string().str(); } catch (const std::exception & e) { throw std::runtime_error(std::string("jinja runtime: ") + e.what()); } + + // Jinja path: we don't know which template family the caller passed + // in, so derive `started_in_thinking` by sniffing the rendered tail + // for a `` opener. This catches the common Qwen3.6 / Laguna + // chat templates that end with `\n` when enable_thinking is + // honored, plus any custom template that follows the same convention. + // + // The sniff is the source of truth — if the rendered prompt ends + // with ``, the model's first generated token is reasoning + // regardless of the `enable_thinking` flag we passed in. A template + // that hard-codes `` even with enable_thinking=false will + // still pre-open the channel, and we must route accordingly to + // avoid leaking reasoning into the content stream. + // + // Warn only on the mismatch case (sniff=true, enable_thinking=false) + // so a template/model-card disagreement surfaces in server logs + // without spamming the normal-success path. 
+ bool started_in_thinking = + add_generation_prompt && prompt_ends_with_think_open(rendered); + if (started_in_thinking && !enable_thinking) { + std::fprintf(stderr, + "[WARN] render_chat_template_jinja: rendered prompt ends with " + "`` opener despite enable_thinking=false — treating as " + "started_in_thinking=true. Check the template's enable_thinking " + "branch or the model card's reasoning configuration.\n"); + } + + return PromptRenderResult{std::move(rendered), started_in_thinking}; } } // namespace dflash::common diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h index ca7ef9db5..770e65a42 100644 --- a/server/src/server/chat_template.h +++ b/server/src/server/chat_template.h @@ -27,6 +27,23 @@ enum class ChatFormat { GEMMA4, // <|turn>role\n...\n }; +// Provenance for a rendered prompt. `text` is the byte string that gets +// tokenized; `started_in_thinking` records whether the prompt suffix +// pre-opens a `` block (or equivalent reasoning-channel marker) +// that the model is expected to continue into. +// +// Callers route this into the SseEmitter's initial mode and into +// parse_reasoning()'s `started_in_thinking` argument so reasoning text +// emitted before any explicit `` opener is still attributed to +// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna +// enable_thinking prompts (which pre-open `\n` in the assistant +// turn) cause the model to emit reasoning straight into the content +// channel, leaving `reasoning_content` empty. +struct PromptRenderResult { + std::string text; // rendered prompt text, ready to tokenize + bool started_in_thinking; // prompt suffix opens reasoning channel +}; + // Render chat messages into the model-specific prompt string. // The result is plain text ready to be tokenized. // @@ -40,7 +57,7 @@ enum class ChatFormat { // `tools_json` is an optional JSON string containing the tool definitions // array. 
When non-empty, the Qwen3/3.5 template injects a tool preamble // into the system message instructing the model how to emit tags. -std::string render_chat_template( +PromptRenderResult render_chat_template( const std::vector & messages, ChatFormat format, bool add_generation_prompt = true, @@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch); // Internally caches the most recently parsed program per thread (avoids // re-parsing the template on every request). Throws std::runtime_error on // lexer/parser/runtime failure (caller should surface a 500 response). -std::string render_chat_template_jinja( +PromptRenderResult render_chat_template_jinja( const std::string & template_src, const std::vector & messages, const std::string & bos_token, diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp index 62f1b91ff..ef49568b9 100644 --- a/server/src/server/http_server.cpp +++ b/server/src/server/http_server.cpp @@ -257,12 +257,31 @@ static bool curl_forward(int client_fd, const std::string & url, // // SERVER_NAME / SERVER_VERSION mirror the Python server's identity strings // so cross-server consumers (autotune, dashboards) see a stable -// `build_info` shape. Bump PROPS_SCHEMA on breaking changes only: -// - field renamed -// - field removed -// - existing field's semantics change (units, nullability, type) -// Do NOT bump for additive changes (new fields, new sections). -static constexpr int kPropsSchema = 2; +// `build_info` shape. Bump PROPS_SCHEMA when the response shape changes +// — either: +// - breaking: field renamed, removed, or its semantics changed +// (units, nullability, type tightening) +// - additive (new fields / sections) when downstream consumers need +// to negotiate the new shape. 
Pre-bump consumers keep working +// because they ignore unknown fields; the bump signals "the new +// fields are guaranteed-present at this version or higher" so +// code like lucebench's preflight can opt in to the richer display. +// +// Schema 3 (additive vs 2): new top-level `build` block (structured +// version of `build_info` with git_sha/image_tag/build_time), and new +// `model.target` / `model.draft` GGUF-identity sub-objects carrying +// size_bytes + sha256 + gguf header fields. The pre-3 top-level +// `build_info`, `model_path`, `model_alias`, and `model.draft_path` +// are preserved verbatim for back-compat. +// +// Schema 4 (additive vs 3): new top-level `host` block — verbatim +// pass-through of /opt/lucebox-hub/HOST_INFO (written by +// server/scripts/entrypoint.sh from the LUCEBOX_HOST_* env the host +// wrapper probes). Null when HOST_INFO is missing (bare-metal dev or +// manual docker run that bypasses entrypoint). luce-bench's snapshot +// subcommand uses the version bump to gate on the new shape — pre-4 +// servers force a client-side fallback probe. +static constexpr int kPropsSchema = 4; static constexpr char kServerName[] = "luce-dflash"; #ifndef DFLASH_SERVER_VERSION #define DFLASH_SERVER_VERSION "0.0.0+cpp" @@ -393,6 +412,8 @@ static std::string build_stall_tool_prefix(const json & tools, return prefix; } + + // Build the /props response body. // // Non-static so unit tests can call it directly (declared in http_server.h). @@ -439,6 +460,34 @@ json build_props_body(const ServerConfig & config, {"props_schema", kPropsSchema}, }; + // Structured replacement for the single-string `build_info` (schema 3+). + // Reads image identity stashed by server_main from /opt/lucebox-hub/ + // IMAGE_INFO when the binary is running inside a Docker image built by + // docker-bake.hcl. 
On bare-metal / dev builds, image_info is null and + // the three image_* fields stay null; git_sha / image_tag / build_time + // are always present as keys for shape stability. + auto pull_string = [&](const char * field) -> json { + if (!config.image_info.is_object()) return nullptr; + auto it = config.image_info.find(field); + if (it == config.image_info.end()) return nullptr; + if (!it->is_string()) return nullptr; + const std::string & s = it->get_ref(); + if (s.empty()) return nullptr; + return s; + }; + json build_block = { + {"server_name", kServerName}, + {"server_version", DFLASH_SERVER_VERSION}, + {"props_schema", kPropsSchema}, + {"git_sha", pull_string("git_sha")}, + {"image_tag", pull_string("image_tag")}, + // image_digest is set externally (image is content-addressable only + // after push; the running container would need to query its own + // image via the Docker socket, which we don't do today). Reserved. + {"image_digest", nullptr}, + {"build_time", pull_string("build_time")}, + }; + json pflash; if (!pflash_enabled) { pflash = { @@ -502,12 +551,29 @@ json build_props_body(const ServerConfig & config, {"model_path", config.model_path}, {"build_info", std::string(kServerName) + " v" DFLASH_SERVER_VERSION " props_schema=" + std::to_string(kPropsSchema)}, + {"build", build_block}, {"speculative_mode", speculative_mode}, {"server", server}, {"model", { {"arch", config.arch}, + // `alias` mirrors top-level `model_alias` for grouping under + // `model`. The top-level field stays for back-compat (clients + // already grep for `model_alias`); new consumers should prefer + // `model.alias` since that's where all the model identity + // (arch, target, draft, tokenizer_id) lives. + {"alias", config.model_name}, + // Back-compat: pre-schema-3 readers grep `model.draft_path` + // directly. New shape exposes the same path under + // `model.draft.path` along with size/sha256/header fields. {"draft_path", config.draft_path.empty() ? 
json(nullptr) : json(config.draft_path)}, {"tokenizer_id", config.tokenizer_id.empty() ? json(nullptr) : json(config.tokenizer_id)}, + // Schema 3 additions. Always emitted; `target` is null if the + // GGUF couldn't be inspected at startup (rare — implies a load + // failure that should have aborted boot). `draft` is null when + // no draft GGUF is loaded (`--draft` not passed), which is the + // normal target-only configuration for laguna / qwen3.6-moe. + {"target", config.target_gguf.is_null() ? json(nullptr) : config.target_gguf}, + {"draft", config.draft_gguf.is_null() ? json(nullptr) : config.draft_gguf}, }}, {"runtime", { {"backend", config.runtime_backend.empty() ? "cuda" : config.runtime_backend}, @@ -595,6 +661,13 @@ json build_props_body(const ServerConfig & config, // The C++ daemon is linked in-process; if /props is responding, // the daemon is alive by construction. {"daemon", {{"alive", true}}}, + // Host identity (schema 4+). Verbatim pass-through of + // /opt/lucebox-hub/HOST_INFO — see server_main::read_host_info + // and entrypoint.sh::write_host_info. Null when HOST_INFO is + // missing or malformed; null is the explicit "bare metal dev" + // signal that luce-bench's snapshot uses to trigger a + // client-side fallback probe. + {"host", config.host_info.is_null() ? json(nullptr) : config.host_info}, {"api", {{"endpoints", kApiEndpoints}}}, // Capability flags surfaced for clients that don't want to crack // open `reasoning` / `speculative` / etc. — matches the Python @@ -697,6 +770,7 @@ std::vector normalize_chat_messages( cm.role = m.value("role", "user"); bool replayed = false; + // OpenAI format: assistant message with tool_calls field. if (cm.role == "assistant" && m.contains("tool_calls") && m["tool_calls"].is_array() && !m["tool_calls"].empty()) { std::vector call_ids; @@ -711,6 +785,43 @@ std::vector normalize_chat_messages( } } + // Anthropic format: assistant message with tool_use content blocks. 
+ // IDs in tool_use blocks match the IDs stored in tool_memory when + // this server emitted the tool calls. Look them up to get the raw + // model output (already formatted for the model's chat template). + if (!replayed && cm.role == "assistant" && + m.contains("content") && m["content"].is_array()) { + std::vector call_ids; + for (const auto & part : m["content"]) { + if (part.value("type", "") == "tool_use") { + std::string id = part.value("id", ""); + if (!id.empty()) call_ids.push_back(id); + } + } + if (!call_ids.empty()) { + std::string raw = tool_memory.lookup(call_ids); + if (!raw.empty()) { + cm.content = raw; + replayed = true; + } else { + // tool_memory miss (cross-session replay): synthesize + // from the block fields using the model's tool_call XML. + for (const auto & part : m["content"]) { + if (part.value("type", "") == "tool_use") { + json input = part.contains("input") + ? part["input"] : json::object(); + cm.content += "\n"; + cm.content += render_tool_call_xml( + part.value("name", ""), input); + cm.content += "\n"; + } + } + replayed = !cm.content.empty(); + } + } + } + + bool has_tool_results = false; if (!replayed) { if (m.contains("content") && m["content"].is_string()) { cm.content = m["content"].get(); @@ -720,16 +831,45 @@ std::vector normalize_chat_messages( if (ptype == "text" || ptype == "input_text" || ptype == "output_text") { cm.content += part.value("text", ""); + } else if (ptype == "tool_result") { + // Anthropic format: tool result inside a user + // message. Push as a tool-role message so the + // chat template wraps it in tags. 
+ has_tool_results = true; + std::string result_content; + if (part.contains("content")) { + if (part["content"].is_string()) { + result_content = + part["content"].get(); + } else if (part["content"].is_array()) { + for (const auto & c : part["content"]) { + if (c.value("type", "") == "text") { + result_content += + c.value("text", ""); + } + } + } + } + std::string result_id = + part.value("tool_use_id", ""); + chat_msgs.push_back( + {"tool", result_content, result_id}); } } } } - if (format == ApiFormat::RESPONSES && - (cm.role == "system" || cm.role == "developer")) { - system_parts.push_back(cm.content); - } else { - chat_msgs.push_back(std::move(cm)); + // Skip pushing an empty user container when all content was + // tool_result blocks (already pushed as individual tool messages). + bool skip = (cm.role == "user" && has_tool_results && + cm.content.empty()); + if (!skip) { + if (format == ApiFormat::RESPONSES && + (cm.role == "system" || cm.role == "developer")) { + system_parts.push_back(cm.content); + } else { + chat_msgs.push_back(std::move(cm)); + } } } } else if (messages.is_string()) { @@ -1591,7 +1731,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { tools_json = req.tools.dump(); } - std::string rendered; + PromptRenderResult render_result; if (!config_.chat_template_src.empty()) { // Jinja path: caller supplied a chat template file via // --chat-template-file. Override the hardcoded QWEN3/LAGUNA @@ -1608,7 +1748,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { ? 
tokenizer_.raw_token(tokenizer_.eos_id()) : std::string(); try { - rendered = render_chat_template_jinja( + render_result = render_chat_template_jinja( config_.chat_template_src, chat_msgs, bos_str, @@ -1622,11 +1762,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) { return true; } } else { - rendered = render_chat_template(chat_msgs, chat_format_, - true, enable_thinking, - tools_json); - } - req.prompt_tokens = tokenizer_.encode(rendered); + render_result = render_chat_template(chat_msgs, chat_format_, + true, enable_thinking, + tools_json); + } + // Propagate prompt provenance so the SseEmitter's initial mode + // matches the template's pre-opened reasoning channel (Qwen3.6 / + // Laguna enable_thinking case). Without this, reasoning text + // leaks into the content channel and `reasoning_content` stays + // empty — see fix(server): route Qwen3.6/Laguna think-mode + // reasoning to reasoning_content channel. + req.started_in_thinking = render_result.started_in_thinking; + req.prompt_tokens = tokenizer_.encode(render_result.text); // count_tokens: short-circuit after tokenization. Skip generation // entirely — Anthropic's contract is just `{"input_tokens": N}`. @@ -1770,11 +1917,20 @@ void HttpServer::worker_loop() { } } - // Create SSE emitter for streaming state machine. + // Create SSE emitter for streaming state machine. `initial_mode` + // tracks whether the chat-template prompt pre-opened a `` + // block (Qwen3.6 / Laguna enable_thinking path). When true, the + // emitter starts in REASONING so the model's first generated + // token routes to reasoning_content even though no explicit + // `` opener appears in the token stream. + const StreamMode initial_mode = req.started_in_thinking + ? 
StreamMode::REASONING + : StreamMode::CONTENT; SseEmitter emitter(req.format, req.response_id, req.model, (int)req.prompt_tokens.size(), req.tools, &tool_memory_, - req.stop_sequences); + req.stop_sequences, + initial_mode); // Emit initial SSE events (skip when proxying). if (req.stream && config_.pflash_upstream_base.empty()) { @@ -2235,6 +2391,7 @@ void HttpServer::worker_loop() { } #endif // DFLASH_HAS_CURL + // Build generate request. // // Thinking-budget v2 (Level 2): when caller opts in via @@ -2252,19 +2409,21 @@ void HttpServer::worker_loop() { const int effective_think_ceiling = (req.per_req_phase1_cap >= 0) ? req.per_req_phase1_cap : config_.think_max_tokens; - // The effective per-request reply budget is the operator's choice - // (CLI / sidecar / per-request override). The AR loop force-closes - // when `n_gen - generated <= eff_reply`, which means n_gen must - // include BOTH the think budget AND the reply reserve. Without the - // `+ eff_reply` term, force-close fires immediately when - // `eff_reply == effective_think_ceiling` (e.g. think_max=4096, - // hard_limit=4096 → remaining starts at 4096, condition fires - // before the model emits a single thinking token). Spec §4.4. + // When thinking is active, max_tokens is the *response* budget only — + // thinking tokens are additive. n_gen = think_ceiling + response_budget, + // where response_budget = min(max_tokens, hard_limit_reply_budget). + // This prevents immediate force-close on benchmarks whose max_tokens + // were sized for nothink responses (e.g. gsm8k=2048, agent_recorded=4096). + // Without this, n_gen = min(think+reply, max_tokens) would cap n_gen + // below the hard_limit threshold, firing force-close at step 0. Spec §4.4. const int eff_reply_for_n_gen = (req.per_req_reply_budget >= 0) ? req.per_req_reply_budget : config_.hard_limit_reply_budget; + const int response_budget = budget_active + ? 
std::min(req.max_output, eff_reply_for_n_gen) + : req.max_output; const int n_gen_cap = budget_active - ? std::min(effective_think_ceiling + eff_reply_for_n_gen, req.max_output) + ? effective_think_ceiling + response_budget : req.max_output; GenerateRequest gen_req; @@ -2620,8 +2779,9 @@ void HttpServer::worker_loop() { const std::string & raw = tokenizer_.raw_token(token); - // Gemma4 thinking channel: map <|channel> → , \n - if (raw == "<|channel>") { + // Gemma4 thinking channel: map <|channel>* → , \n + // raw vocab token is "<|channel>thought", not just "<|channel>". + if (raw.rfind("<|channel>", 0) == 0) { visible_output_seen = true; broadcast_token(""); if (req.stream) { @@ -2811,15 +2971,18 @@ void HttpServer::worker_loop() { } } - // close_kind reflects the Level 2 BudgetHook outcome: "hard" when - // the backend's AR/spec decode injected the close-token sequence - // at the budget boundary, "natural" when the model self-closed - // (or the request never opted in). Emitted as part of - // finish_details for thinking-budget callers. - std::string close_kind = - (req.thinking_opt_in && result.budget_forced_close) - ? "hard" - : "natural"; + // close_kind reflects the Level 2 BudgetHook outcome: + // "natural" — the model emitted on its own (or the + // request never opted in to the envelope). + // "hard" — the budget edge was reached and the AR loop + // forced in. Original Level 2 behavior. + // Soft-close (Level 2.5) lives on a sibling branch; this PR + // reports the natural/hard split that landed first. Emitted as + // part of finish_details for thinking-budget callers. + std::string close_kind = "natural"; + if (req.thinking_opt_in && result.budget_forced_close) { + close_kind = "hard"; + } // Finalize. 
// Per-request wall-clock timings forwarded to the response's @@ -2868,8 +3031,8 @@ void HttpServer::worker_loop() { const std::string & raw = tokenizer_.raw_token(tok); if (tok == tokenizer_.eos_id()) continue; if (tok == tokenizer_.eos_chat_id()) continue; - // Gemma4 channel → think mapping - if (raw == "<|channel>") { emitter.emit_token(""); continue; } + // Gemma4 channel → think mapping; raw token is "<|channel>thought" + if (raw.rfind("<|channel>", 0) == 0) { emitter.emit_token(""); continue; } if (raw == "") { emitter.emit_token("\n"); continue; } // Qwen3.6 thinking tokens (id 248068 / 248069) — must // forward as text so the emitter transitions diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 49fcafb6a..db5b48ff7 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -175,6 +175,39 @@ struct ServerConfig { // the Anthropic tool_use envelope, e.g. froggeric Qwen3.6 template. std::string chat_template_src; // literal Jinja source (loaded from file) std::string chat_template_path; // path it was loaded from (logged at startup) + + // ── /props identity payloads (filled by server_main at startup) ── + // + // `target_gguf` / `draft_gguf`: JSON blobs produced by reading the GGUF + // header + sha256 for each loaded model. Surface verbatim under + // /props.model.target / /props.model.draft so an operator can pin the + // exact weights + quant + sha from a single curl. Empty/null when + // not loaded; `draft_gguf` is null when --draft was not passed. + // See docs/specs/props-endpoint.md §4.8 and build_props_body(). + nlohmann::json target_gguf = nullptr; + nlohmann::json draft_gguf = nullptr; + + // `image_info`: container/image identity read from /opt/lucebox-hub/ + // IMAGE_INFO at server start. Three lines: git_sha, image_tag, + // build_time (ISO 8601). Object with three string fields or null + // when the file is missing (e.g. local non-Docker builds). 
Surfaced + // under /props.build as git_sha/image_tag/build_time. Path overridable + // via $DFLASH_IMAGE_INFO_PATH for tests. + nlohmann::json image_info = nullptr; + + // `host_info`: host-identity facts read from /opt/lucebox-hub/HOST_INFO + // at server start. JSON object written by server/scripts/entrypoint.sh + // from the LUCEBOX_HOST_* env vars the host wrapper exports. Surfaced + // verbatim under /props.host so every benchmark snapshot can self- + // classify the rig it ran on (OS, kernel, WSL version, GPU list with + // per-GPU UUID/PCI/SM/VRAM/power, nvidia driver + CTK versions). + // Null when the file is missing (e.g. bare-metal dev or someone ran + // `docker run` without lucebox.sh — entrypoint still writes a stub + // {"source":"unknown",...} so this is null only on the bare-metal + // path that bypasses entrypoint entirely). Path overridable via + // $DFLASH_HOST_INFO_PATH for tests. See HOST_INFO doc at + // docs/specs/props-endpoint.md §4.10. + nlohmann::json host_info = nullptr; }; // ─── Parsed request ───────────────────────────────────────────────────── @@ -215,6 +248,12 @@ struct ParsedRequest { // Bandit: per-session adaptive keep_ratio opt-in std::string session_id; DiskPrefixCachePolicy disk_cache_policy; + // Set by the chat-template renderer when the rendered prompt suffix + // pre-opens a `` block (Qwen3.6 / Laguna enable_thinking path). + // Drives the SseEmitter's initial mode so reasoning tokens emitted + // before any explicit `` opener route to reasoning_content + // instead of leaking into content. + bool started_in_thinking = false; }; // Build the /props response body. 
Exposed (non-static) so unit tests diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 2c7dc850f..b5ede8338 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -25,15 +25,20 @@ #include "gguf.h" #include +#include + + #include #include #include #include +#include #include #include #include using namespace dflash::common; +using nlohmann::json; // Global server pointer for signal handling. static HttpServer * g_server = nullptr; @@ -45,6 +50,87 @@ static void signal_handler(int sig) { } } +// Render a GgufMetadata as the JSON object surfaced under +// /props.model.target / /props.model.draft (schema 3+). Header keys that +// the file didn't carry come through as JSON null so consumers can tell +// "no key in GGUF" from "0" — important for context_length / vocab_size +// where 0 is implausible but missing is common on hand-edited drafters. +static json gguf_metadata_to_json(const GgufMetadata & m) { + auto str_or_null = [](const std::string & s) -> json { + return s.empty() ? json(nullptr) : json(s); + }; + auto i32_or_null = [](int32_t v) -> json { + return v < 0 ? json(nullptr) : json(v); + }; + json gguf = { + {"general.architecture", str_or_null(m.general_architecture)}, + {"general.name", str_or_null(m.general_name)}, + {"general.file_type", i32_or_null(m.file_type)}, + {"general.file_type_name", str_or_null(m.file_type_name)}, + {"general.quantization_version", i32_or_null(m.quantization_version)}, + {"block_count", i32_or_null(m.block_count)}, + {"embedding_length", i32_or_null(m.embedding_length)}, + {"context_length", i32_or_null(m.context_length)}, + {"vocab_size", i32_or_null(m.vocab_size)}, + }; + return { + {"path", m.path}, + {"size_bytes", m.size_bytes < 0 ? 
json(nullptr) : json(m.size_bytes)}, + {"sha256", str_or_null(m.sha256)}, + {"gguf", gguf}, + }; +} + +// Read /opt/lucebox-hub/IMAGE_INFO (three lines: git_sha, image_tag, +// build_time) into a JSON object surfaced under /props.build. Returns +// JSON null when the file is missing or unreadable — the normal case +// for bare-metal dev builds. Path override via $DFLASH_IMAGE_INFO_PATH +// (set by tests to point at a fixture). +static json read_image_info() { + const char * env_path = std::getenv("DFLASH_IMAGE_INFO_PATH"); + const std::string path = (env_path && *env_path) + ? std::string(env_path) + : std::string("/opt/lucebox-hub/IMAGE_INFO"); + std::ifstream f(path); + if (!f) return nullptr; + std::string git_sha, image_tag, build_time; + std::getline(f, git_sha); + std::getline(f, image_tag); + std::getline(f, build_time); + // If all three are empty (file existed but was blank), treat as missing + // so /props doesn't carry a useless `{git_sha: "", ...}` blob. + if (git_sha.empty() && image_tag.empty() && build_time.empty()) { + return nullptr; + } + json out = json::object(); + if (!git_sha.empty()) out["git_sha"] = git_sha; + if (!image_tag.empty()) out["image_tag"] = image_tag; + if (!build_time.empty()) out["build_time"] = build_time; + return out; +} + +// Read /opt/lucebox-hub/HOST_INFO (JSON written by server/scripts/ +// entrypoint.sh from the LUCEBOX_HOST_* env vars the host wrapper +// exports) into a JSON object surfaced verbatim under /props.host. +// Returns JSON null on missing file or parse error so /props.host +// becomes literal null rather than crashing the handler. Path override +// via $DFLASH_HOST_INFO_PATH for unit tests. +static json read_host_info() { + const char * env_path = std::getenv("DFLASH_HOST_INFO_PATH"); + const std::string path = (env_path && *env_path) + ? 
std::string(env_path) + : std::string("/opt/lucebox-hub/HOST_INFO"); + std::ifstream f(path); + if (!f) return nullptr; + try { + json out = json::parse(f); + if (!out.is_object()) return nullptr; + return out; + } catch (const json::parse_error &) { + return nullptr; + } +} + static bool parse_double_list(const char * value, std::vector & out) { out.clear(); if (!value || !*value) return false; @@ -1057,6 +1143,94 @@ int main(int argc, char ** argv) { // expose the GGUF metadata key it was loaded from, so leave empty // and let /props report null. (Add a getter on Tokenizer later.) + // ── /props identity payloads ──────────────────────────────────────── + // + // Target + draft GGUF identity for /props.model.target / .draft. The + // hash is the slow part (~30s per multi-GB file on NVMe); the GGUF + // header read is cheap. Cached in a sidecar `.sha256` so + // subsequent restarts skip the rehash. + // + // `DFLASH_SKIP_SHA256=1` env disables hashing entirely — useful when + // benchmarking server cold-start latency or when the model dir is + // read-only (no place to write the sidecar). Leaves `sha256` as JSON + // null in /props; the other identity fields still populate. + const bool skip_sha = []() { + const char * v = std::getenv("DFLASH_SKIP_SHA256"); + return v && *v && std::strcmp(v, "0") != 0; + }(); + if (bargs.model_path && *bargs.model_path) { + std::fprintf(stderr, + "[server] inspecting target GGUF for /props%s\n", + skip_sha ? " (sha256 disabled by $DFLASH_SKIP_SHA256)" : ""); + GgufMetadata tm = read_gguf_metadata(bargs.model_path, !skip_sha); + if (tm.ok) { + sconfig.target_gguf = gguf_metadata_to_json(tm); + std::fprintf(stderr, + "[server] target gguf: %s size=%lld sha=%s%s ftype=%s\n", + tm.path.c_str(), (long long)tm.size_bytes, + tm.sha256.empty() ? "(skipped)" : tm.sha256.substr(0, 12).c_str(), + tm.sha256.empty() ? "" : "...", + tm.file_type_name.empty() ? "?" 
: tm.file_type_name.c_str()); + } else { + std::fprintf(stderr, + "[server] WARNING: could not read target GGUF metadata: %s\n", + bargs.model_path); + } + } + if (bargs.draft_path && *bargs.draft_path) { + std::fprintf(stderr, + "[server] inspecting draft GGUF for /props%s\n", + skip_sha ? " (sha256 disabled by $DFLASH_SKIP_SHA256)" : ""); + GgufMetadata dm = read_gguf_metadata(bargs.draft_path, !skip_sha); + if (dm.ok) { + sconfig.draft_gguf = gguf_metadata_to_json(dm); + std::fprintf(stderr, + "[server] draft gguf: %s size=%lld sha=%s%s ftype=%s\n", + dm.path.c_str(), (long long)dm.size_bytes, + dm.sha256.empty() ? "(skipped)" : dm.sha256.substr(0, 12).c_str(), + dm.sha256.empty() ? "" : "...", + dm.file_type_name.empty() ? "?" : dm.file_type_name.c_str()); + } else { + std::fprintf(stderr, + "[server] WARNING: could not read draft GGUF metadata: %s\n", + bargs.draft_path); + } + } + // nlohmann::json::value() throws type_error when the + // key exists but isn't a string (e.g. `"kernel": null`). IMAGE_INFO + // and HOST_INFO are sourced from operator-provided env via + // entrypoint.sh, so we can't assume well-typed values — fall back + // to "(none)" on any missing-or-non-string field rather than + // crashing server startup. + auto json_str_or_none = [](const json & j, const char * key) -> const char * { + auto it = j.find(key); + if (it == j.end() || !it->is_string()) { + return "(none)"; + } + return it->get_ref().c_str(); + }; + + // Container/image identity (Dockerfile bakes /opt/lucebox-hub/IMAGE_INFO). + sconfig.image_info = read_image_info(); + if (!sconfig.image_info.is_null()) { + std::fprintf(stderr, + "[server] image_info: git_sha=%s image_tag=%s build_time=%s\n", + json_str_or_none(sconfig.image_info, "git_sha"), + json_str_or_none(sconfig.image_info, "image_tag"), + json_str_or_none(sconfig.image_info, "build_time")); + } + // Host identity (entrypoint.sh writes /opt/lucebox-hub/HOST_INFO from + // LUCEBOX_HOST_*). 
Surfaced verbatim under /props.host (schema 4+). + // Null on bare-metal dev builds that bypass the container entrypoint. + sconfig.host_info = read_host_info(); + if (!sconfig.host_info.is_null()) { + std::fprintf(stderr, + "[server] host_info: source=%s os=%s kernel=%s\n", + json_str_or_none(sconfig.host_info, "source"), + json_str_or_none(sconfig.host_info, "os_pretty"), + json_str_or_none(sconfig.host_info, "kernel")); + } + // Resolve the Level 2 force-close sequence. Two concepts, both sourced // from the model card sidecar (see model_card.h for semantics): // - marker: bytes that signal end-of-thinking to *us* (parsers). diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp index 604f11a73..e5e9f5a08 100644 --- a/server/src/server/sse_emitter.cpp +++ b/server/src/server/sse_emitter.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -23,7 +24,63 @@ static bool has_request_tools(const json & tools) { return tools.is_array() && !tools.empty(); } -static bool find_tool_start(const std::string & text, size_t & pos) { +// Cheap pre-check: scan `text` for a plausible `call:{` opener +// before invoking the full parse_tool_calls regex sweep. Mirrors the +// shape of re_call_verb_open() in tool_parser.cpp at a coarse +// granularity. Single O(N) pass with a substring skip; no heap alloc +// and no regex compile, so a response with no `call:` substring pays +// only a `find()` cost. +// +// Returns true if any `call:` occurrence is followed by an identifier +// start (`[A-Za-z_]`), more verb chars (`[A-Za-z0-9_.:-]`), optional +// whitespace, and a `{`. We deliberately do NOT validate balanced +// braces here — parse_tool_calls owns that check, and a leading +// `call:foo{` with no close still costs us only one regex scan. 
+// +// The pre-check intentionally accepts `_call:foo{` (SentencePiece +// underscore artifact, see tool_parser.cpp re_call_verb_open() +// rationale) by including `_` in the verb-start charset alternation +// at the top — `find("call:")` lands inside the `_call:` window. +static bool looks_like_plain_text_call(const std::string & text) { + size_t pos = 0; + while ((pos = text.find("call:", pos)) != std::string::npos) { + size_t v = pos + 5; // step past "call:" + if (v < text.size() && + (std::isalnum((unsigned char)text[v]) || text[v] == '_')) { + size_t w = v; + while (w < text.size() && + (std::isalnum((unsigned char)text[w]) || + text[w] == '_' || text[w] == '.' || + text[w] == ':' || text[w] == '-')) { + w++; + } + // Allow whitespace between verb and brace (mirrors `\s*\{`). + while (w < text.size() && std::isspace((unsigned char)text[w])) { + w++; + } + if (w < text.size() && text[w] == '{') return true; + } + pos = v; + } + return false; +} + +// `is_plain_text` (out) reports whether the matched opener was Pattern B +// (plain-text `call:{`) vs Pattern A (XML envelope: ``, +// ``). Callers use this to drive divergent +// downstream behavior at emit_finish: +// - Pattern A: malformed parse → suppress buffer (XML envelopes are not +// user-facing text); .done events expose only the pre-call accumulated +// content. +// - Pattern B: malformed parse → flush buffer back to accumulated_content_ +// so the literal `call:foo{...` span stays caller-visible; on success, +// the raw call text must also appear in the Responses-format +// finalization events (see emit_finish for the responses_streamed_text +// handling). +static bool find_tool_start(const std::string & text, size_t & pos, + bool & is_plain_text) { + is_plain_text = false; + // Pattern A: XML-like openers (, ). 
size_t idx = text.find('<'); while (idx != std::string::npos) { if (text.compare(idx, sizeof(TOOL_OPEN) - 1, TOOL_OPEN) == 0 || @@ -34,6 +91,42 @@ static bool find_tool_start(const std::string & text, size_t & pos) { } idx = text.find('<', idx + 1); } + + // Pattern B: call:{ opener (Gemma4 plain-text emissions). + // Valid sentinels before "call:" mirror tool_parser.cpp Pattern 5: + // start-of-text, whitespace, or one of ,;:()[]{}> + // Require at least one alpha char after the colon (the verb start) + // to avoid false-positives on English "I'll call: ..." prose. + // We do NOT require the closing '{' here — it may arrive in a later + // token. The full parse in parse_tool_calls() at emit_finish() handles + // validation; entering TOOL_BUFFER too eagerly costs only a buffered + // flush, not incorrect output. + static const char CALL_PREFIX[] = "call:"; + static constexpr size_t CALL_PREFIX_LEN = 5; // strlen("call:") + size_t call_pos = 0; + while (call_pos < text.size()) { + size_t found = text.find(CALL_PREFIX, call_pos); + if (found == std::string::npos) break; + + bool valid_sentinel = (found == 0); + if (!valid_sentinel && found > 0) { + char prev = text[found - 1]; + valid_sentinel = (prev == '\n' || prev == '\r' || prev == ' ' || prev == '\t' || + prev == ',' || prev == ';' || prev == ':' || + prev == '(' || prev == '[' || prev == '{' || + prev == ')' || prev == ']' || prev == '}' || prev == '>'); + } + + if (valid_sentinel) { + size_t verb_start = found + CALL_PREFIX_LEN; + if (verb_start < text.size() && std::isalpha((unsigned char)text[verb_start])) { + pos = found; + is_plain_text = true; + return true; + } + } + call_pos = found + 1; + } return false; } @@ -76,15 +169,16 @@ SseEmitter::SseEmitter(ApiFormat format, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - const std::vector & stop_sequences) + const std::vector & stop_sequences, + StreamMode initial_mode) : format_(format) , request_id_(request_id) , model_name_(model_name) , 
prompt_tokens_(prompt_tokens) , tools_(tools) , tool_memory_(tool_memory) - , mode_(StreamMode::CONTENT) - , active_kind_("text") + , mode_(initial_mode) + , active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text") , stop_sequences_(stop_sequences) , created_at_(unix_timestamp()) , msg_item_id_(gen_item_id()) @@ -93,6 +187,12 @@ SseEmitter::SseEmitter(ApiFormat format, for (const auto & s : stop_sequences_) { if (s.size() > stop_holdback_) stop_holdback_ = s.size(); } + // NOTE on `checked_think_prefix_`: we deliberately leave the default + // (false) here even when initial_mode == REASONING. The emitter has a + // one-time guard in emit_token() that strips a redundantly-emitted + // leading `` if the model emits one anyway (model-card / + // template-mismatch edge case). Pre-setting the flag to true would + // skip that strip and leak the duplicate opener into reasoning_text. } // ─── SSE formatting helpers ───────────────────────────────────────────── @@ -381,8 +481,9 @@ std::vector SseEmitter::emit_token(const std::string & raw_piece) { size_t think_idx = window_.find(THINK_OPEN); size_t think_close_idx = window_.find(THINK_CLOSE); size_t tool_idx = std::string::npos; + bool tool_is_plain_text = false; bool tool_hit = has_request_tools(tools_) && - find_tool_start(window_, tool_idx); + find_tool_start(window_, tool_idx, tool_is_plain_text); struct Hit { size_t pos; int type; }; // type: 0=think, 1=think_close, 2=tool-ish std::vector hits; @@ -411,6 +512,7 @@ std::vector SseEmitter::emit_token(const std::string & raw_piece) { // Tool-call syntax. Keep the full tag/function text buffered // until finish so the parser can validate it. tool_buffer_ = window_.substr(h.pos); + tool_open_is_plain_text_ = tool_is_plain_text; window_.clear(); mode_ = StreamMode::TOOL_BUFFER; } @@ -500,6 +602,41 @@ std::vector SseEmitter::emit_finish(int completion_tokens, } window_.clear(); + // Snapshot of pre-strip text for the Responses finalization events. 
+ // + // The Responses-format finalization events + // (response.output_text.done / content_part.done / completed) must + // reflect the full assistant text — including any plain-text + // `call:{...}` span — so a streaming client sees its accumulated + // buffer agree with the server's .done payload, and non-streaming + // builders that consume .completed get the raw assistant emission. + // Meanwhile, accumulated_text() (used by OpenAI Chat / Anthropic final + // shapes and non-streaming Responses builders that DO want stripped + // text to avoid text+tool_use duplication) continues to return the + // post-hoist stripped form. + // + // Cases: + // - Pattern A (XML envelope, mode==TOOL_BUFFER): tool_buffer_ holds + // protocol artifact text (`...`) that was never streamed + // as a delta. The raw envelope is excluded from + // responses_streamed_text — but any `cleaned_text` the parser + // extracts (text outside the XML span) IS emitted as a content + // delta below, so it must also be folded into the snapshot once + // the parse succeeds (handled inline at the cleaned_text emit). + // - Pattern B (plain-text `call:`, mode==TOOL_BUFFER): tool_buffer_ + // holds the raw `call:{...}` span plus any post-call trailing + // text. Both belong in the visible text snapshot per the PR #329 + // review (tests #1126 et al). The snapshot already includes the + // full raw buffer, which is a superset of `cleaned_text`, so we + // don't double-count when the cleaned_text emit happens. + // - mode==CONTENT plain-text hoist branch below: accumulated_content_ + // already contains the full pre-strip text; the snapshot taken + // here freezes it before the strip mutates it. 
+ std::string responses_streamed_text = accumulated_content_; + if (mode_ == StreamMode::TOOL_BUFFER && tool_open_is_plain_text_) { + responses_streamed_text += tool_buffer_; + } + // Parse tool calls from buffer std::string fr = "stop"; if (mode_ == StreamMode::TOOL_BUFFER && !tool_buffer_.empty()) { @@ -518,6 +655,17 @@ std::vector SseEmitter::emit_finish(int completion_tokens, if (!parsed.cleaned_text.empty()) { accumulated_content_ += parsed.cleaned_text; emit_content_delta(out, parsed.cleaned_text); + // Pattern A: snapshot was taken before parsing and only + // included pre-buffer accumulated_content_; the + // cleaned_text we just streamed must also appear in the + // .done/.completed payloads or the client's accumulated + // delta buffer will disagree with the server's final + // text. Pattern B's snapshot already contains the full + // raw buffer (which is a superset of cleaned_text), so + // we skip the append there to avoid double-counting. + if (!tool_open_is_plain_text_) { + responses_streamed_text += parsed.cleaned_text; + } } fr = "tool_calls"; @@ -600,14 +748,170 @@ std::vector SseEmitter::emit_finish(int completion_tokens, break; default: break; } + } else if (tool_open_is_plain_text_) { + // Pattern B (plain-text `call:{...`) failed to parse — + // most commonly an unbalanced `{` (the model's args were + // truncated, or the verb name is real but the JSON body + // never closed). Unlike Pattern A's XML envelopes, the + // buffered span here is plain user-facing text. Flushing + // it back to accumulated_content_ (and re-emitting as a + // content delta) preserves the malformed span as + // caller-visible signal that the model produced garbage — + // dropping it silently would hide the failure mode. + // accumulated_text() then reports the original `call:` + // text exactly as the model emitted it. 
+ accumulated_content_ += tool_buffer_; + emit_content_delta(out, tool_buffer_); + tool_buffer_.clear(); } else { - // Tool syntax was detected but no valid call parsed. Do not leak - // malformed/incomplete XML back to the user as assistant text. + // Pattern A (XML envelope) parse failure. Do not leak + // malformed/incomplete `` / `` markup back to the user as assistant text + // — XML envelopes are protocol artifacts, not prose. See + // test_emitter_does_not_leak_malformed_tool_xml. std::fprintf(stderr, "[server] tool_call parse failed; suppressing buffered tool text " "request_id=%s format=%d bytes=%zu\n", request_id_.c_str(), (int)format_, tool_buffer_.size()); } + } else if (mode_ == StreamMode::CONTENT && + !accumulated_content_.empty() && + has_request_tools(tools_) && + looks_like_plain_text_call(accumulated_content_)) { + // CONTENT-mode plain-text tool-call hoist. Gemma4 (and similar + // models with no XML tool-call template) emits invocations as + // literal text like `call:get_weather{location: "SF"}` or + // `_call:get_weather{...}` (SentencePiece artifact). The emitter + // stays in CONTENT mode for the whole stream because no + // `` / `` opener ever + // arrives. Without this branch the response stops with + // finish_reason="stop" / stop_reason="end_turn" and no tool_use + // block is emitted, breaking forge/agent_recorded scenarios + // that depend on structured tool_calls. + // + // The branch runs parse_tool_calls over accumulated_content_, + // hoists any ToolCalls (the allowlist filter `tool_allowed` is + // already enforced inside parse_tool_calls' add_call lambda, + // so unauthorized verbs never enter parsed.tool_calls), and + // replaces accumulated_content_ with cleaned_text so the final + // response carries the prose-only text (no duplicate `call:` + // span). Streaming clients have already received the raw call + // text as content deltas — they get a post-hoc tool_use block + // appended at finalize. 
Text + tool_use is a legal stream in + // both OpenAI and Anthropic specs. + // + // Gated on has_request_tools(tools_) to mirror the + // TOOL_BUFFER-entry condition at line 391 — if the request + // didn't declare tools we keep `call:foo{}` as visible content + // (see test_emitter_no_tools_keeps_tool_like_text for the + // equivalent XML-shape behavior). + auto parsed = parse_tool_calls(accumulated_content_, tools_); + if (!parsed.tool_calls.empty()) { + tool_calls_ = std::move(parsed.tool_calls); + + // Remember for tool memory (mirrors TOOL_BUFFER branch). + if (tool_memory_) { + std::vector ids; + for (const auto & tc : tool_calls_) ids.push_back(tc.id); + tool_memory_->remember(ids, accumulated_raw_); + } + + // Strip matched call spans from the visible content so the + // non-streaming final-message shape doesn't duplicate them + // as both text AND tool_use. Mirrors + // _strip_plain_text_tool_calls in + // luce-bench/.../forge.py. Streaming clients already saw + // the pre-strip text in earlier deltas; this only affects + // the final accumulated_text() consumed by the response + // builders in http_server.cpp. + // + // For the Responses format we capture a separate snapshot + // (responses_streamed_text, see top of emit_finish) before + // this strip so the streaming finalization events + // (.output_text.done / .content_part.done / .completed) + // continue to agree with the raw .delta events the client + // already received. Without the snapshot the .done payload + // would carry the stripped text and a streaming client's + // accumulated buffer would disagree with the server's + // claimed "done" text. + accumulated_content_ = parsed.cleaned_text; + + fr = "tool_calls"; + + // Format-specific tool call events — same shape as the + // TOOL_BUFFER branch above. Kept inlined (rather than + // refactored into a helper) to keep this commit's diff + // minimal and side-by-side reviewable against the + // upstream block. 
+ switch (format_) { + case ApiFormat::OPENAI_CHAT: { + json tc_list = json::array(); + for (size_t i = 0; i < tool_calls_.size(); i++) { + tc_list.push_back({ + {"index", (int)i}, + {"id", tool_calls_[i].id}, + {"type", "function"}, + {"function", { + {"name", tool_calls_[i].name}, + {"arguments", tool_calls_[i].arguments} + }} + }); + } + out.push_back(format_openai_delta({{"tool_calls", tc_list}})); + break; + } + case ApiFormat::ANTHROPIC: { + if (!active_kind_.empty()) { + out.push_back(sse_event("content_block_stop", + json({{"type", "content_block_stop"}, {"index", block_index_}}).dump())); + active_kind_.clear(); + } + for (const auto & tc : tool_calls_) { + block_index_++; + json tu_block = { + {"type", "tool_use"}, + {"id", tc.id}, + {"name", tc.name}, + {"input", json::object()} + }; + out.push_back(sse_event("content_block_start", + json({{"type", "content_block_start"}, + {"index", block_index_}, + {"content_block", tu_block}}).dump())); + if (!tc.arguments.empty()) { + out.push_back(sse_event("content_block_delta", + json({{"type", "content_block_delta"}, + {"index", block_index_}, + {"delta", {{"type", "input_json_delta"}, + {"partial_json", tc.arguments}}}}).dump())); + } + out.push_back(sse_event("content_block_stop", + json({{"type", "content_block_stop"}, + {"index", block_index_}}).dump())); + } + break; + } + case ApiFormat::RESPONSES: + for (const auto & tc : tool_calls_) { + out.push_back(format_responses_event( + "response.function_call_arguments.delta", { + {"item_id", tc.id}, {"output_index", 0}, + {"delta", tc.arguments} + })); + out.push_back(format_responses_event( + "response.function_call_arguments.done", { + {"item_id", tc.id}, {"output_index", 0}, + {"arguments", tc.arguments}, {"name", tc.name} + })); + } + break; + default: break; + } + } + // If parse_tool_calls matched the substring pre-check but + // returned no calls (all filtered by tool_allowed, or all args + // malformed), `fr` stays "stop" and accumulated_content_ is + // 
left intact. Caller sees the original prose; no leak. } // Format-specific final events @@ -670,16 +974,21 @@ std::vector SseEmitter::emit_finish(int completion_tokens, } case ApiFormat::RESPONSES: { + // Use the pre-strip snapshot for the streaming finalization + // events (.done / .completed) so they agree with the + // .delta events that preceded them. See + // responses_streamed_text init at the top of emit_finish for + // rationale. // output_text.done out.push_back(format_responses_event("response.output_text.done", { {"item_id", msg_item_id_}, {"output_index", 0}, - {"content_index", 0}, {"text", accumulated_content_} + {"content_index", 0}, {"text", responses_streamed_text} })); // content_part.done out.push_back(format_responses_event("response.content_part.done", { {"item_id", msg_item_id_}, {"output_index", 0}, {"content_index", 0}, - {"part", {{"type", "output_text"}, {"text", accumulated_content_}, + {"part", {{"type", "output_text"}, {"text", responses_streamed_text}, {"annotations", json::array()}}} })); @@ -698,7 +1007,7 @@ std::vector SseEmitter::emit_finish(int completion_tokens, {"type", "message"}, {"id", msg_item_id_}, {"status", "completed"}, {"role", "assistant"}, {"content", json::array({{ - {"type", "output_text"}, {"text", accumulated_content_}, + {"type", "output_text"}, {"text", responses_streamed_text}, {"annotations", json::array()} }})} }); @@ -726,7 +1035,7 @@ std::vector SseEmitter::emit_finish(int completion_tokens, {"created_at", created_at_}, {"status", "completed"}, {"model", model_name_}, {"output", final_output}, - {"output_text", accumulated_content_}, + {"output_text", responses_streamed_text}, {"usage", resp_usage} }; out.push_back(format_responses_event("response.completed", {{"response", shell}})); diff --git a/server/src/server/sse_emitter.h b/server/src/server/sse_emitter.h index 4710b8d45..79c711e02 100644 --- a/server/src/server/sse_emitter.h +++ b/server/src/server/sse_emitter.h @@ -54,13 +54,27 @@ nlohmann::json 
build_timings_json(const GenTimings & t, int completion_tokens); // Manages SSE streaming for a single request. class SseEmitter { public: + // `initial_mode` defaults to CONTENT for backward compatibility. Pass + // StreamMode::REASONING when the chat-template prompt suffix pre-opens + // a `` block (Qwen3.6 / Laguna enable_thinking path): the + // model's first generated token is reasoning, never preceded by an + // explicit `` opener in the stream. Without this hint the + // emitter would route reasoning text to the content channel and + // reasoning_content would stay empty. + // + // Note: the leading-`` strip guard (`checked_think_prefix_`) + // remains active when we start in REASONING mode — if the model + // *does* emit a redundant `` opener anyway, the guard still + // strips it. Pre-setting checked_think_prefix_=true here would let a + // duplicate `` leak into reasoning_text in that edge case. SseEmitter(ApiFormat format, const std::string & request_id, const std::string & model_name, int prompt_tokens, const json & tools, ToolMemory * tool_memory, - const std::vector & stop_sequences = {}); + const std::vector & stop_sequences = {}, + StreamMode initial_mode = StreamMode::CONTENT); // Emit the initial SSE events (role delta, message_start, etc.) // Returns the formatted SSE strings to send. @@ -145,6 +159,21 @@ class SseEmitter { StreamMode mode_; std::string window_; // holdback buffer std::string tool_buffer_; // accumulated tool text + // True when TOOL_BUFFER was entered via Pattern B (plain-text + // `call:{` opener) rather than Pattern A (XML envelope: + // `` / ``). Set at the + // CONTENT→TOOL_BUFFER transition in emit_token(). Drives two + // divergent behaviors at emit_finish(): + // 1. malformed-parse branch: Pattern A drops the buffer + // (XML envelopes are not user-facing prose); Pattern B + // flushes the buffer back to accumulated_content_ so the + // literal `call:foo{...` span stays caller-visible. + // 2. 
Responses-format finalization events (.output_text.done / + // .content_part.done / .completed): Pattern B includes the + // raw call span in the streamed-text snapshot used for + // these events, while accumulated_text() continues to + // return the stripped (post-hoist) text. + bool tool_open_is_plain_text_ = false; std::string accumulated_content_; std::string accumulated_raw_; // all raw text for tool memory std::string reasoning_text_; diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp index 1ddbf2d1f..07db573e3 100644 --- a/server/test/test_server_unit.cpp +++ b/server/test/test_server_unit.cpp @@ -1045,6 +1045,268 @@ static void test_emitter_anthropic_thinking_blocks() { TEST_ASSERT(!em.accumulated_text().empty()); } +// ═══════════════════════════════════════════════════════════════════════ +// CONTENT-mode plain-text `call:{...}` tool-call hoist +// +// Regression coverage for the finalize-pass branch that runs +// parse_tool_calls over accumulated_content_ when the stream stayed +// in CONTENT mode (Gemma4-style `call:foo{...}` plain text — no XML +// envelope to trip TOOL_BUFFER). Without this branch the emitter +// returns finish_reason="stop" / stop_reason="end_turn" and never +// emits a tool_use block, breaking forge / agent_recorded. +// ═══════════════════════════════════════════════════════════════════════ + +static void test_emitter_content_mode_plain_text_call_parsed() { + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + // Feed enough text past the holdback so the prose flushes into + // accumulated_content_ before finalize. The call: span itself stays + // in accumulated_content_ either way (no XML opener to redirect to + // TOOL_BUFFER); finalize parses and strips it. 
+ em.emit_token("I'll fetch the forecast for you right now: "); + em.emit_token("call:get_weather{\"location\": \"SF\"}"); + em.emit_token(" — let me know what you'd like next."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 1); + if (!em.tool_calls().empty()) { + TEST_ASSERT(em.tool_calls()[0].name == "get_weather"); + auto args = json::parse(em.tool_calls()[0].arguments); + TEST_ASSERT(args["location"] == "SF"); + } + // finish_reason should now be "tool_calls" (drives Anthropic + // stop_reason="tool_use" downstream). + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_no_tools_skips_plain_text_call() { + // Empty tools array: branch is gated on has_request_tools(tools_), + // so the call: text remains as visible content. + auto em = make_emitter(ApiFormat::OPENAI_CHAT); + em.emit_start(); + em.emit_token("I'll fetch the forecast for you right now: "); + em.emit_token("call:get_weather{\"location\": \"SF\"}"); + em.emit_token(" — let me know what you'd like next."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().empty()); + TEST_ASSERT(em.finish_reason() == "stop"); + TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos); +} + +static void test_emitter_content_mode_underscore_prefix_call_parsed() { + // Regression for the `_call:foo{}` SentencePiece artifact (commit + // 004a81b). Parser Pattern 5 sentinel set includes `_`, so the + // verb is captured even with the leading underscore. The emitter + // wiring must surface it the same way as the bare `call:` form. 
+ auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("Sure thing, here is the call you asked for: "); + em.emit_token("_call:get_weather{\"location\": \"NYC\"}"); + em.emit_token(" — happy to refine if needed."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 1); + if (!em.tool_calls().empty()) { + TEST_ASSERT(em.tool_calls()[0].name == "get_weather"); + auto args = json::parse(em.tool_calls()[0].arguments); + TEST_ASSERT(args["location"] == "NYC"); + } + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_no_call_substring_skips_parser() { + // Pure prose with no `call:` substring: the pre-check + // looks_like_plain_text_call short-circuits before parse_tool_calls + // runs. Accumulated text is preserved; finish_reason stays "stop". + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("Sorry, I cannot help with that "); + em.emit_token("specific question today. Please consult a local guide "); + em.emit_token("for the most accurate information."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().empty()); + TEST_ASSERT(em.finish_reason() == "stop"); + TEST_ASSERT(em.accumulated_text().find("Sorry") != std::string::npos); +} + +static void test_emitter_content_mode_mixed_calls_multiple() { + // Multiple back-to-back calls in the same response. Parser + // Pattern 5 sentinel set includes `}` so consecutive invocations + // are captured. Verify the emitter hoists both in emission order. + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("start. "); + em.emit_token("call:get_weather{\"location\": \"A\"} "); + em.emit_token("middle. 
"); + em.emit_token("call:get_weather{\"location\": \"B\"} "); + em.emit_token("end of the message."); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 2); + if (em.tool_calls().size() == 2) { + auto args0 = json::parse(em.tool_calls()[0].arguments); + auto args1 = json::parse(em.tool_calls()[1].arguments); + TEST_ASSERT(args0["location"] == "A"); + TEST_ASSERT(args1["location"] == "B"); + } + TEST_ASSERT(em.finish_reason() == "tool_calls"); + // Codex Q3 residue guard: the stripped accumulated text must NOT + // contain `call:` any more. + TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos); +} + +static void test_emitter_content_mode_malformed_call_dropped() { + // Unbalanced `{`: balanced_braces_end inside parse_tool_calls + // returns npos and the match is dropped. Emitter must not crash + // and the malformed text remains in accumulated_text (no tool + // hoist, no silent strip — caller-visible signal that the model + // produced garbage). + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("Here is the call you wanted with malformed args: "); + em.emit_token("call:get_weather{location: \"unclosed"); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().empty()); + TEST_ASSERT(em.finish_reason() == "stop"); + // Malformed call span is left visible (no strip on parse failure). + TEST_ASSERT(em.accumulated_text().find("call:get_weather") != std::string::npos); +} + +static void test_emitter_content_mode_does_not_double_fire_on_tool_call_xml() { + // Regression guard: a `` XML envelope must continue to + // route through the TOOL_BUFFER path (transition fires inside + // emit_token). The new CONTENT-mode branch sits in an `else if` + // tied to `mode_ == CONTENT` at emit_finish entry, so it cannot + // fire when TOOL_BUFFER handled the call. Verify exactly 1 + // ToolCall is emitted, not 2. 
+ auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("\n" + "\n" + "SF\n" + "\n" + ""); + em.emit_finish(20); + + TEST_ASSERT(em.tool_calls().size() == 1); + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_strips_call_span_from_accumulated_text() { + // Codex Q3 "residue" hazard guard. After a successful hoist, + // accumulated_text() must not contain the `call:` substring (the + // matched span is replaced by cleaned_text). Without this guard + // the OpenAI Chat / Anthropic / Responses final-message shapes + // would echo the call as both literal text AND a tool_use block, + // producing UI double-display. + auto em = make_emitter(ApiFormat::OPENAI_CHAT, weather_tools()); + em.emit_start(); + em.emit_token("prefix prose here. "); + em.emit_token("call:get_weather{\"location\": \"SF\"}"); + em.emit_token(" suffix prose continues to flush the holdback."); + em.emit_finish(20); + + TEST_ASSERT(!em.tool_calls().empty()); + TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("prefix prose") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("suffix prose") != std::string::npos); +} + +static void test_emitter_content_mode_anthropic_emits_tool_use_block() { + // Verify the Anthropic format-specific events fire from the new + // branch (content_block_stop on the open text block, then + // content_block_start tool_use + input_json_delta + content_block_stop). 
+ json tools = json::array(); + tools.push_back({ + {"name", "get_weather"}, + {"description", "weather"}, + {"input_schema", {{"type", "object"}, + {"properties", {{"city", {{"type", "string"}}}}}}} + }); + SseEmitter em(ApiFormat::ANTHROPIC, "req_id", "test-model", 10, + tools, nullptr); + em.emit_start(); + em.emit_token("Let me fetch the data you need from the service: "); + em.emit_token("call:get_weather{\"city\": \"Tokyo\"}"); + em.emit_token(" — back in a moment."); + auto finish = em.emit_finish(20); + std::string s = concat(finish); + + TEST_ASSERT(!em.tool_calls().empty()); + TEST_ASSERT(s.find("\"type\":\"tool_use\"") != std::string::npos); + TEST_ASSERT(s.find("\"name\":\"get_weather\"") != std::string::npos); + TEST_ASSERT(s.find("\"type\":\"input_json_delta\"") != std::string::npos); + TEST_ASSERT(s.find("Tokyo") != std::string::npos); + TEST_ASSERT(s.find("\"stop_reason\":\"tool_use\"") != std::string::npos); +} + +static void test_emitter_content_mode_digit_start_verb_parsed() { + // Cubic PR #329 review: the looks_like_plain_text_call() pre-check + // must accept verbs starting with a digit because the parser's + // re_call_verb_open() regex allows them ([A-Za-z0-9_.:\\-]+). + // A model emitting `call:2nd_pass{...}` with a digit-led verb + // should still trigger the parser sweep. 
+ json tools = json::array(); + tools.push_back({ + {"name", "2nd_pass"}, + {"description", "second pass"}, + {"input_schema", {{"type", "object"}, + {"properties", {{"reason", {{"type", "string"}}}}}}} + }); + SseEmitter em(ApiFormat::OPENAI_CHAT, "req_id", "test-model", 10, + tools, nullptr); + em.emit_start(); + em.emit_token("call:2nd_pass{reason: \"verify\"}"); + em.emit_finish(5); + + TEST_ASSERT(em.tool_calls().size() == 1); + if (!em.tool_calls().empty()) { + TEST_ASSERT(em.tool_calls()[0].name == "2nd_pass"); + } + TEST_ASSERT(em.finish_reason() == "tool_calls"); +} + +static void test_emitter_content_mode_responses_done_uses_pre_strip_text() { + // Cubic PR #329 review: the Responses-format finalization events + // (.output_text.done / .content_part.done / .completed) must + // reflect the text that was streamed in earlier .delta events, + // not the post-strip text. Otherwise a streaming client's + // accumulated buffer (built from .delta events) disagrees with + // the server's claimed .done payload. + // + // accumulated_text() (consumed by non-streaming response builders) + // still returns the stripped version so the non-streaming response + // shape doesn't carry both text AND tool_use for the same span. + json tools = json::array(); + tools.push_back({ + {"name", "get_weather"}, + {"description", "weather"}, + {"input_schema", {{"type", "object"}, + {"properties", {{"city", {{"type", "string"}}}}}}} + }); + SseEmitter em(ApiFormat::RESPONSES, "req_id", "test-model", 10, + tools, nullptr); + em.emit_start(); + em.emit_token("Looking up: "); + em.emit_token("call:get_weather{\"city\": \"Tokyo\"}"); + em.emit_token(" done."); + auto finish = em.emit_finish(20); + std::string s = concat(finish); + + TEST_ASSERT(em.tool_calls().size() == 1); + // Streaming .done events must include the raw call text (matching + // what the .delta events already sent). 
+ TEST_ASSERT(s.find("response.output_text.done") != std::string::npos); + TEST_ASSERT(s.find("call:get_weather") != std::string::npos); + // Non-streaming accessor returns the stripped text. + TEST_ASSERT(em.accumulated_text().find("call:") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("Looking up:") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("done.") != std::string::npos); +} + // ═══════════════════════════════════════════════════════════════════════ // Stop sequences tests // ═══════════════════════════════════════════════════════════════════════ @@ -1232,7 +1494,7 @@ static void test_pflash_config_defaults() { ServerConfig cfg; TEST_ASSERT(cfg.pflash_mode == ServerConfig::PflashMode::OFF); TEST_ASSERT(cfg.pflash_threshold == 32000); - TEST_ASSERT(cfg.pflash_keep_ratio > 0.04f && cfg.pflash_keep_ratio < 0.06f); + TEST_ASSERT(cfg.pflash_keep_ratio > 0.09f && cfg.pflash_keep_ratio < 0.11f); TEST_ASSERT(cfg.pflash_drafter_path.empty()); TEST_ASSERT(!cfg.pflash_skip_park); TEST_ASSERT(cfg.draft_residency == DraftResidencyPolicy::Auto); @@ -1541,11 +1803,11 @@ static void test_jinja_render_basic() { {"system", "you are helpful", ""}, {"user", "hi", ""}, }; - std::string out = render_chat_template_jinja( + auto out = render_chat_template_jinja( MINI_JINJA_TEMPLATE, msgs, /*bos=*/"", /*eos=*/"", /*add_gen=*/true, /*think=*/false, - /*tools=*/""); + /*tools=*/"").text; TEST_ASSERT(out.find("<|system|>you are helpful") != std::string::npos); TEST_ASSERT(out.find("<|user|>hi") != std::string::npos); TEST_ASSERT(out.find("<|assistant|>") != std::string::npos); @@ -1553,9 +1815,9 @@ static void test_jinja_render_basic() { static void test_jinja_render_no_gen_prompt() { std::vector msgs = {{"user", "ping", ""}}; - std::string out = render_chat_template_jinja( + auto out = render_chat_template_jinja( MINI_JINJA_TEMPLATE, msgs, "", "", - /*add_gen=*/false, /*think=*/false, ""); + /*add_gen=*/false, /*think=*/false, "").text; 
TEST_ASSERT(out.find("<|user|>ping") != std::string::npos); TEST_ASSERT(out.find("<|assistant|>") == std::string::npos); } @@ -1567,8 +1829,8 @@ static void test_jinja_render_tools_injected() { "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}"; std::vector msgs = {{"user", "?", ""}}; std::string tools = R"([{"name":"my_tool","description":"test"}])"; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, tools); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, tools).text; TEST_ASSERT(out.find("TOOLS_PRESENT:my_tool") != std::string::npos); } @@ -1577,8 +1839,8 @@ static void test_jinja_render_empty_tools_skipped() { static const char TPL[] = "{%- if tools -%}TOOLS_PRESENT{%- else -%}NO_TOOLS{%- endif -%}"; std::vector msgs = {{"user", "?", ""}}; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, "[]"); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, "[]").text; TEST_ASSERT(out.find("NO_TOOLS") != std::string::npos); TEST_ASSERT(out.find("TOOLS_PRESENT") == std::string::npos); } @@ -1587,8 +1849,8 @@ static void test_jinja_render_bos_eos_threaded() { // {{ bos_token }} and {{ eos_token }} must reach the template. static const char TPL[] = "{{ bos_token }}HI{{ eos_token }}"; std::vector msgs; - std::string out = render_chat_template_jinja( - TPL, msgs, "", "", false, false, ""); + auto out = render_chat_template_jinja( + TPL, msgs, "", "", false, false, "").text; TEST_ASSERT(out == "HI"); } @@ -1616,6 +1878,345 @@ static void test_jinja_render_bad_tools_json_throws() { TEST_ASSERT(threw); } +// ─── started_in_thinking provenance ───────────────────────────────────── +// +// Regression suite for the Qwen3.6 / Laguna think-mode channel-routing +// bug: the rendered prompt suffix pre-opens `` so the model +// starts emitting reasoning tokens with no explicit opener. 
Callers +// route PromptRenderResult.started_in_thinking → SseEmitter initial +// mode so reasoning text lands in reasoning_content, not content. + +static void test_chat_template_qwen3_enable_thinking_pre_opens() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(result.started_in_thinking); + // Sanity: rendered suffix ends with `\n` per the Qwen3.6 + // chat_template.jinja's enable_thinking branch. + TEST_ASSERT(result.text.size() >= 8); + TEST_ASSERT(result.text.compare(result.text.size() - 8, 8, "\n") == 0); +} + +static void test_chat_template_qwen3_disable_thinking_does_not_pre_open() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); + // The disabled branch emits `\n\n\n\n` — closes + // immediately, so the reasoning channel is NOT left open. + TEST_ASSERT(result.text.find("") != std::string::npos); +} + +static void test_chat_template_qwen3_no_gen_prompt_does_not_pre_open() { + // Without add_generation_prompt the assistant turn isn't appended + // and there's nothing to pre-open. 
+ std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/false, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); +} + +static void test_chat_template_laguna_enable_thinking_pre_opens() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(result.started_in_thinking); + TEST_ASSERT(result.text.size() >= 7); + TEST_ASSERT(result.text.compare(result.text.size() - 7, 7, "") == 0); +} + +static void test_chat_template_laguna_disable_thinking_does_not_pre_open() { + std::vector msgs = {{"user", "hi", ""}}; + auto result = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!result.started_in_thinking); +} + +static void test_chat_template_gemma4_does_not_pre_open() { + // Gemma4's reasoning channel is opened by the model's `<|channel>` + // token (which http_server forwards into the emitter as ``). + // The prompt itself never pre-opens `` regardless of + // enable_thinking, so started_in_thinking must stay false. + std::vector msgs = {{"user", "hi", ""}}; + auto enabled = render_chat_template(msgs, ChatFormat::GEMMA4, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT(!enabled.started_in_thinking); + auto disabled = render_chat_template(msgs, ChatFormat::GEMMA4, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!disabled.started_in_thinking); +} + +// Jinja path: suffix-sniff detection. The renderer should set +// started_in_thinking=true when the rendered prompt ends with `` +// (optionally followed by whitespace) AND enable_thinking is honored. 
+static void test_jinja_render_suffix_sniff_sets_started_in_thinking() { + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>{%- if enable_thinking -%}\n{%- endif -%}" + "{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, ""); + TEST_ASSERT(r.started_in_thinking); +} + +static void test_jinja_render_suffix_sniff_negative() { + // Template doesn't end with `` → started_in_thinking=false + // even with enable_thinking=true. + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}<|assistant|>{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/true, ""); + TEST_ASSERT(!r.started_in_thinking); +} + +// Jinja path: the sniff is the source of truth for started_in_thinking, +// not the enable_thinking flag. If a template hard-codes `` despite +// enable_thinking=false (custom template, model-card mismatch, etc.) we +// still need to route the model's first tokens to the reasoning channel +// or they'll leak into content. The renderer logs a [WARN] in that case +// (verified manually; we don't capture stderr in this test). +static void test_jinja_render_suffix_sniff_overrides_enable_thinking_flag() { + // Template hardcodes `` regardless of enable_thinking. 
+ static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}<|assistant|>\n{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/true, /*think=*/false, ""); + // Even though enable_thinking=false, the rendered prompt ends with + // `` so started_in_thinking must be true to avoid routing + // reasoning tokens into the content channel. + TEST_ASSERT(r.started_in_thinking); +} + +// Jinja path: sniff still requires add_generation_prompt — without it the +// rendered prompt is a transcript, not a continuation, and any embedded +// `` in past turns shouldn't claim the channel is pre-opened. +static void test_jinja_render_suffix_sniff_requires_add_generation_prompt() { + static const char TPL[] = + "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}" + "{%- if add_generation_prompt -%}<|assistant|>\n{%- endif -%}"; + std::vector msgs = {{"user", "?", ""}}; + auto r = render_chat_template_jinja( + TPL, msgs, "", "", /*add_gen=*/false, /*think=*/true, ""); + TEST_ASSERT(!r.started_in_thinking); +} + +// ─── SseEmitter initial_mode=REASONING ────────────────────────────────── +// +// Regression: when constructed with initial_mode=REASONING (the +// Qwen3.6/Laguna enable_thinking path), the emitter must route the +// model's first generated tokens to reasoning_content until a natural +// `` is seen, even though no explicit `` opener appears +// in the stream. + +static void test_emitter_initial_mode_reasoning_routes_to_reasoning_content() { + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-1", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + + // Model emits reasoning tokens directly with no leading `` + // (because the prompt suffix already opened the channel), then + // closes with `` and emits the answer. 
+ em.emit_token("alpha "); + em.emit_token("beta "); + em.emit_token("\n\nAnswer: 4"); + em.emit_finish(4); + + TEST_ASSERT(em.reasoning_text().find("alpha") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("beta") != std::string::npos); + // No spurious tag leaked into reasoning or content. + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("Answer: 4") != std::string::npos); +} + +static void test_emitter_initial_mode_reasoning_unclosed_stays_reasoning() { + // No close — everything stays in reasoning_content, content + // stays empty. Matches parse_reasoning(started_in_thinking=true) + // behavior for the non-streaming path. + SseEmitter em(ApiFormat::OPENAI_CHAT, "req-2", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + em.emit_token("still thinking"); + em.emit_token(" more thinking"); + em.emit_finish(3); + + TEST_ASSERT(em.reasoning_text().find("still thinking") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("more thinking") != std::string::npos); + TEST_ASSERT(em.accumulated_text().empty()); +} + +static void test_emitter_initial_mode_reasoning_strips_redundant_think_opener() { + // Edge case: prompt pre-opened , but the model also emits a + // leading anyway (template/model-card mismatch). The + // emitter's strip guard (checked_think_prefix_) must still trip + // because we deliberately leave it at its default (false) in the + // constructor — otherwise the duplicate opener would leak into + // reasoning_text. 
+ SseEmitter em(ApiFormat::OPENAI_CHAT, "req-3", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + em.emit_start(); + em.emit_token("actual reasoninganswer"); + em.emit_finish(3); + + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("actual reasoning") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("answer") != std::string::npos); +} + +static void test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking() { + // Anthropic format: when starting in REASONING mode, the very first + // content_block_start must be `thinking`, not `text`. Otherwise the + // emitter would open a text block, then have to stop+restart it as + // thinking on the first reasoning delta — wasteful and visible to + // SDK clients as a spurious empty text block. + SseEmitter em(ApiFormat::ANTHROPIC, "req-4", "test-model", 10, + json::array(), nullptr, + /*stops=*/{}, + StreamMode::REASONING); + auto start = em.emit_start(); + std::string all; + for (const auto & c : start) all += c; + // First content block must be a thinking block. nlohmann::json sorts + // keys alphabetically on dump(), so the inner block serializes as + // `{"thinking":"","type":"thinking"}` (NOT type-first). Assert on + // the unique `"thinking":""` opener which only appears in the + // thinking-kind serialization. + TEST_ASSERT(all.find("\"thinking\":\"\",\"type\":\"thinking\"") + != std::string::npos); + // And the initial text-block opener must NOT appear (regression: if + // active_kind_ defaulted to "text", emit_start would have emitted + // `{"text":"","type":"text"}` here instead). 
+ TEST_ASSERT(all.find("\"text\":\"\",\"type\":\"text\"") + == std::string::npos); +} + +// ─── Integration: render_chat_template → SseEmitter wiring ────────────── +// +// The original bug was an integration gap: render_chat_template correctly +// reported started_in_thinking=true, but no caller routed it into the +// SseEmitter's initial_mode, so reasoning text leaked into content and +// reasoning_content stayed empty. Each end of the wire has its own unit +// tests above; these chain the two ends so a future refactor that drops +// the propagation cannot pass without an assertion failure here. +// +// The body mirrors the production wiring in +// server/src/server/http_server.cpp (the `started_in_thinking → +// initial_mode → SseEmitter` chain). Keep these in sync if that wiring +// moves. + +static void test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning() { + std::vector msgs = {{"user", "What is 2+2?", ""}}; + auto render = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT_MSG(render.started_in_thinking, + "renderer end of wire: QWEN3 enable_thinking must pre-open "); + + const StreamMode initial_mode = render.started_in_thinking + ? StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-q", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Let me compute. 
"); + em.emit_token("2+2 equals 4."); + em.emit_token("\n\nThe answer is 4."); + em.emit_finish(5); + + TEST_ASSERT_MSG(!em.reasoning_text().empty(), + "wiring broken: reasoning_content empty despite started_in_thinking=true"); + TEST_ASSERT(em.reasoning_text().find("Let me compute") != std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT(em.reasoning_text().find("") == std::string::npos); + TEST_ASSERT_MSG(em.accumulated_text().find("Let me compute") == std::string::npos, + "wiring broken: reasoning text leaked into content channel"); + TEST_ASSERT(em.accumulated_text().find("The answer is 4") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); +} + +static void test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning() { + std::vector msgs = {{"user", "Solve 7*8.", ""}}; + auto render = render_chat_template(msgs, ChatFormat::LAGUNA, + /*add_gen=*/true, + /*enable_thinking=*/true, + /*tools=*/""); + TEST_ASSERT_MSG(render.started_in_thinking, + "renderer end of wire: LAGUNA enable_thinking must pre-open "); + + const StreamMode initial_mode = render.started_in_thinking + ? 
StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-l", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Working through it: "); + em.emit_token("7*8 = 56."); + em.emit_token("\n\n56."); + em.emit_finish(4); + + TEST_ASSERT_MSG(!em.reasoning_text().empty(), + "wiring broken: reasoning_content empty despite started_in_thinking=true"); + TEST_ASSERT(em.reasoning_text().find("Working through it") != std::string::npos); + TEST_ASSERT_MSG(em.accumulated_text().find("Working through it") == std::string::npos, + "wiring broken: reasoning text leaked into content channel"); + TEST_ASSERT(em.accumulated_text().find("56.") != std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); + TEST_ASSERT(em.accumulated_text().find("") == std::string::npos); +} + +static void test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content() { + // Inverse direction: when enable_thinking=false the renderer must not + // pre-open and the emitter must start in CONTENT, so the model's + // tokens land in content from the first byte. Guards against the + // opposite regression of unconditionally starting in REASONING. + std::vector msgs = {{"user", "Hi.", ""}}; + auto render = render_chat_template(msgs, ChatFormat::QWEN3, + /*add_gen=*/true, + /*enable_thinking=*/false, + /*tools=*/""); + TEST_ASSERT(!render.started_in_thinking); + + const StreamMode initial_mode = render.started_in_thinking + ? 
StreamMode::REASONING : StreamMode::CONTENT; + SseEmitter em(ApiFormat::OPENAI_CHAT, "rid-n", "test-model", 10, + json::array(), nullptr, /*stops=*/{}, initial_mode); + em.emit_start(); + em.emit_token("Hello there."); + em.emit_finish(2); + + TEST_ASSERT(em.reasoning_text().empty()); + TEST_ASSERT(em.accumulated_text().find("Hello there") != std::string::npos); +} + + static void test_normalize_responses_tool_followup_messages() { ToolMemory tool_memory; const std::string call_id = "call_exec_001"; @@ -1795,7 +2396,6 @@ struct MockLayerSplitAdapter : LayerSplitAdapter { std::vector emitted_tokens; bool dflash_enabled = false; bool dflash_called = false; - bool sampling_enabled = false; int shutdown_calls = 0; ModelBackend::CompressRequest last_compress_req; int prefill_chunk = 0; @@ -1832,7 +2432,6 @@ struct MockLayerSplitAdapter : LayerSplitAdapter { return true; } bool can_dflash_decode() const override { return dflash_enabled; } - bool supports_cpu_sampling() const override { return sampling_enabled; } bool decode_dflash(const std::vector & prompt, int base_pos, int last_tok, int n_gen, std::vector & out_tokens, const DaemonIO & io, float & accept_rate_out) override { @@ -3229,8 +3828,11 @@ static void test_props_budget_envelope_shape() { TEST_ASSERT(body["model_card"]["max_tokens"].get() == 32768); TEST_ASSERT(be["default_max_tokens"].get() == 16000); - // Sanity: props_schema bumped to 2 (breaking change). - TEST_ASSERT(body["server"]["props_schema"].get() == 2); + // Sanity: props_schema bumped to 4 (schema 4 added the top-level + // `host` block over schema 3; schema 3 over 2 added `build` and + // `model.target`/`model.draft`. All additive but the bump + // propagates so consumers can negotiate.) 
+ TEST_ASSERT(body["server"]["props_schema"].get() == 4); } // ─── /props.runtime captures full config (§4.16) ────────────────────── @@ -3280,6 +3882,257 @@ static void test_props_runtime_shape() { TEST_ASSERT(body["runtime"]["draft_device"].is_null()); } +// ─── /props.build block (schema 3) ──────────────────────────────────── +// The new structured replacement for the single-string `build_info`. +// Always emitted; image_* fields are null when the binary isn't running +// in a Docker image (no /opt/lucebox-hub/IMAGE_INFO baked in). +static void test_props_build_block_shape_no_image_info() { + ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + // image_info default = null → image_* fields stay null. + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("build")); + const json & b = body["build"]; + // Stable identity always populated. + TEST_ASSERT(b["server_name"].get() == "luce-dflash"); + TEST_ASSERT(b["server_version"].is_string()); + TEST_ASSERT(b["props_schema"].get() == 4); + // Image-baked fields null in the no-IMAGE_INFO case. + TEST_ASSERT(b["git_sha"].is_null()); + TEST_ASSERT(b["image_tag"].is_null()); + TEST_ASSERT(b["image_digest"].is_null()); + TEST_ASSERT(b["build_time"].is_null()); + + // Legacy build_info still present for back-compat readers. + TEST_ASSERT(body.contains("build_info")); + TEST_ASSERT(body["build_info"].get().find("props_schema=4") + != std::string::npos); +} + +// ─── /props.host (schema 4) ─────────────────────────────────────────── +// Verbatim pass-through of the JSON written by entrypoint.sh to +// /opt/lucebox-hub/HOST_INFO. Surfaces /props.host so luce-bench's +// snapshot subcommand can capture the rig identity alongside the run. 
+// `null` when ServerConfig.host_info was not populated (bare-metal +// dev builds that bypass the container entrypoint). +static void test_props_host_block_present_when_populated() { + ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + cfg.host_info = json::object({ + {"os_pretty", "Ubuntu 22.04.3 LTS"}, + {"kernel", "6.6.87.2-microsoft-standard-WSL2"}, + {"wsl_version", "wsl2"}, + {"docker_version", "29.1.3"}, + {"nvidia_driver", "596.36"}, + {"nvidia_ctk_version", "1.16.2"}, + {"cpu_model", "Intel(R) Core(TM) Ultra 9 275HX"}, + {"nproc", 24}, + {"ram_gb", 64}, + {"gpus", json::array({ + json::object({ + {"index", 0}, + {"uuid", "GPU-abc"}, + {"pci_bus_id", "00000000:01:00.0"}, + {"name", "NVIDIA GeForce RTX 5090 Laptop GPU"}, + {"sm", "12.0"}, + {"vram_gb", 24}, + {"power_limit_w", 175}, + }), + })}, + {"cuda_visible_devices", "0"}, + {"source", "lucebox.sh"}, + {"collector", "lucebox.sh"}, + {"collected_at", "2026-05-28T20:31:42Z"}, + }); + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("host")); + TEST_ASSERT(!body["host"].is_null()); + const json & h = body["host"]; + TEST_ASSERT(h["os_pretty"].get() == "Ubuntu 22.04.3 LTS"); + TEST_ASSERT(h["wsl_version"].get() == "wsl2"); + TEST_ASSERT(h["nvidia_ctk_version"].get() == "1.16.2"); + TEST_ASSERT(h["source"].get() == "lucebox.sh"); + TEST_ASSERT(h["gpus"].is_array()); + TEST_ASSERT(h["gpus"].size() == 1); + TEST_ASSERT(h["gpus"][0]["name"].get() + == "NVIDIA GeForce RTX 5090 Laptop GPU"); + TEST_ASSERT(h["gpus"][0]["vram_gb"].get() == 24); +} + +static void test_props_host_block_null_when_missing() { + // ServerConfig.host_info default = null → /props.host emits JSON null. 
+ ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + // cfg.host_info stays at its default nullptr. + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body.contains("host")); + TEST_ASSERT(body["host"].is_null()); + // /props.server.props_schema reflects the schema-4 bump regardless. + TEST_ASSERT(body["server"]["props_schema"].get() == 4); +} + +static void test_props_build_block_with_image_info() { + ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + cfg.image_info = json::object({ + {"git_sha", "6d12378"}, + {"image_tag", "sha-6d12378-cuda12"}, + {"build_time", "2026-05-28T13:43:57Z"}, + }); + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + const json & b = body["build"]; + TEST_ASSERT(b["git_sha"].get() == "6d12378"); + TEST_ASSERT(b["image_tag"].get() == "sha-6d12378-cuda12"); + TEST_ASSERT(b["build_time"].get() == "2026-05-28T13:43:57Z"); + // image_digest is reserved for external population; still null. + TEST_ASSERT(b["image_digest"].is_null()); +} + +// ─── /props.model.target + /props.model.draft (schema 3) ────────────── +// Verbatim GGUF identity surfaced under model.target / model.draft. +// `draft` is null when no draft GGUF is loaded; the legacy +// `model.draft_path` string stays alongside for back-compat readers. 
+static void test_props_model_target_draft_shape() { + ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "Qwen3.6 27B"}, + {"source", "https://huggingface.co/Qwen/Qwen3.6-27B"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + cfg.draft_path = "/opt/models/dflash-draft-3.6-q4_k_m.gguf"; + cfg.target_gguf = json::object({ + {"path", "/opt/models/Qwen3.6-27B-Q4_K_M.gguf"}, + {"size_bytes", int64_t(17134510080)}, + {"sha256", "abc123def456" + std::string(52, '0')}, + {"gguf", { + {"general.architecture", "qwen35"}, + {"general.name", "Qwen3.6-27B"}, + {"general.file_type", 15}, + {"general.file_type_name", "Q4_K_M"}, + {"general.quantization_version", 2}, + {"block_count", 64}, + {"embedding_length", 5120}, + {"context_length", 65536}, + {"vocab_size", 152064}, + }}, + }); + cfg.draft_gguf = json::object({ + {"path", "/opt/models/dflash-draft-3.6-q4_k_m.gguf"}, + {"size_bytes", int64_t(425000000)}, + {"sha256", "deadbeef" + std::string(56, '0')}, + {"gguf", { + {"general.architecture", "qwen3"}, + {"general.name", "Qwen3-0.6B-DFlash-draft"}, + {"general.file_type", 15}, + {"general.file_type_name", "Q4_K_M"}, + {"general.quantization_version", 2}, + {"block_count", 28}, + {"embedding_length", 1024}, + {"context_length", 32768}, + {"vocab_size", 152064}, + }}, + }); + + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + const json & m = body["model"]; + + // arch + back-compat fields preserved. + TEST_ASSERT(m["arch"].get() == "qwen35"); + TEST_ASSERT(m["alias"].get() == cfg.model_name); + TEST_ASSERT(m["draft_path"].get() == + "/opt/models/dflash-draft-3.6-q4_k_m.gguf"); + + // target: required, never null when GGUF is loaded. 
+ TEST_ASSERT(!m["target"].is_null()); + const json & tgt = m["target"]; + TEST_ASSERT(tgt["path"].get() == + "/opt/models/Qwen3.6-27B-Q4_K_M.gguf"); + TEST_ASSERT(tgt["size_bytes"].get() == int64_t(17134510080)); + TEST_ASSERT(tgt["sha256"].get().size() == 64); + TEST_ASSERT(tgt["gguf"]["general.architecture"].get() == "qwen35"); + TEST_ASSERT(tgt["gguf"]["general.file_type_name"].get() == "Q4_K_M"); + TEST_ASSERT(tgt["gguf"]["context_length"].get() == 65536); + TEST_ASSERT(tgt["gguf"]["vocab_size"].get() == 152064); + + // draft: required key, populated when --draft was passed. + TEST_ASSERT(!m["draft"].is_null()); + TEST_ASSERT(m["draft"]["path"].get() == + "/opt/models/dflash-draft-3.6-q4_k_m.gguf"); + TEST_ASSERT(m["draft"]["gguf"]["general.architecture"].get() == "qwen3"); +} + +static void test_props_model_draft_null_when_target_only() { + // laguna / qwen3.6-moe configs run target-only: model.draft is JSON + // null (NOT omitted), so consumers can distinguish "feature absent" + // from "field not in this schema version". + ServerConfig cfg = make_props_config_with_sidecar(json{ + {"name", "qwen3.6-moe-test"}, + {"source", "https://huggingface.co/test"}, + {"verified_at", "2026-05-23"}, + {"max_tokens", 32768}, + }); + cfg.draft_path = ""; // no --draft + cfg.target_gguf = json::object({ + {"path", "/opt/models/qwen3.6-moe.gguf"}, + {"size_bytes", int64_t(18000000000)}, + {"sha256", nullptr}, + {"gguf", { + {"general.architecture", "qwen35moe"}, + {"general.name", "Qwen3.6-35B-A3B"}, + }}, + }); + // draft_gguf left at default (null). + + Tokenizer tok; + PrefixCache pc(0, tok); + ToolMemory tm; + json body = build_props_body(cfg, pc, tm); + + TEST_ASSERT(body["model"].contains("draft")); + TEST_ASSERT(body["model"]["draft"].is_null()); + TEST_ASSERT(body["model"]["draft_path"].is_null()); // legacy field too + // target still populated. 
+ TEST_ASSERT(!body["model"]["target"].is_null()); + TEST_ASSERT(body["model"]["target"]["gguf"]["general.architecture"] + .get() == "qwen35moe"); +} + // ═══════════════════════════════════════════════════════════════════════ // usage.timings — per-request prefill / decode wall-clock breakdown // surfaced under usage.timings (spec §6.3). Tests cover all three @@ -3903,6 +4756,23 @@ int main() { RUN_TEST(test_emitter_streaming_openai_has_done); RUN_TEST(test_emitter_nonstreaming_accumulates); RUN_TEST(test_emitter_anthropic_thinking_blocks); + RUN_TEST(test_emitter_initial_mode_reasoning_routes_to_reasoning_content); + RUN_TEST(test_emitter_initial_mode_reasoning_unclosed_stays_reasoning); + RUN_TEST(test_emitter_initial_mode_reasoning_strips_redundant_think_opener); + RUN_TEST(test_emitter_initial_mode_reasoning_anthropic_first_block_is_thinking); + + std::fprintf(stderr, "\n── CONTENT-mode plain-text call:{} ──\n"); + RUN_TEST(test_emitter_content_mode_plain_text_call_parsed); + RUN_TEST(test_emitter_content_mode_no_tools_skips_plain_text_call); + RUN_TEST(test_emitter_content_mode_underscore_prefix_call_parsed); + RUN_TEST(test_emitter_content_mode_no_call_substring_skips_parser); + RUN_TEST(test_emitter_content_mode_mixed_calls_multiple); + RUN_TEST(test_emitter_content_mode_malformed_call_dropped); + RUN_TEST(test_emitter_content_mode_does_not_double_fire_on_tool_call_xml); + RUN_TEST(test_emitter_content_mode_strips_call_span_from_accumulated_text); + RUN_TEST(test_emitter_content_mode_anthropic_emits_tool_use_block); + RUN_TEST(test_emitter_content_mode_digit_start_verb_parsed); + RUN_TEST(test_emitter_content_mode_responses_done_uses_pre_strip_text); std::fprintf(stderr, "\n── Stop sequences ──\n"); RUN_TEST(test_stop_sequence_basic); @@ -3965,6 +4835,19 @@ int main() { RUN_TEST(test_jinja_render_bos_eos_threaded); RUN_TEST(test_jinja_render_empty_template_throws); RUN_TEST(test_jinja_render_bad_tools_json_throws); + 
RUN_TEST(test_chat_template_qwen3_enable_thinking_pre_opens); + RUN_TEST(test_chat_template_qwen3_disable_thinking_does_not_pre_open); + RUN_TEST(test_chat_template_qwen3_no_gen_prompt_does_not_pre_open); + RUN_TEST(test_chat_template_laguna_enable_thinking_pre_opens); + RUN_TEST(test_chat_template_laguna_disable_thinking_does_not_pre_open); + RUN_TEST(test_chat_template_gemma4_does_not_pre_open); + RUN_TEST(test_jinja_render_suffix_sniff_sets_started_in_thinking); + RUN_TEST(test_jinja_render_suffix_sniff_negative); + RUN_TEST(test_jinja_render_suffix_sniff_overrides_enable_thinking_flag); + RUN_TEST(test_jinja_render_suffix_sniff_requires_add_generation_prompt); + RUN_TEST(test_integration_qwen3_enable_thinking_render_to_emit_routes_to_reasoning); + RUN_TEST(test_integration_laguna_enable_thinking_render_to_emit_routes_to_reasoning); + RUN_TEST(test_integration_qwen3_disable_thinking_render_to_emit_stays_in_content); RUN_TEST(test_normalize_responses_tool_followup_messages); std::fprintf(stderr, "\n── Placement config ──\n"); @@ -3977,7 +4860,6 @@ int main() { RUN_TEST(test_backend_precision_hip_arch_policy); RUN_TEST(test_backend_precision_activation_type_combine); RUN_TEST(test_layer_split_backend_inline_snapshot_and_restore_delta); - RUN_TEST(test_layer_split_backend_sampling_capability_gate); RUN_TEST(test_layer_split_backend_chunks_prefill_by_adapter_limit); RUN_TEST(test_layer_split_compress_nopark_uses_default_drafter_path); RUN_TEST(test_layer_split_compress_rejects_bad_keep_ratio); @@ -4032,6 +4914,12 @@ int main() { RUN_TEST(test_props_model_card_null_on_family_fallback); RUN_TEST(test_props_budget_envelope_shape); RUN_TEST(test_props_runtime_shape); + RUN_TEST(test_props_build_block_shape_no_image_info); + RUN_TEST(test_props_build_block_with_image_info); + RUN_TEST(test_props_model_target_draft_shape); + RUN_TEST(test_props_model_draft_null_when_target_only); + RUN_TEST(test_props_host_block_present_when_populated); + 
RUN_TEST(test_props_host_block_null_when_missing); std::fprintf(stderr, "\n── usage.timings ──\n"); RUN_TEST(test_usage_timings_openai_chat_streaming); diff --git a/share/model_cards/_schema.json b/share/model_cards/_schema.json index 3fc204cb4..d83e70061 100644 --- a/share/model_cards/_schema.json +++ b/share/model_cards/_schema.json @@ -65,6 +65,17 @@ "repetition_penalty": { "type": "number" } } }, + "thinking_control": { + "type": "object", + "description": "Optional. Client-side prompt-level thinking-control tokens. Read by luce-bench when running against providers that ignore the API-side flags (chat_template_kwargs.enable_thinking / thinking / reasoning_effort) — for the Qwen3.x family this is e.g. '/think' and '/no_think'. The token is appended to the last user turn before serialization; see luce-bench/src/lucebench/_thinking.py. Only `user_turn_suffix` is supported in v1.", + "additionalProperties": false, + "required": ["think_prompt_token", "nothink_prompt_token", "injection_point"], + "properties": { + "think_prompt_token": { "type": "string", "minLength": 1 }, + "nothink_prompt_token": { "type": "string", "minLength": 1 }, + "injection_point": { "type": "string", "enum": ["user_turn_suffix"] } + } + }, "reasoning_effort_tiers": { "type": "object", "description": "Optional. Explicit per-tier phase-1 budgets. Overrides any computed default. 
Use when ratio-based defaults don't fit the model.", diff --git a/share/model_cards/laguna-xs.2.json b/share/model_cards/laguna-xs.2.json index bc0dda85a..a4fb86f14 100644 --- a/share/model_cards/laguna-xs.2.json +++ b/share/model_cards/laguna-xs.2.json @@ -3,11 +3,15 @@ "source": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF", "verified_at": "2026-05-24", "download_urls": { - "Q4_K_M": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-Q4_K_M.gguf", - "bf16": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-bf16.gguf" + "Q4_K_M-target": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-Q4_K_M.gguf", + "bf16-target": "https://huggingface.co/Lucebox/Laguna-XS.2-GGUF/resolve/main/laguna-xs2-bf16.gguf", + "DFlash-speculator":"https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash/resolve/main/model.safetensors", + "DFlash-speculator-config":"https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash/resolve/main/config.json" }, - "notes": "Non-reasoning MoE code model (3B active / 33B total). Card does not specify generation params or a complex-problem mode. context: native 4096, PFlash-extended to 131072. Sampling defaults below are code-model-typical (not from card). general.name has not been verified against a loaded GGUF — confirm and rename file if needed.", - "max_tokens": 4096, + "notes": "Reasoning MoE code model (3B active / 33B total). The README and the published Poolside speculator confirm a `<think>` reasoning channel and an `enable_thinking=true` chat-template flag. Native 131072 context. DFlash speculator is published as poolside/Laguna-XS.2-speculator.dflash (safetensors, 5-layer Qwen3-flavored draft: head_dim=128, hidden=2048, q_norm + k_norm). Our existing safetensors loader consumes it directly with dynamic dim inference; bench measured +60% decode rate vs no-draft (125 vs 78 tok/s at temp=0) on bragi.
Sampling defaults are code-model conservative; the upstream card does not pin them.", + "max_tokens": 16384, + "hard_limit_reply_budget": 4096, + "thinking_terminator_hint": "Considering the limited time by the user, I have to give the solution based on the thinking directly now.\n</think>\n\n", "sampling": { "temperature": 0.6, "top_p": 0.95, diff --git a/share/model_cards/qwen3.6-27b.json b/share/model_cards/qwen3.6-27b.json index 94094dddf..5448be8b3 100644 --- a/share/model_cards/qwen3.6-27b.json +++ b/share/model_cards/qwen3.6-27b.json @@ -20,5 +20,10 @@ "high": 32256, "x-high": 56832, "max": 81408 + }, + "thinking_control": { + "think_prompt_token": "/think", + "nothink_prompt_token": "/no_think", + "injection_point": "user_turn_suffix" } } \ No newline at end of file