Luce-Org · dusterbloom · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/configs/CONTRIBUTING.md b/configs/CONTRIBUTING.md
@@ -0,0 +1,178 @@
+# Contributing a New Profile
+
+Thank you for contributing a benchmark result. Follow these five steps exactly.
+
+---
+
+## Step 1 — Run the benchmark and save the log
+
+Run the inference command and capture stdout/stderr to a file:
+
+```bash
+<your binary> <flags> 2>&1 | tee .sisyphus/notes/<experiment>/<your-run>.log
+```
+
+The log file is the ground truth for the numbers in your profile. Without it
+the profile will be rejected.
+
+---
+
+## Step 2 — Create the profile TOML
+
+Copy the template from `configs/profiles/base.toml` or the most similar
+existing profile. Name it `<hw>-<model>-<method>-<ctx>.toml`, e.g.:
+`rtx4090-dense31b-mtp-128k.toml`.
+
+### Required keys
+
+```toml
+extends = "base"          # or another profile stem
+backend = "dflash"        # must match a file in configs/backends/
+
+[hardware]
+gpu = "RTX 4090"
+sm = 89
+
+[model]
+target = "${LUCEBOX_ROOT}/models/your-model.gguf"
+mtp_assistant = "${LUCEBOX_ROOT}/models/your-mtp-assistant.gguf"  # required when spec.method = "mtp"
+# dflash_draft  required when spec.method = "dflash"
+
+[runtime]
+ctx = 131072
+kv_k = "tq3_0"
+kv_v = "tq3_0"
+
+[runtime.spec]
+method = "mtp"      # "none" | "mtp" | "dflash"
+gamma = 2           # required for mtp
+# draft_max = 4     # required for dflash
+
+[expected_floors]
+decode_tok_s = 15.0
+# ttft_ms_max = 80.0
+# prefill_tok_s = 500.0
+
+[provenance]
+source_log = ".sisyphus/notes/<experiment>/<your-run>.log"
+measured_at = "2026-01-15"       # ISO date
+hardware_id = "yourname-rtx4090-linux"
+commit = "abc1234"               # optional git SHA
+```
+
+### Auto-rejection rules (the linter will reject these)
+
+- `provenance.source_log = "<NEEDS_RUN>"` — fill in the real log path
+- Hardcoded `/absolute/paths` anywhere — use `${VAR}/...` or relative paths
+- `spec.method = "mtp"` without `model.mtp_assistant`
+- `spec.method = "dflash"` without `model.dflash_draft`
+- Empty `[expected_floors]` — set at least one floor
+- Missing `[provenance]` section or any of its three required fields
+- `source_log` pointing to a file that does not exist (warning, not error,
+  but reviewers will ask you to provide it)
+
+---
+
+## Step 3 — Lint before submitting
+
+```bash
+python dflash/scripts/config_lint.py --profile <your-profile-stem>
+```
+
+Must exit 0 (warnings about missing binaries are OK).
+
+For strict checking (promotes warnings to errors; Step 5 requires this run to exit 0):
+
+```bash
+python dflash/scripts/config_lint.py --profile <your-profile-stem> --strict
+```
+
+---
+
+## Step 4 — Add or validate the backend
+
+If your profile uses a backend that already exists, skip this step.
+
+To add a backend, create `configs/backends/<name>.toml`:
+
+```toml
+name = "my-backend"      # must match filename stem exactly
+upstream = "https://..."
+build_hint = "..."       # optional build instructions
+
+[binary]
+# exactly one of:
+in_tree = "path/relative/to/git/root"
+# env_var = "MY_BINARY_VAR"
+
+[supports]
+spec_types = ["none", "mtp"]   # which methods this binary supports
+kv_quants = ["q8_0", "tq3_0"]
+
+[flags]
+# map canonical key -> CLI flag string
+model = "--model"
+ctx = "--ctx-size"
+kv_k = "--kv-k"
+kv_v = "--kv-v"
+# if "mtp" in spec_types:
+spec_model = "--mtp"
+spec_gamma = "--gamma"
+# if "dflash" in spec_types:
+# draft_model = "--draft"
+# draft_max  = "--draft-max"
+
+[stdout_parse]
+tok_s   = "eval time.*?([0-9]+\\.[0-9]+) tokens per second"
+ttft_ms = "time to first token.*?([0-9]+\\.[0-9]+) ms"
+```
+
+Backend validation rules:
+- `name` must equal the filename stem
+- Exactly one of `binary.in_tree` or `binary.env_var` must be set
+- All required flags for declared `spec_types` must be present
+
+---
+
+## Step 5 — Open a pull request
+
+Include in the PR body:
+- A snippet from the log file showing the measured tok/s and TTFT
+- The exact hardware (GPU model, driver version, VRAM)
+- The date of measurement
+- Confirmation that `config_lint.py --strict` exits 0
+
+### Disclosure requirement
+
+If any part of the profile, code, or PR description was AI-generated, state
+this explicitly. PRs with AI-generated content that is not disclosed will be
+closed.
+
+---
+
+## Schema reference summary
+
+### Profile keys
+
+| Key | Type | Required | Notes |
+|-----|------|----------|-------|
+| extends | string | yes | parent profile stem or "" for none |
+| backend | string | yes | stem of a file in configs/backends/ |
+| hardware.gpu | string | yes | GPU model name |
+| hardware.sm | int | yes | CUDA SM version (e.g. 86 for Ampere) |
+| model.target | path | yes | main model GGUF |
+| model.mtp_assistant | path | when method=mtp | MTP assistant GGUF |
+| model.dflash_draft | path | when method=dflash | DFlash draft GGUF |
+| runtime.ctx | int | yes | context length in tokens |
+| runtime.kv_k | string | yes | KV cache key quantization |
+| runtime.kv_v | string | yes | KV cache value quantization |
+| runtime.spec.method | string | yes | "none", "mtp", or "dflash" |
+| runtime.spec.gamma | int | when method=mtp | speculative tokens per step |
+| runtime.spec.draft_max | int | when method=dflash | max draft tokens |
+| runtime.flash_attn | bool | no | enable flash attention |
+| runtime.pflash | bool | no | enable pflash (MoE models) |
+| expected_floors | table | yes | at least one floor metric |
+| provenance.source_log | path | yes | path to benchmark log |
+| provenance.measured_at | date | yes | ISO 8601 date |
+| provenance.hardware_id | string | yes | unique hardware identifier |
+| provenance.commit | string | no | git SHA of code under test |
diff --git a/configs/README.md b/configs/README.md
@@ -0,0 +1,73 @@
+# configs — Declarative Inference Profiles
+
+This directory contains declarative TOML profiles and backend definitions for
+running Gemma-4 inference on lucebox-hub. Each profile captures a specific
+(model, context length, speculative decode method, hardware) combination along
+with measured performance floors, so every run is reproducible and comparable.
+
+## Why this exists
+
+Ad-hoc shell commands diverge over time. Profiles make the connection between
+a benchmark log and the exact flags used to produce it explicit and machine-checkable.
+
+## Directory layout
+
+```
+configs/
+  profiles/        — one .toml per (model, ctx, method, hw) combination
+  backends/        — one .toml per inference binary variant
+```
+
+## Quick start
+
+```bash
+# Lint everything (exits 0 if no errors, prints warnings)
+python dflash/scripts/config_lint.py
+
+# Dry-run a profile (validates env, paths, backend; does NOT run inference)
+python dflash/scripts/profile_run.py --profile rtx3090-moe26b-dflash-256k --dry-run
+
+# Print the resolved command (for inspection or shell scripting)
+python dflash/scripts/profile_run.py --profile rtx3090-moe26b-dflash-256k --print-cmd
+
+# Run (execvp — replaces the Python process)
+LUCEBOX_ROOT=/your/root python dflash/scripts/profile_run.py --profile rtx3090-dense31b-mtp-64k
+
+# Override a single field at runtime
+python dflash/scripts/profile_run.py --profile rtx3090-moe26b-dflash-256k \
+    --override runtime.ctx=131072
+
+# Verify a running server meets the floors declared in the profile
+python dflash/scripts/verify_server.py --profile rtx3090-moe26b-dflash-256k \
+    --base-url http://127.0.0.1:8080 --runs 5
+```
+
+## Required environment variables
+
+| Profile | Variable | Purpose |
+|---------|----------|---------|
+| rtx3090-dense31b-mtp-64k | `LUCEBOX_ROOT` | Root containing models/ |
+| rtx3090-moe26b-dflash-256k | `HOME` (auto-set) | Root for ~/models/ paths |
+| rtx3090-moe26b-mtp-1m | `HOME` (auto-set) | Root for ~/models/ paths |
+| llama-upstream backend | `LUCEBOX_LLAMA_BIN` | Path to llama-server or llama-cli |
+
+## Shipped profiles
+
+| Profile | Model | Method | CTX | Measured decode | Floor |
+|---------|-------|--------|-----|-----------------|-------|
+| rtx3090-dense31b-mtp-64k | Gemma-4 31B dense Q4_K_M | MTP γ=2 | 64K | 10.07 tok/s | 9.5 tok/s |
+| rtx3090-moe26b-dflash-256k | Gemma-4 26B-A4B MoE Q4_K_M | DFlash dm=4+pflash | 256K | 67.95 tok/s / 55ms TTFT | 65.0 tok/s / 65ms |
+| rtx3090-moe26b-mtp-1m | Gemma-4 26B-A4B MoE Q4_K_M | MTP γ=2+pflash | 1M | 23.65 tok/s / 108ms TTFT | 22.0 tok/s / 120ms |
+
+All measurements taken on RTX 3090 (24 GB VRAM) running WSL2 (peppi-rtx3090-wsl).
+
+## Backends
+
+| Backend | Binary | Spec methods |
+|---------|--------|-------------|
+| dflash | `dflash/build/test_gemma4_dflash` (in-tree) | none, mtp, dflash |
+| llama-upstream | `$LUCEBOX_LLAMA_BIN` (external) | none |
+
+## Schema reference
+
+See `configs/CONTRIBUTING.md` for the full schema and contribution guide.
diff --git a/configs/backends/dflash.toml b/configs/backends/dflash.toml
@@ -0,0 +1,37 @@
+# dflash backend — in-tree speculative decode binary
+name = "dflash"  # must equal this file's stem (backend validation rule)
+upstream = "https://github.com/dusterbloom/lucebox-hub"
+build_hint = "mkdir -p dflash/build && cd dflash/build && cmake .. -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) test_gemma4_dflash"
+
+[binary]
+in_tree = "dflash/build/test_gemma4_dflash"  # path relative to the git root; the target built by build_hint
+
+[supports]
+spec_types = ["none", "mtp", "dflash"]  # speculative-decode methods this binary supports
+kv_quants = ["q8_0", "tq3_0", "f16"]  # presumably the accepted runtime.kv_k / runtime.kv_v values — confirm against the linter
+
+[flags]
+# Core — each entry maps a canonical key to the CLI flag string
+model = "--model"
+ctx = "--ctx-size"
+kv_k = "--kv-k"
+kv_v = "--kv-v"
+# MTP speculative decode (needed because "mtp" is in spec_types)
+spec_model = "--mtp"
+spec_gamma = "--gamma"
+# DFlash speculative decode (needed because "dflash" is in spec_types)
+draft_model = "--draft"
+draft_max = "--draft-max"
+# Optional
+pflash = "--pflash"
+flash_attn = "--flash-attn"
+temp = "--temp"
+seed = "--seed"
+n_predict = "--n-predict"
+ignore_eos = "--ignore-eos"
+batch = "--batch-size"
+ubatch = "--ubatch-size"
+
+[stdout_parse]
+tok_s = "eval time.*?([0-9]+\\.[0-9]+) tokens per second"  # group 1 = tokens-per-second figure; NOTE(review): also matches a "prompt eval time" line if the binary prints one — confirm
+ttft_ms = "time to first token.*?([0-9]+\\.[0-9]+) ms"  # group 1 = milliseconds figure
diff --git a/configs/backends/llama-upstream.toml b/configs/backends/llama-upstream.toml
@@ -0,0 +1,29 @@
+# llama-upstream backend — external llama.cpp binary, located via an environment variable
+# Set LUCEBOX_LLAMA_BIN to the path of your compiled llama-server or llama-cli.
+name = "llama-upstream"  # must equal this file's stem (backend validation rule)
+upstream = "https://github.com/ggerganov/llama.cpp"
+build_hint = "cmake -B build -DGGML_CUDA=ON && cmake --build build --config Release -t llama-cli"
+
+[binary]
+env_var = "LUCEBOX_LLAMA_BIN"  # name of the environment variable holding the binary path
+
+[supports]
+spec_types = ["none"]  # no speculative decode through this backend, so no spec/draft flags below
+kv_quants = ["q8_0", "f16", "f32"]
+
+[flags]
+model = "--model"
+ctx = "--ctx-size"
+kv_k = "--kv-cache-type-k"
+kv_v = "--kv-cache-type-v"
+flash_attn = "--flash-attn"
+temp = "--temp"
+seed = "--seed"
+n_predict = "--n-predict"
+ignore_eos = "--ignore-eos"
+batch = "--batch-size"
+ubatch = "--ubatch-size"
+
+[stdout_parse]
+tok_s = "(?<!prompt )eval time.*?([0-9]+\\.[0-9]+) tokens per second"  # lookbehind skips the "prompt eval time" (prefill) line so group 1 is the decode rate
+ttft_ms = "prompt eval time.*?([0-9]+\\.[0-9]+) ms"  # prefill duration approximates TTFT; "load time" is model loading, not first-token latency
diff --git a/configs/profiles/base.toml b/configs/profiles/base.toml
@@ -0,0 +1,25 @@
+# Base profile template — all profiles extend this or a child of it.
+# This file is NOT directly runnable; it lacks provenance and measured data.
+extends = ""  # empty string = no parent profile
+backend = "dflash"  # stem of a file in configs/backends/
+
+[hardware]
+gpu = ""  # placeholder — a derived profile supplies the GPU model name
+sm = 0  # placeholder — a derived profile supplies the CUDA SM version (e.g. 86)
+
+[model]
+target = ""  # placeholder — a derived profile supplies the main model GGUF path
+
+[runtime]
+ctx = 4096  # context length in tokens
+kv_k = "q8_0"  # KV cache key quantization
+kv_v = "q8_0"  # KV cache value quantization
+flash_attn = true
+
+[runtime.spec]
+method = "none"  # one of "none", "mtp", "dflash"
+
+[expected_floors]
+# at least one of: decode_tok_s, prefill_tok_s, ttft_ms_max
+
+# [provenance] intentionally absent — fill in when deriving a real profile
diff --git a/configs/profiles/rtx3090-dense31b-mtp-64k.toml b/configs/profiles/rtx3090-dense31b-mtp-64k.toml
@@ -0,0 +1,31 @@
+# RTX 3090 — Gemma-4 Dense 31B + MTP (gamma=2) @ 64K context
+# Measured: 10.07 tok/s decode, acceptance_length=0.73 (+61% over no-MTP)
+extends = "base"  # parent profile stem
+backend = "dflash"  # stem of a file in configs/backends/
+
+[hardware]
+gpu = "RTX 3090"
+sm = 86  # CUDA SM version
+
+[model]  # paths below use ${LUCEBOX_ROOT}, which must be set in the environment
+target = "${LUCEBOX_ROOT}/models/gemma-4-31B-it-Q4_K_M.gguf"
+mtp_assistant = "${LUCEBOX_ROOT}/models/gemma4-mtp-31B/gemma-4-31B-it-assistant.Q4_K_M.gguf"
+
+[runtime]
+ctx = 65536  # 64K tokens
+kv_k = "tq3_0"  # KV cache key quantization
+kv_v = "tq3_0"  # KV cache value quantization
+flash_attn = true
+
+[runtime.spec]
+method = "mtp"  # requires model.mtp_assistant (set above)
+gamma = 2  # speculative tokens per step
+
+[expected_floors]
+decode_tok_s = 9.5  # floor set below the measured 10.07 tok/s
+
+[provenance]
+source_log = ".sisyphus/notes/gemma4-baseline/mtp-gamma/phase4-b/mtp_g2_ctx65536.log"
+measured_at = "2026-05-11"  # ISO 8601 date, kept as a string as in the contribution guide's example
+hardware_id = "peppi-rtx3090-wsl"
+commit = "4bcb972"  # git SHA of the code under test