jtn0123 · jtn0123 · Apr 28, 2026 · Apr 28, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -63,21 +63,28 @@ jobs:
           PERF_COLD_START_SAMPLES: ${{ vars.PERF_COLD_START_SAMPLES || '3' }}
           PERF_COLD_START_TIMEOUT_S: ${{ vars.PERF_COLD_START_TIMEOUT_S || '90' }}
         run: |
+          set -o pipefail
           python -m pytest tests/benchmarks/ --benchmark-only \
             --benchmark-columns=min,mean,median,stddev,ops \
             --benchmark-json=/tmp/bench-current.json -q
           # Compare against CI-cached baseline if available, otherwise
-          # fall back to the repo-committed baseline.
+          # fall back to the repo-committed baseline. Both paths are blocking
+          # so cache misses cannot silently hide PR regressions.
           if [ -f /tmp/bench-baseline.json ]; then
+            echo "Using CI cached benchmark baseline." | tee /tmp/benchmark-compare.txt
+            cp /tmp/bench-baseline.json /tmp/bench-effective-baseline.json
             python scripts/benchmark_compare.py \
-              --baseline /tmp/bench-baseline.json \
-              --current /tmp/bench-current.json
+              --baseline /tmp/bench-effective-baseline.json \
+              --current /tmp/bench-current.json \
+              | tee -a /tmp/benchmark-compare.txt
           else
-            echo "No CI baseline cache found — recording initial baseline."
-            echo "Comparing against repo baseline (cross-platform, informational only):"
+            echo "No CI baseline cache found — using repo baseline as blocking fallback." \
+              | tee /tmp/benchmark-compare.txt
+            cp tests/benchmarks/baseline.json /tmp/bench-effective-baseline.json
             python scripts/benchmark_compare.py \
-              --baseline tests/benchmarks/baseline.json \
-              --current /tmp/bench-current.json || true
+              --baseline /tmp/bench-effective-baseline.json \
+              --current /tmp/bench-current.json \
+              | tee -a /tmp/benchmark-compare.txt
           fi
           # JTN-738: hard performance budgets on plugin render latency and
           # cold-start startup time.
@@ -86,6 +93,16 @@ jobs:
           if [ "${{ github.ref }}" = "refs/heads/main" ]; then
             cp /tmp/bench-current.json /tmp/bench-baseline.json
           fi
+      - name: Upload benchmark artifacts
+        if: always()  # also run after a failed step so a failing gate keeps its evidence
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          path: |
+            /tmp/bench-current.json
+            /tmp/benchmark-compare.txt
+            /tmp/bench-effective-baseline.json
+          if-no-files-found: ignore  # no warning or failure if none of the files exist
 
   shellcheck:
     name: Shell script validation

diff --git a/docs/benchmarking.md b/docs/benchmarking.md
@@ -61,13 +61,14 @@ benchmark's median time exceeds the baseline by more than the configured
 threshold, CI fails.
 
 - **CI baseline**: cached per-OS via GitHub Actions cache. On pushes to `main`,
-  the current run becomes the new baseline for future PRs. On the very first
-  run (no cache), the comparison is informational only (non-blocking).
+  the current run becomes the new baseline for future PRs.
 - **Repo baseline** (`tests/benchmarks/baseline.json`): committed for local
-  development use and as a fallback when no CI cache exists.
+  development use and as a blocking fallback when no CI cache exists.
 - **Threshold**: defaults to +15%. Override via the `BENCHMARK_THRESHOLD_PCT`
   GitHub Actions variable or environment variable.
 - **Comparison script**: `scripts/benchmark_compare.py`
+- **Artifacts**: every PR uploads `benchmark-results` with the current
+  pytest-benchmark JSON, effective baseline, and comparison log for auditing.
 
 The gate runs as part of the `lint` job in `.github/workflows/ci.yml`.
 

diff --git a/docs/mutation_testing.md b/docs/mutation_testing.md
@@ -106,6 +106,20 @@ The job is sharded by package so each package has its own runtime budget:
 Results are uploaded as `mutmut-cache-<shard>` artifacts and can be downloaded
 from the GitHub Actions run summary.
 
+## Narrow PR Mutation Gate
+
+The full `mutmut` pass is intentionally advisory because it is slow. For PR and
+pre-flash confidence, the repo also has a narrow deterministic harness:
+
+```bash
+INKYPI_ENV=dev INKYPI_NO_REFRESH=1 PYTHONPATH=src python scripts/mutation_check.py
+```
+
+This harness applies a small set of known high-value mutants to a temporary copy
+of the repo, runs targeted tests for each mutant, and fails if any mutant
+survives. It is the fast signal for source-path drift and regression-sensitive
+logic while the full nightly `mutmut` job remains the broad advisory signal.
+
 ## Interpreting results
 
 | Status | Meaning |

diff --git a/scripts/mutation_check.py b/scripts/mutation_check.py
@@ -26,7 +26,7 @@ class Mutant:
 MUTANTS: tuple[Mutant, ...] = (
     Mutant(
         name="cache-hit-inverted",
-        file="src/refresh_task.py",
+        file="src/refresh_task/task.py",
         old="        used_cached = image_hash == latest_refresh.image_hash\n",
         new="        used_cached = image_hash != latest_refresh.image_hash\n",
         commands=(
@@ -42,7 +42,7 @@ class Mutant:
     ),
     Mutant(
         name="retry-count-off-by-one",
-        file="src/refresh_task.py",
+        file="src/refresh_task/executor.py",
         old="        attempts = max(1, retries + 1)\n",
         new="        attempts = max(1, retries)\n",
         commands=(

diff --git a/tests/unit/test_quality_gates.py b/tests/unit/test_quality_gates.py
@@ -0,0 +1,39 @@
+"""Regression tests for CI quality gate wiring."""
+
+from pathlib import Path
+
+from scripts import mutation_check
+
+REPO_ROOT = Path(__file__).resolve().parents[2]  # tests/unit/<this file> -> repo root
+CI_YAML = REPO_ROOT / ".github" / "workflows" / "ci.yml"  # workflow read as plain text
+
+
+def test_narrow_mutation_targets_exist_and_match_snippets():
+    """The narrow mutation harness must not drift after source moves."""
+    for mutant in mutation_check.MUTANTS:
+        target = REPO_ROOT / mutant.file
+        assert target.exists(), f"{mutant.name} target is missing: {mutant.file}"
+        assert mutant.old in target.read_text(
+            encoding="utf-8"
+        ), f"{mutant.name} snippet is missing from {mutant.file}"
+
+
+def test_pr_benchmark_gate_uploads_auditable_artifacts():
+    """PR benchmark runs should leave numbers and comparison logs behind."""
+    ci_yaml = CI_YAML.read_text(encoding="utf-8")
+    assert "Upload benchmark artifacts" in ci_yaml
+    assert "/tmp/bench-current.json" in ci_yaml
+    assert "/tmp/benchmark-compare.txt" in ci_yaml
+    assert "/tmp/bench-effective-baseline.json" in ci_yaml
+
+
+def test_pr_benchmark_repo_baseline_fallback_is_blocking():
+    """A missing CI cache must not turn benchmark regressions advisory-only."""
+    ci_yaml = CI_YAML.read_text(encoding="utf-8")
+    assert "informational only" not in ci_yaml
+    assert (
+        "cp tests/benchmarks/baseline.json /tmp/bench-effective-baseline.json"
+        in ci_yaml
+    )
+    assert "--baseline /tmp/bench-effective-baseline.json" in ci_yaml
+    assert "--current /tmp/bench-current.json || true" not in ci_yaml