diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f01255f63..2b07e57a9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,21 +63,28 @@ jobs: PERF_COLD_START_SAMPLES: ${{ vars.PERF_COLD_START_SAMPLES || '3' }} PERF_COLD_START_TIMEOUT_S: ${{ vars.PERF_COLD_START_TIMEOUT_S || '90' }} run: | + set -o pipefail python -m pytest tests/benchmarks/ --benchmark-only \ --benchmark-columns=min,mean,median,stddev,ops \ --benchmark-json=/tmp/bench-current.json -q # Compare against CI-cached baseline if available, otherwise - # fall back to the repo-committed baseline. + # fall back to the repo-committed baseline. Both paths are blocking + # so cache misses cannot silently hide PR regressions. if [ -f /tmp/bench-baseline.json ]; then + echo "Using CI cached benchmark baseline." | tee /tmp/benchmark-compare.txt + cp /tmp/bench-baseline.json /tmp/bench-effective-baseline.json python scripts/benchmark_compare.py \ - --baseline /tmp/bench-baseline.json \ - --current /tmp/bench-current.json + --baseline /tmp/bench-effective-baseline.json \ + --current /tmp/bench-current.json \ + | tee -a /tmp/benchmark-compare.txt else - echo "No CI baseline cache found — recording initial baseline." - echo "Comparing against repo baseline (cross-platform, informational only):" + echo "No CI baseline cache found — using repo baseline as blocking fallback." \ + | tee /tmp/benchmark-compare.txt + cp tests/benchmarks/baseline.json /tmp/bench-effective-baseline.json python scripts/benchmark_compare.py \ - --baseline tests/benchmarks/baseline.json \ - --current /tmp/bench-current.json || true + --baseline /tmp/bench-effective-baseline.json \ + --current /tmp/bench-current.json \ + | tee -a /tmp/benchmark-compare.txt fi # JTN-738: hard performance budgets on plugin render latency and # cold-start startup time. 
@@ -86,6 +93,16 @@ jobs: if [ "${{ github.ref }}" = "refs/heads/main" ]; then cp /tmp/bench-current.json /tmp/bench-baseline.json fi + - name: Upload benchmark artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: | + /tmp/bench-current.json + /tmp/benchmark-compare.txt + /tmp/bench-effective-baseline.json + if-no-files-found: ignore shellcheck: name: Shell script validation diff --git a/docs/benchmarking.md b/docs/benchmarking.md index d635f37f6..cb380be49 100644 --- a/docs/benchmarking.md +++ b/docs/benchmarking.md @@ -61,13 +61,14 @@ benchmark's median time exceeds the baseline by more than the configured threshold, CI fails. - **CI baseline**: cached per-OS via GitHub Actions cache. On pushes to `main`, - the current run becomes the new baseline for future PRs. On the very first - run (no cache), the comparison is informational only (non-blocking). + the current run becomes the new baseline for future PRs. - **Repo baseline** (`tests/benchmarks/baseline.json`): committed for local - development use and as a fallback when no CI cache exists. + development use and as a blocking fallback when no CI cache exists. - **Threshold**: defaults to +15%. Override via the `BENCHMARK_THRESHOLD_PCT` GitHub Actions variable or environment variable. - **Comparison script**: `scripts/benchmark_compare.py` +- **Artifacts**: every PR uploads `benchmark-results` with the current + pytest-benchmark JSON, the effective baseline it was compared against, and the comparison log for audit/debugging. The gate runs as part of the `lint` job in `.github/workflows/ci.yml`. diff --git a/docs/mutation_testing.md b/docs/mutation_testing.md index 28d5c1d71..9c2f5630f 100644 --- a/docs/mutation_testing.md +++ b/docs/mutation_testing.md @@ -106,6 +106,20 @@ The job is sharded by package so each package has its own runtime budget: Results are uploaded as `mutmut-cache-` artifacts and can be downloaded from the GitHub Actions run summary. 
+## Narrow PR Mutation Gate + +The full `mutmut` pass is intentionally advisory because it is slow. For PR and +pre-flash confidence, the repo also has a narrow deterministic harness: + +```bash +INKYPI_ENV=dev INKYPI_NO_REFRESH=1 PYTHONPATH=src python scripts/mutation_check.py +``` + +This harness applies a small set of known high-value mutants to a temporary copy +of the repo, runs targeted tests for each mutant, and fails if any mutant +survives. It is the fast signal for source-path drift and regression-sensitive +logic while the full nightly `mutmut` job remains the broad advisory signal. + ## Interpreting results | Status | Meaning | diff --git a/scripts/mutation_check.py b/scripts/mutation_check.py index a4e0c734b..4a71b2343 100644 --- a/scripts/mutation_check.py +++ b/scripts/mutation_check.py @@ -26,7 +26,7 @@ class Mutant: MUTANTS: tuple[Mutant, ...] = ( Mutant( name="cache-hit-inverted", - file="src/refresh_task.py", + file="src/refresh_task/task.py", old=" used_cached = image_hash == latest_refresh.image_hash\n", new=" used_cached = image_hash != latest_refresh.image_hash\n", commands=( @@ -42,7 +42,7 @@ class Mutant: ), Mutant( name="retry-count-off-by-one", - file="src/refresh_task.py", + file="src/refresh_task/executor.py", old=" attempts = max(1, retries + 1)\n", new=" attempts = max(1, retries)\n", commands=( diff --git a/tests/unit/test_quality_gates.py b/tests/unit/test_quality_gates.py new file mode 100644 index 000000000..cfb0e4b56 --- /dev/null +++ b/tests/unit/test_quality_gates.py @@ -0,0 +1,39 @@ +"""Regression tests for CI quality gate wiring.""" + +from pathlib import Path + +from scripts import mutation_check + +REPO_ROOT = Path(__file__).resolve().parents[2] +CI_YAML = REPO_ROOT / ".github" / "workflows" / "ci.yml" + + +def test_narrow_mutation_targets_exist_and_match_snippets(): + """The narrow mutation harness must not drift after source moves.""" + for mutant in mutation_check.MUTANTS: + target = REPO_ROOT / mutant.file + assert 
target.exists(), f"{mutant.name} target is missing: {mutant.file}" + assert mutant.old in target.read_text( + encoding="utf-8" + ), f"{mutant.name} snippet is missing from {mutant.file}" + + +def test_pr_benchmark_gate_uploads_auditable_artifacts(): + """PR benchmark runs should leave numbers and comparison logs behind.""" + ci_yaml = CI_YAML.read_text(encoding="utf-8") + assert "Upload benchmark artifacts" in ci_yaml + assert "/tmp/bench-current.json" in ci_yaml + assert "/tmp/benchmark-compare.txt" in ci_yaml + assert "/tmp/bench-effective-baseline.json" in ci_yaml + + +def test_pr_benchmark_repo_baseline_fallback_is_blocking(): + """A missing CI cache must not turn benchmark regressions advisory-only.""" + ci_yaml = CI_YAML.read_text(encoding="utf-8") + assert "informational only" not in ci_yaml + assert ( + "cp tests/benchmarks/baseline.json /tmp/bench-effective-baseline.json" + in ci_yaml + ) + assert "--baseline /tmp/bench-effective-baseline.json" in ci_yaml + assert "--current /tmp/bench-current.json || true" not in ci_yaml