From c887cae298a90a4b9d60a79567d1a55f37cc5c54 Mon Sep 17 00:00:00 2001 From: David Cramer Date: Thu, 7 May 2026 11:03:25 -0700 Subject: [PATCH 1/2] feat(skills): Add built-in code review skill Add code-review as a package-native built-in skill for correctness-focused bug review. Document it alongside security-review and update CLI init/help guidance so users can add both baseline reviews by name. Co-Authored-By: GPT-5 Codex --- README.md | 3 +- packages/docs/public/llms.txt | 16 ++- packages/docs/src/components/HeroFlow.astro | 1 + packages/docs/src/pages/cli.astro | 4 +- packages/docs/src/pages/config.astro | 12 +- packages/docs/src/pages/guide.astro | 14 ++- packages/docs/src/pages/index.astro | 16 +-- src/builtin-skills/code-review/SKILL.md | 85 ++++++++++++++ src/builtin-skills/code-review/SOURCES.md | 42 +++++++ src/builtin-skills/code-review/SPEC.md | 110 ++++++++++++++++++ .../references/github-workflows.md | 88 ++++++++++++++ .../references/javascript-typescript.md | 73 ++++++++++++ .../code-review/references/python.md | 77 ++++++++++++ src/cli/commands/init.ts | 13 ++- src/cli/help.ts | 3 +- src/package.test.ts | 4 + src/skills/loader.test.ts | 14 ++- 17 files changed, 549 insertions(+), 26 deletions(-) create mode 100644 src/builtin-skills/code-review/SKILL.md create mode 100644 src/builtin-skills/code-review/SOURCES.md create mode 100644 src/builtin-skills/code-review/SPEC.md create mode 100644 src/builtin-skills/code-review/references/github-workflows.md create mode 100644 src/builtin-skills/code-review/references/javascript-typescript.md create mode 100644 src/builtin-skills/code-review/references/python.md diff --git a/README.md b/README.md index 18c38826..e5a82107 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,9 @@ Your code is under new management. Agents that review your code - locally or on # Initialize warden in your repository npx @sentry/warden init -# Add the built-in baseline security check +# Add the built-in baseline reviews npx @sentry/warden add security-review +npx @sentry/warden add code-review # Run a pre-review on current branch changes # Uses Claude Code subscription if logged in, or set WARDEN_ANTHROPIC_API_KEY diff --git a/packages/docs/public/llms.txt b/packages/docs/public/llms.txt index b276c1d6..edd6723d 100644 --- a/packages/docs/public/llms.txt +++ b/packages/docs/public/llms.txt @@ -4,7 +4,7 @@ Warden watches over your code by running **skills** against your changes. Skills are prompts that define what to look for: security vulnerabilities, API design issues, performance problems, or anything else you want consistent coverage on. -Skills follow the [agentskills.io](https://agentskills.io) specification. They're markdown files with a prompt that tells the AI what to look for. Warden includes a baseline `security-review` skill by default. Treat it as a first pass, not a complete security audit, and add community or custom skills when you need deeper coverage. +Skills follow the [agentskills.io](https://agentskills.io) specification. They're markdown files with a prompt that tells the AI what to look for. Warden includes `security-review` for baseline AppSec coverage and `code-review` for correctness bugs. Treat them as first passes, and add more skills when you need deeper coverage. - Docs: https://warden.sentry.dev - GitHub: https://github.com/getsentry/warden @@ -51,6 +51,7 @@ Creates `warden.toml` (configuration) and `.github/workflows/warden.yml` (GitHub ```bash warden add security-review +warden add code-review ``` ### Run Locally @@ -116,6 +117,7 @@ Add a skill trigger to your configuration. ```bash warden add # Interactive mode warden add security-review # Add baseline security review +warden add code-review # Add correctness bug review warden add --list # List available skills warden add --remote your-org/warden-skills --skill api-review warden add --remote your-org/warden-skills@abc123 --skill api-review # Pinned to commit @@ -176,6 +178,13 @@ version = 1 [[skills]] name = "security-review" +[[skills.triggers]] +type = "pull_request" +actions = ["opened", "synchronize"] + +[[skills]] +name = "code-review" + [[skills.triggers]] type = "pull_request" actions = ["opened", "synchronize"] @@ -331,6 +340,9 @@ Skills can be referenced in multiple ways: [[skills]] name = "security-review" +[[skills]] +name = "code-review" + # By relative path [[skills]] name = "./custom-skills/my-review" @@ -351,7 +363,7 @@ Resolution order: 1. Remote repository (if `remote` field is specified) 2. Direct path (if skill contains `/`, `\`, or starts with `.`) 3. Conventional directories: `.warden/skills/`, `.agents/skills/`, `.claude/skills/` -4. Built-in skills, including `security-review` +4. Built-in skills, including `security-review` and `code-review` ### Environment Variables diff --git a/packages/docs/src/components/HeroFlow.astro b/packages/docs/src/components/HeroFlow.astro index 3e457087..f2f75948 100644 --- a/packages/docs/src/components/HeroFlow.astro +++ b/packages/docs/src/components/HeroFlow.astro @@ -1,6 +1,7 @@ --- const skills = [ "security-review", + "code-review", "api-design-review", "architecture-review", "dependency-review", diff --git a/packages/docs/src/pages/cli.astro b/packages/docs/src/pages/cli.astro index 51ba91ca..1936a3ba 100644 --- a/packages/docs/src/pages/cli.astro +++ b/packages/docs/src/pages/cli.astro @@ -74,6 +74,7 @@ warden init --force # Overwrite existing files`} Remote repository (if remote is specified in trigger config)
  • Direct path (if contains /, \, or starts with .)
  • Conventional directories: .warden/skills/, .agents/skills/, .claude/skills/
  • -
  • Built-in skills, including security-review
  • +
  • Built-in skills, including security-review and code-review
  • Environment Variables

    diff --git a/packages/docs/src/pages/config.astro b/packages/docs/src/pages/config.astro index f4a08c96..181555ae 100644 --- a/packages/docs/src/pages/config.astro +++ b/packages/docs/src/pages/config.astro @@ -36,6 +36,13 @@ const tocItems = [ [[skills]] name = "security-review" +[[skills.triggers]] +type = "pull_request" +actions = ["opened", "synchronize"] + +[[skills]] +name = "code-review" + [[skills.triggers]] type = "pull_request" actions = ["opened", "synchronize"]`} @@ -322,6 +329,9 @@ createFixPR = true`} [[skills]] name = "security-review" +[[skills]] +name = "code-review" + # By relative path [[skills]] name = "./custom-skills/my-review" @@ -346,7 +356,7 @@ remote = "your-org/warden-skills@abc123def"`}
  • Remote repository (if remote field is specified)
  • Direct path (if skill contains /, \, or starts with .)
  • Conventional directories: .warden/skills/, .agents/skills/, .claude/skills/
  • -
  • Built-in skills, including security-review
  • +
  • Built-in skills, including security-review and code-review
  • Skill Files

    diff --git a/packages/docs/src/pages/guide.astro b/packages/docs/src/pages/guide.astro index a9ddb68a..05e89e5c 100644 --- a/packages/docs/src/pages/guide.astro +++ b/packages/docs/src/pages/guide.astro @@ -40,7 +40,7 @@ const tocItems = [
  • Reports findings with severity, location, and optional fixes
  • -

    Skills follow the agentskills.io specification -they're markdown files with a prompt that tells the AI what to look for. Warden includes a baseline security-review skill by default. Treat it as a first pass, not a complete security audit, and add community or custom skills when you need deeper coverage.

    +

    Skills follow the agentskills.io specification -they're markdown files with a prompt that tells the AI what to look for. Warden includes security-review for baseline AppSec coverage and code-review for correctness bugs. Treat them as first passes, and add more skills when you need deeper coverage.

    Warden works in two contexts:

      @@ -135,7 +135,8 @@ export WARDEN_ANTHROPIC_API_KEY=sk-ant-...`} @@ -242,15 +243,16 @@ Focus on issues in the changed code. For each issue found, report:

      Adding Skills

      -

      Use built-in skills by name. Add local or remote skills when your codebase needs more specialized checks.

      +

      Use built-in skills by name. Add more skills when your codebase needs specialized checks.

      -

      Add the Baseline Security Review

      +

      Add Built-in Reviews

      -

      security-review ships with Warden as a baseline first pass, so no local skill file or remote repository is required:

      +

      security-review and code-review ship with Warden as baseline first passes. Add them by name:

      diff --git a/packages/docs/src/pages/index.astro b/packages/docs/src/pages/index.astro index 3a516343..4f171ab4 100644 --- a/packages/docs/src/pages/index.astro +++ b/packages/docs/src/pages/index.astro @@ -95,7 +95,7 @@ vulnerabilities in code changes for Warden's baseline security skill.

      Its Just Skills

      -

      The PR feedback above comes from skills. Warden ships with a baseline security-review skill. It is a first pass, not a complete security audit, and it is still just a SKILL.md file telling Warden what to look for.

      +

      The PR feedback above comes from skills. Warden ships with security-review for baseline AppSec coverage and code-review for correctness bugs. They are first passes, and they are still just SKILL.md files telling Warden what to look for.

      -

      Use it by name. No local skill file, build step, schema, or SDK required.

      +

      Use built-ins by name. No local skill file, build step, schema, or SDK required.

      Real skills can include detailed reference material, code examples, style guides, architectural constraints, or anything else you'd put in a design doc. The prompt is the skill.

      @@ -128,7 +128,8 @@ vulnerabilities in code changes for Warden's baseline security skill. Created .github/workflows/warden.yml Next steps: - 1. Add a skill: warden add security-review + 1. Add built-in reviews: warden add security-review + warden add code-review 2. export WARDEN_ANTHROPIC_API_KEY=sk-ant-... 3. Add WARDEN_ANTHROPIC_API_KEY to repository secrets https://github.com/your-org/your-repo/settings/secrets/actions @@ -138,11 +139,12 @@ vulnerabilities in code changes for Warden's baseline security skill.

      Load Skills

      -

      Start with the baseline security check. Add custom or remote skills when your codebase needs deeper coverage.

      - -
      $ warden add security-review
      +

      Start with baseline security and correctness reviews. Add more skills when your codebase needs deeper coverage.

      + +
      $ warden add security-review
      +$ warden add code-review
      -

      Create your own skills or find ones driven by the community at skills.sh.

      +

      Keep adding skills for areas where your codebase needs specific coverage.

      diff --git a/src/builtin-skills/code-review/SKILL.md b/src/builtin-skills/code-review/SKILL.md new file mode 100644 index 00000000..6d27844b --- /dev/null +++ b/src/builtin-skills/code-review/SKILL.md @@ -0,0 +1,85 @@ +--- +name: code-review +description: Finds real correctness bugs in code changes. Use for adversarial code review, bug hunts, regression review, PR correctness review, logic errors, data loss, race conditions, state bugs, interface contract breaks, error handling bugs, edge cases, broken builds, or broken workflows. Excludes style, readability, architecture, AppSec, and best-practice-only feedback unless the issue causes a demonstrable bug. +allowed-tools: Read Grep Glob +--- + +You are an extremely adversarial production code reviewer finding only real bugs in code changes. +Try to break the changed behavior from every reachable angle, but report nothing unless the failure is concrete, reproducible from the code, and would cause incorrect behavior. + +## References + +Load only matching references: + +| Reference | Read When | +|-----------|-----------| +| `references/javascript-typescript.md` | Reviewing JavaScript, TypeScript, Node, React, Next.js, or browser code | +| `references/python.md` | Reviewing Python, Django, Flask, FastAPI, Celery, or Python service code | +| `references/github-workflows.md` | Reviewing GitHub Actions workflows, local actions, reusable workflows, or scripts and config loaded by workflows | + +## Bugs Only Rule + +Report a finding only when you can prove all of these: + +- The changed code is reachable in production, a user entry point, a published interface, a shipped workflow, or a test that can mask a real regression. +- A specific input, state, ordering, configuration, dependency result, or retry path triggers the failure. +- The surrounding code, tests, schema, docs, or public contract shows what should happen. +- The changed behavior violates that contract and produces a concrete symptom. +- The impact is observable: wrong result, crash, data loss, corrupted state, missed side effect, duplicate side effect, broken build, failed deploy, or false success. + +No proof, no finding. Suspicion is not a result. + +## Investigation Process + +1. Read the changed hunk and enough surrounding code to understand the intended behavior. +2. Identify the contract: caller expectations, public types, schemas, validation, docs, tests, persistence shape, API response shape, workflow trigger, or CLI behavior. +3. Construct adversarial cases: null or undefined, empty collections, zero, false, empty string, duplicates, missing keys, boundary counts, timezone boundaries, stale state, retries, partial failures, concurrent calls, and reordered events. +4. Trace data and state across imports, wrappers, validators, serializers, database writes, caches, queues, and dependent call sites. +5. Compare old and new behavior when the diff changes a condition, default, type, schema, query, ordering, side effect, or error path. +6. Check whether tests, types, schemas, framework guarantees, or caller guards already exclude the failure. +7. Report only defects that survive this verification. + +## What To Report + +| Category | Report When | +|----------|-------------| +| Logic and conditions | Branches are inverted, unreachable, too broad, too narrow, or collapse distinct cases such as `0`, `false`, `""`, `null`, and missing values. | +| Data contracts | Runtime values no longer match schemas, public types, API responses, persistence shapes, serialized payloads, or caller assumptions. | +| State and mutation | Shared objects, caches, global state, refs, arrays, maps, ORM models, or config are mutated in a way that leaks across callers or corrupts later work. | +| Async and ordering | Promises, tasks, callbacks, queues, retries, cancellation, transactions, or cleanup run in the wrong order, are not awaited, or race in a reachable path. | +| Error handling | Real failures are swallowed, converted to success, retried unsafely, or leave partial state that callers treat as complete. | +| Boundaries and edge cases | Empty, first, last, duplicate, pagination, sorting, timezone, locale, precision, overflow, migration, or compatibility cases produce wrong behavior. | +| Persistence and migrations | Writes are non-atomic, migrations lose data, backfills skip rows, query filters update the wrong records, or rollback paths leave inconsistent state. | +| API and dependency behavior | Published interfaces, CLI flags, config options, webhooks, service calls, or third-party dependency changes break documented or existing caller behavior. | +| UI correctness | The UI displays stale, wrong, duplicate, missing, or unsaved data because of the changed code, not because of style or preference. | +| Build, test, and workflow breakage | Changed code, packaging, imports, exports, generated artifacts, CI, or release workflows fail deterministically or report false success. | + +## Severity + +| Level | Use For | +|-------|---------| +| high | Data loss or corruption, critical-path crashes, broken production deploy or release, incorrect billing or permissions state, published interface breakage for normal callers, deadlock or hang in core flow, or false success after a failed destructive operation. | +| medium | Reproducible wrong results, recoverable crashes, duplicate or missed side effects, broken non-critical workflow, meaningful edge case in a shipped path, or compatibility break with a clear affected caller. | +| low | Narrow but real bug with limited blast radius, confusing state that can cause user-visible mistakes, or a test/tooling bug that masks the intended behavior. | + +- Use the lower severity when impact depends on unproven preconditions. +- Do not inflate severity for cleverness. The bug earns its level through impact. + +## What Not To Report + +- AppSec findings. Use the dedicated AppSec skill for exploitability issues. +- Style, naming, formatting, comments, readability, or maintainability concerns. +- Architecture, design layering, type hygiene, or refactor advice without a proven incorrect behavior. +- Performance concerns unless the changed code causes a reachable timeout, hang, memory blowup, quota exhaustion, or missed deadline. +- Missing tests, weak tests, or low coverage unless the changed test now asserts the wrong behavior or hides a real regression. +- Existing bugs untouched by the change unless the change makes them reachable or materially worse. +- Generated, vendored, fixture, example, migration-only, or test-only code unless it is shipped, executed, or masks a shipped bug. +- Framework, language, or dependency behavior that already guarantees the suspected case is safe. +- Hypothetical failures that require unrealistic inputs, impossible call order, or assumptions not supported by the code. + +## Finding Format + +- Title: name the exact bug and trigger. +- Description: include the changed behavior, trigger conditions, expected behavior, actual behavior, and concrete impact. +- `verification`: list checked files, functions, callers, guards, tests, schemas, or framework guarantees. +- `suggestedFix`: include only when the fix is complete for the analyzed path. diff --git a/src/builtin-skills/code-review/SOURCES.md b/src/builtin-skills/code-review/SOURCES.md new file mode 100644 index 00000000..81dddbc3 --- /dev/null +++ b/src/builtin-skills/code-review/SOURCES.md @@ -0,0 +1,42 @@ +# Code Review Skill Sources + +## Source Inventory + +| Source | Trust tier | Confidence | Usage constraints | Decisions | +|--------|------------|------------|-------------------|-----------| +| Existing built-in `security-review` skill | Local prior art | High | Use structure and evidence gating only; do not import security scope. | Mirror source-boundary-sink style as trigger-contract-impact for correctness bugs. | +| Local `architecture-review`, `code-simplifier`, and `find-warden-bugs` skills | Local prior art | Medium | Use boundaries to avoid overlap with style, architecture, and Warden-specific sweeps. | Make exclusions explicit so code review reports only bugs. | +| Common production correctness review failure modes | Engineering practice | Medium | Keep examples generic, transformed, and evidence-gated. | Cover logic, state, async, error handling, contracts, persistence, UI, build, and workflow bugs. | +| Warden package skill loader and packaging tests | Repository behavior | High | Use for validation scope only. | Add tests that the new built-in skill resolves and is included in the npm package. | + +## Coverage Matrix + +| Dimension | Coverage Status | Notes | +|-----------|-----------------|-------| +| Bug class definitions and prerequisites | Complete | `SKILL.md` defines reportable categories and requires a concrete trigger, violated contract, and impact. | +| Reachability and reproducibility evidence | Complete | Findings must prove the changed path is reachable and the failure is triggered by a specific input, state, ordering, or configuration. | +| False-positive controls | Complete | Exclusions block style, architecture, maintainability, security, performance-only, test-coverage-only, and pattern-only findings. | +| Severity and confidence calibration | Complete | Severity is tied to user-visible or operational impact, with lower severity for unproven preconditions. | +| Remediation expectations | Complete | Findings may include `suggestedFix` only when the fix is complete for the analyzed path. | +| Language and workflow caveats | Complete for initial scope | References cover JS/TS, Python, and GitHub Actions workflow correctness. Other languages use the core contract until repeated examples justify references. | + +## Source-Backed Decisions + +1. Security and correctness review should stay separate. + - Reason: Warden already has a dedicated `security-review` skill with exploitability criteria. + - Decision: `code-review` excludes auth, injection, XSS, SSRF, secrets, unsafe crypto, and workflow privilege issues. +2. Adversarial review still needs an evidence gate. + - Reason: aggressive prompts easily produce style or speculative findings. + - Decision: every finding must include trigger, expected behavior, actual behavior, and impact. +3. References should be conditional and language-specific. + - Reason: the built-in security skill keeps `SKILL.md` concise by routing only relevant details. + - Decision: create JS/TS, Python, and GitHub workflow references rather than a large catch-all reference. +4. Build and workflow breakage are correctness bugs. + - Reason: Warden reviews code changes that can break published packages, CI, and releases. + - Decision: include deterministic build, packaging, test, and workflow failures when the changed code proves the failure. + +## Open Gaps + +- Add eval fixtures after real review results identify the highest-value correctness bug classes for this skill. +- Add more language references only when repeated findings need language-specific calibration. +- Collect sanitized positive and negative review examples before creating `references/evidence/`. diff --git a/src/builtin-skills/code-review/SPEC.md b/src/builtin-skills/code-review/SPEC.md new file mode 100644 index 00000000..2d9e850e --- /dev/null +++ b/src/builtin-skills/code-review/SPEC.md @@ -0,0 +1,110 @@ +# Code Review Skill Specification + +## Intent + +The `code-review` skill is Warden's broad default correctness reviewer. It gives teams a highly adversarial bug-finding pass for code changes without turning into style, architecture, security, or best-practice commentary. + +It should catch real regressions and production defects while preferring no finding over speculative review feedback. + +## Scope + +In scope: + +- Logic, state, async, data contract, persistence, edge-case, API, UI, build, test, and workflow correctness bugs. +- Changed production code, user entry points, public APIs, shipped workflows, and tests that can mask shipped regressions. +- General bug-finding guidance that applies across application code. +- Focused notes for JavaScript/TypeScript, Python, and GitHub Actions workflow correctness. + +Out of scope: + +- Security vulnerabilities covered by `security-review`. +- Style, readability, formatting, naming, architecture, maintainability, or generic best-practice advice. +- Performance issues without a demonstrated timeout, hang, resource exhaustion, or deadline miss. +- Broad test coverage requests unless changed tests now assert the wrong behavior or hide a real regression. +- Benchmark-specific prompt compatibility. + +## Users And Trigger Context + +- Primary users: coding agents and Warden runs reviewing pull requests or local changes. +- Common user requests: "code review", "review this PR", "find bugs", "bug hunt", "correctness review", "look for regressions", "adversarial review", "logic bug", "race condition", or "edge-case bug". +- Should not trigger for: security-only review, architecture review, code simplification, style review, documentation review, prompt-writing help, or Warden CLI usage. + +## Runtime Contract + +- Required first actions: + - Identify whether changed files are production code, public API, shipped workflow, test/tooling code, generated/vendor/example code, or non-runtime docs. + - Read the changed file and any callers, guards, schemas, tests, serializers, migrations, workflow-loaded scripts, or downstream consumers needed to prove the bug. + - Load only the matching reference when it materially improves the review. +- Required outputs: + - Warden findings only for high-confidence correctness bugs. + - Empty findings when a concrete failure is not proven. +- Non-negotiable constraints: + - Do not report style, architecture, maintainability, security, or best-practice-only feedback. + - Do not lower the bar to create coverage. + - Do not report pattern-only suspicions. +- Expected bundled files loaded at runtime: + - `references/javascript-typescript.md` for JS/TS/Node/React/Next/browser correctness. + - `references/python.md` for Python/Django/Flask/FastAPI/Celery correctness. + - `references/github-workflows.md` for GitHub Actions workflow correctness. + +## Source And Evidence Model + +Authoritative sources: + +- The existing `security-review` built-in skill structure and evidence-gated finding model. +- Local Warden review skills that distinguish bugs from architecture, style, and simplification work. +- Common production code review failure modes observed across typed application code, service code, and CI workflows. + +Useful improvement sources: + +- positive examples: reviewed findings that identify a real trigger, violated contract, and concrete impact. +- negative examples: noisy findings that were style, speculation, security-only, or best-practice-only comments. +- commit logs/changelogs: fixes to Warden false positives, missing bug classes, or trigger precision. +- issue or PR feedback: maintainer comments showing what should or should not be reported. +- eval results: true-positive and safe-counterexample cases for correctness defects. + +Data that must not be stored: + +- secrets +- customer data +- private URLs or identifiers that are not needed for reproduction + +## Reference Architecture + +- `SKILL.md` contains the runtime contract, adversarial review loop, category table, severity rubric, exclusions, and reference routing. +- `SOURCES.md` contains synthesis notes, coverage status, and maintenance gaps. +- `references/` contains language and workflow-specific correctness notes with false-positive controls. +- `references/evidence/` is unused until concrete positive and negative review examples are collected. +- `scripts/` and `assets/` are unused. + +## Evaluation + +- Lightweight validation: + - Run the skill validator against `src/builtin-skills/code-review`. + - Verify every reference is directly routed from `SKILL.md`. + - Run built-in skill loader and package-content tests. +- Deeper evaluation: + - Add eval cases for falsey defaults, dropped async work, schema mismatch, migration data loss, stale UI state, workflow false success, and safe counterexamples. + - Compare false positives against sanitized real Warden runs. +- Holdout examples: + - A style-only diff with no behavior change should produce no finding. + - A security-only diff should be routed to `security-review`, not reported by `code-review`. + - A type-safe or schema-validated path should not be reported as a runtime bug. +- Acceptance gates: + - Findings require trigger, expected behavior, actual behavior, and impact. + - `SKILL.md` stays concise enough to scan. + - Language-specific examples stay in references. + - Security and style findings stay out of scope. + +## Known Limitations + +- The skill cannot prove bugs that require running complex integration environments unless the code path itself is sufficient evidence. +- Language coverage is intentionally strongest for JS/TS, Python, and GitHub Actions workflows. +- It may miss domain-specific business-rule bugs when the changed code does not expose the business contract. + +## Maintenance Notes + +- When to update `SKILL.md`: the core finding contract, exclusions, severity rubric, or reference routing changes. +- When to update `SOURCES.md`: new source categories, coverage gaps, or synthesis decisions materially change the skill. +- When to update `EVAL.md`: repeatable eval prompts are added. +- When to update `references/evidence/`: positive or negative examples recur and should shape future revisions. diff --git a/src/builtin-skills/code-review/references/github-workflows.md b/src/builtin-skills/code-review/references/github-workflows.md new file mode 100644 index 00000000..49c955df --- /dev/null +++ b/src/builtin-skills/code-review/references/github-workflows.md @@ -0,0 +1,88 @@ +# GitHub Workflow Bug Review Notes + +Use this when reviewing GitHub Actions workflows, local actions, reusable workflows, or scripts/config loaded by workflows. These notes refine the core `code-review` skill; security workflow findings belong to `security-review`. + +## Review Map + +Start with the effective execution graph: + +1. Identify the trigger: `pull_request`, `push`, `merge_group`, `workflow_dispatch`, `workflow_call`, `workflow_run`, `release`, `schedule`, or comment/label triggers. +2. Identify changed workflow files, local actions, reusable workflows, and repo-local scripts loaded by the workflow. +3. Follow job dependencies, matrix expansion, `if:` conditions, defaults, env, outputs, artifact paths, cache keys, and checked-out refs. +4. Compare intended behavior from names, docs, branch filters, required checks, release process, and adjacent workflows. +5. Report only deterministic broken CI, false success, skipped required work, wrong artifact, wrong release/deploy target, or broken local action behavior. + +## Reportable Patterns + +| Pattern | Report When | Safer Shape | +|---------|-------------|-------------| +| Skipped required checks | Branch, path, event, job `if:`, or matrix conditions exclude code paths that the workflow name or required-check setup says must run. | Align filters and required checks with the shipped paths. | +| False success | A script failure is hidden by `|| true`, missing `set -e` in multi-command shell, ignored exit codes, or background processes that can fail after the step succeeds. | Propagate exit codes and wait for background work. | +| Broken outputs | Step IDs, output names, `$GITHUB_OUTPUT` keys, job outputs, or reusable workflow outputs no longer match consumers. | Keep producer and consumer names aligned. | +| Wrong checkout or ref | Build, release, or deploy job checks out the wrong branch, tag, merge ref, or SHA for the event it claims to process. | Pin the intended event SHA or release ref explicitly. | +| Artifact mismatch | Upload path, download name, retention, working directory, or build output path changed so downstream jobs publish or test stale/missing artifacts. | Use explicit artifact names and verify generated paths. | +| Matrix drift | Matrix includes impossible combinations, drops a supported runtime, or references variables not defined in the expanded job. | Keep matrix dimensions and include/exclude entries consistent. | +| Cache staleness bug | Cache key omits lockfiles, runtime version, OS, architecture, or package manager version and can restore incompatible dependencies in normal CI. | Include dependency and runtime inputs in the key. | +| Reusable workflow contract break | Caller inputs, required secrets, output names, permissions expectations, or defaults no longer match the reusable workflow declaration. | Update callers and callee contracts together. | +| Local action breakage | `action.yml`, composite steps, shell scripts, or checked-in action code no longer match declared inputs, outputs, or runtime. | Keep action metadata and implementation synchronized. | + +## False-Positive Controls + +- Security issues such as privileged PR execution, expression injection, secrets exposure, broad permissions, mutable action refs, or OIDC trust belong to `security-review`, not this skill. +- Path filters are not bugs when another required workflow covers the skipped files. +- `continue-on-error` is not a bug when the result is intentionally checked later and failures still fail or mark the intended conclusion. +- Broad cache restore keys are not findings unless they can restore incompatible state for a normal workflow path. +- Matrix omissions are not findings unless docs, package metadata, or adjacent workflows show the runtime is still supported. +- A workflow step that only comments, labels, or uploads optional diagnostics is not a correctness bug unless its failure changes required behavior. + +## Minimal Examples + +**Report: output name drift** + +```yaml +jobs: + build: + outputs: + image: ${{ steps.meta.outputs.image }} + steps: + - id: metadata + run: echo "image=ghcr.io/acme/app:${GITHUB_SHA}" >> "$GITHUB_OUTPUT" + deploy: + needs: build + steps: + - run: deploy "${{ needs.build.outputs.image }}" +``` + +The job output reads `steps.meta`, but the producer step is now `metadata`, so deploy receives an empty image. + +**Report: false success** + +```yaml +- run: | + pnpm build & + pnpm test +``` + +The step exits with the test command while the background build can fail after success is reported. + +**Report: artifact path drift** + +```yaml +- run: pnpm build +- uses: actions/upload-artifact@v4 + with: + path: build/ +``` + +If the package now writes `dist/`, downstream release jobs download a missing or stale artifact. + +**Do not report: covered path filter** + +```yaml +on: + pull_request: + paths: + - "docs/**" +``` + +This is not a bug if a separate required workflow covers source changes. diff --git a/src/builtin-skills/code-review/references/javascript-typescript.md b/src/builtin-skills/code-review/references/javascript-typescript.md new file mode 100644 index 00000000..2fbd75e3 --- /dev/null +++ b/src/builtin-skills/code-review/references/javascript-typescript.md @@ -0,0 +1,73 @@ +# JavaScript And TypeScript Bug Review Notes + +Use this when reviewing JavaScript, TypeScript, Node, React, Next.js, or browser code. These notes refine the core `code-review` skill; they do not add style, architecture, security, or performance-only scope. + +## Runtime Boundaries + +- TypeScript types disappear at runtime. Treat JSON, form data, URL params, cookies, local storage, external API responses, database rows, env vars, and message payloads as untrusted shape until parsed or validated. +- `as`, `!`, `any`, unchecked indexed access, and broad generics are leads, not findings. Report only when a reachable runtime value can violate the assumption. +- Generated types, Zod schemas, tRPC routers, OpenAPI clients, GraphQL fragments, ORM models, and serializer tests can prove the intended contract. Read them before reporting a mismatch. +- React and Next.js can split server and client execution. Verify where the code actually runs before claiming a browser, server, hydration, or serialization bug. + +## High-Signal Patterns + +| Pattern | Bug Shape | Safer Shape | +|---------|-----------|-------------| +| Falsey fallback | `value || defaultValue` treats `0`, `false`, or `""` as absent when those are valid values. | Use `??` or explicit presence checks. | +| Dropped async work | `forEach(async ...)`, `items.map(async ...)` without `await Promise.all`, missing `return` in promise chains, or fire-and-forget work inside request/CLI paths. | Await the work, return the promise, or intentionally detach with error handling. | +| Swallowed async errors | `void fn()`, unhandled promise callbacks, or catch blocks convert failed writes to success responses. | Await and propagate errors, or surface partial failure explicitly. | +| State mutation | In-place `sort`, `reverse`, `splice`, object mutation, cache mutation, or prop mutation changes data later reused by callers. | Clone before mutation or keep mutation local to newly created values. | +| Stale React state | Closures, effects, memoization, or callbacks use stale props/state and produce wrong UI or wrong submitted data. | Use correct dependencies, functional updates, refs for mutable external state, or derive state at render time. | +| Schema drift | Runtime schema, inferred type, serialized payload, or API response changed without matching callers. | Update schema and every consumer, or keep backward-compatible fields. | +| Pagination and ordering | Filtering after slicing, unstable sort keys, cursor fields that are not unique, or changed default order skips or duplicates records. | Filter before paging, add deterministic tie-breakers, and preserve cursor contracts. | +| Date and precision | Date-only strings, local timezone parsing, DST boundaries, milliseconds vs seconds, integer rounding, or currency precision changes produce wrong values. | Normalize units and timezones at boundaries and keep decimal math explicit. | +| Import/export breakage | A value import points at a type-only export, a default import targets named exports, or an ESM/CJS boundary no longer matches runtime output. | Use `export type` for types and value exports for runtime symbols, matching the package format. | +| Cleanup and cancellation | Abort handlers, timers, subscriptions, streams, temp files, or locks are not cleaned up on error or unmount. | Use finally blocks, cleanup functions, abort propagation, and scoped resource ownership. | + +## False-Positive Controls + +- `Promise.all`, `Promise.allSettled`, returned promise chains, and framework-managed async handlers can prove async work is awaited. +- `value ?? defaultValue` preserves `0`, `false`, and `""`; do not report falsey collapse there. +- In-place mutation is safe when the array or object was created locally and is not reused by callers. +- Optional chaining is not a bug when downstream code intentionally handles absence. +- Type assertions are safe when the value comes from a checked schema, trusted factory, or exhaustive discriminated union. +- React hook dependency warnings are not findings by themselves. Show the stale value and user-visible wrong behavior. +- TypeScript compile errors are findings only when the changed code deterministically breaks the build or emitted runtime behavior. + +## Minimal Examples + +**Report: falsey value regression** + +```ts +const limit = Number(searchParams.get("limit")) || 50; +``` + +If `limit=0` is a documented way to disable fetching, this turns a valid value into `50`. + +**Report: dropped async writes** + +```ts +users.forEach(async (user) => { + await sendInvite(user.id); +}); +return { sent: users.length }; +``` + +The function reports success before invites finish, and failures are detached from the response. + +**Report: schema drift** + +```ts +const UserResponse = z.object({ id: z.string(), name: z.string() }); +return { id: user.id, displayName: user.name }; +``` + +The returned payload no longer satisfies the runtime schema or callers expecting `name`. + +**Do not report: awaited async map** + +```ts +await Promise.all(users.map((user) => sendInvite(user.id))); +``` + +The promises are joined and errors propagate through the awaited call. diff --git a/src/builtin-skills/code-review/references/python.md b/src/builtin-skills/code-review/references/python.md new file mode 100644 index 00000000..64b6e802 --- /dev/null +++ b/src/builtin-skills/code-review/references/python.md @@ -0,0 +1,77 @@ +# Python Bug Review Notes + +Use this when reviewing Python, Django, Flask, FastAPI, Celery, or Python service code. These notes refine the core `code-review` skill; they do not add style, architecture, security, or performance-only scope. + +## Runtime Boundaries + +- Type hints are not runtime validation. Treat request data, query params, env vars, external API responses, database rows, task payloads, and deserialized files as shape-unknown until validated. +- Pydantic models, DRF serializers, dataclasses, typed dicts, Django model fields, migrations, and existing tests can prove the intended contract. Read them before reporting a mismatch. +- Decorators and framework hooks can change call order, transaction scope, auth context, and exception behavior. Verify the effective path. +- Background tasks and management commands often run outside request transactions and sessions. Check idempotency, tenant/account context, and retry behavior before reporting. + +## High-Signal Patterns + +| Pattern | Bug Shape | Safer Shape | +|---------|-----------|-------------| +| Mutable defaults | Function, dataclass, or model defaults reuse lists, dicts, sets, or objects across calls or instances. | Use `None` plus initialization, `default_factory`, or framework-specific callable defaults. | +| Falsey fallback | `value or default` treats `0`, `False`, or `""` as absent when those are valid values. | Check `is None`, missing keys, or explicit sentinel values. | +| Missing `None` handling | `.first()`, `.get()`, optional config, env values, cache reads, or external responses are dereferenced without proving presence. | Add explicit absence handling or enforce presence at the boundary. | +| Swallowed errors | Broad `except` returns success, empty data, or partial defaults that callers treat as complete. | Propagate failure, return explicit partial state, or compensate rolled-back work. | +| Transaction gaps | Multiple writes, task enqueues, cache updates, or file operations can partially succeed when a later step fails. | Use transactions, `on_commit`, idempotency keys, or compensation. | +| Async mismatch | Coroutine is not awaited, blocking I/O runs in an async endpoint, or async context managers are entered incorrectly. | Await coroutines, use async clients, and keep blocking work out of event-loop paths. | +| Iterator exhaustion | Generators, queryset iterators, request streams, or file objects are consumed once and then reused as if still populated. | Materialize intentionally or pass a fresh iterator/stream. | +| Timezone and precision | Naive and aware datetimes are mixed, date boundaries use server local time, or decimal money is converted to float. | Normalize timezone and use `Decimal` or integer minor units for money. | +| Query and migration drift | Renamed fields, changed defaults, non-null constraints, data migrations, or backfills miss existing rows or write wrong records. | Include backward-compatible migrations and scoped update filters. | +| Task retry side effects | Celery or queue retries duplicate emails, charges, state transitions, or external calls. | Make side effects idempotent or persist completion before retryable boundaries. | + +## False-Positive Controls + +- Django ORM, SQLAlchemy, and Pydantic can enforce contracts. Verify the exact model, serializer, or schema before reporting. +- `get_or_create`, `update_or_create`, database constraints, and transactions can mitigate duplicate or partial-write paths. +- A broad `except` is not a finding if the caller receives explicit failure state and no partial success is claimed. +- Mutable values are safe when created inside the function or supplied by a documented immutable factory. +- QuerySet laziness is not a bug by itself. Show the changed evaluation order that produces wrong data. +- Type-checker-only issues are findings only when they deterministically break runtime behavior, packaging, or CI. + +## Minimal Examples + +**Report: mutable default** + +```python +def collect_errors(error, bucket=[]): + bucket.append(error) + return bucket +``` + +Every call shares the same list, so unrelated requests can see stale errors. + +**Report: swallowed partial failure** + +```python +try: + charge_customer(invoice) + mark_paid(invoice) +except Exception: + return {"ok": True} +``` + +The caller receives success even if charging or persistence failed. + +**Report: missing absence handling** + +```python +profile = Profile.objects.filter(user_id=user_id).first() +return profile.timezone +``` + +If profiles are optional or not yet created, this crashes instead of following the expected fallback. + +**Do not report: dataclass default factory** + +```python +@dataclass +class Batch: + items: list[str] = field(default_factory=list) +``` + +Each instance receives a fresh list. diff --git a/src/cli/commands/init.ts b/src/cli/commands/init.ts index ffe86022..9f7e6844 100644 --- a/src/cli/commands/init.ts +++ b/src/cli/commands/init.ts @@ -69,9 +69,11 @@ function generateWardenToml(): string { # https://github.com/getsentry/warden # # Warden reviews code using AI-powered skills triggered by GitHub events. -# Built-in skills are available by name. Custom skills live in .agents/skills/ or .claude/skills/ +# Built-in skills are available by name. Add more skills as needed. # -# Add skills with: warden add security-review +# Add built-in reviews with: +# warden add security-review +# warden add code-review version = 1 @@ -84,7 +86,9 @@ failOn = "high" reportOn = "medium" # Skills define what to analyze and when to run -# Add skills with: warden add security-review +# Add built-in reviews with: +# warden add security-review +# warden add code-review # # Example skill with path filters and triggers: # @@ -402,7 +406,8 @@ export async function runInit(options: CLIOptions, reporter: Reporter): Promise< // Print next steps reporter.bold('Next steps:'); - reporter.text(` 1. Add a skill: ${chalk.cyan('warden add security-review')}`); + reporter.text(` 1. Add built-in reviews: ${chalk.cyan('warden add security-review')}`); + reporter.text(` ${chalk.cyan('warden add code-review')}`); reporter.text(` 2. Set ${chalk.cyan('WARDEN_ANTHROPIC_API_KEY')} in .env.local`); reporter.text(` 3. Add ${chalk.cyan('WARDEN_ANTHROPIC_API_KEY')} to organization or repository secrets`); diff --git a/src/cli/help.ts b/src/cli/help.ts index c95ad2be..0248ff6b 100644 --- a/src/cli/help.ts +++ b/src/cli/help.ts @@ -268,7 +268,7 @@ const HELP_COMMANDS: Record = { }, add: { summary: 'Add a skill trigger to warden.toml', - description: 'Add a local or remote skill trigger to the repository configuration.', + description: 'Add a skill trigger to the repository configuration.', usage: ['warden add [skill] [options]'], arguments: [ { label: 'skill', description: 'Skill name to add' }, @@ -277,6 +277,7 @@ const HELP_COMMANDS: Record = { examples: [ 'warden add', 'warden add security-review', + 'warden add code-review', 'warden add --remote your-org/warden-skills --skill api-review', ], }, diff --git a/src/package.test.ts b/src/package.test.ts index 526356a8..c41c9248 100644 --- a/src/package.test.ts +++ b/src/package.test.ts @@ -17,6 +17,10 @@ describe('npm package contents', () => { expect(ignored.ignores('src/builtin-skills/security-review/SKILL.md')).toBe(false); expect(ignored.ignores('src/builtin-skills/security-review/references/javascript-typescript.md')).toBe(false); expect(ignored.ignores('src/builtin-skills/security-review/references/github-workflows.md')).toBe(false); + expect(ignored.ignores('src/builtin-skills/code-review/SKILL.md')).toBe(false); + expect(ignored.ignores('src/builtin-skills/code-review/SOURCES.md')).toBe(false); + expect(ignored.ignores('src/builtin-skills/code-review/references/javascript-typescript.md')).toBe(false); + expect(ignored.ignores('src/builtin-skills/code-review/references/python.md')).toBe(false); expect(ignored.ignores('skills/warden/SPEC.md')).toBe(false); expect(ignored.ignores('.warden/skills/security/SKILL.md')).toBe(true); expect(ignored.ignores('.codex/config.toml')).toBe(true); diff --git a/src/skills/loader.test.ts b/src/skills/loader.test.ts index 6128c951..c07f8600 100644 --- a/src/skills/loader.test.ts +++ b/src/skills/loader.test.ts @@ -40,9 +40,13 @@ describe('resolveSkillAsync', () => { }); it('resolves package-native built-in skills by name', async () => { - const skill = await resolveSkillAsync('security-review'); - expect(skill.name).toBe('security-review'); - expect(skill.rootDir).toContain('src/builtin-skills/security-review'); + const securitySkill = await resolveSkillAsync('security-review'); + expect(securitySkill.name).toBe('security-review'); + expect(securitySkill.rootDir).toContain('src/builtin-skills/security-review'); + + const codeSkill = await resolveSkillAsync('code-review'); + expect(codeSkill.name).toBe('code-review'); + expect(codeSkill.rootDir).toContain('src/builtin-skills/code-review'); }); it('lets repo-local skills override built-in skills', async () => { @@ -168,10 +172,14 @@ describe('BUILTIN_SKILL_DIRECTORIES', () => { it('discovers built-in skills without a repo root', async () => { const skills = await discoverAllSkills(); const skill = skills.get('security-review'); + const codeSkill = skills.get('code-review'); expect(skill).toBeDefined(); expect(skill!.directory).toBe('built-in'); expect(skill!.path).toContain('src/builtin-skills/security-review'); + expect(codeSkill).toBeDefined(); + expect(codeSkill!.directory).toBe('built-in'); + expect(codeSkill!.path).toContain('src/builtin-skills/code-review'); }); }); From 17abcf4c702cf1972bd6637fef978a441a220399 Mon Sep 17 00:00:00 2001 From: David Cramer Date: Thu, 7 May 2026 14:08:21 -0700 Subject: [PATCH 2/2] fix(docs): Address code review PR feedback Give the hero flow a layout slot for every displayed skill and cover the code-review workflow reference in the package contents test. Co-Authored-By: GPT-5 Codex --- packages/docs/src/components/HeroFlow.astro | 1 + src/package.test.ts | 1 + 2 files changed, 2 insertions(+) diff --git a/packages/docs/src/components/HeroFlow.astro b/packages/docs/src/components/HeroFlow.astro index f2f75948..3e74c1e3 100644 --- a/packages/docs/src/components/HeroFlow.astro +++ b/packages/docs/src/components/HeroFlow.astro @@ -19,6 +19,7 @@ const desktopSkillLayout = [ { side: "left", inset: "1.45rem" }, { side: "left", inset: "2.25rem" }, { side: "right", inset: "0.45rem" }, + { side: "left", inset: "0.95rem" }, ]; const connectorCount = skills.length + 2; --- diff --git a/src/package.test.ts b/src/package.test.ts index c41c9248..44bfd652 100644 --- a/src/package.test.ts +++ b/src/package.test.ts @@ -20,6 +20,7 @@ describe('npm package contents', () => { expect(ignored.ignores('src/builtin-skills/code-review/SKILL.md')).toBe(false); expect(ignored.ignores('src/builtin-skills/code-review/SOURCES.md')).toBe(false); expect(ignored.ignores('src/builtin-skills/code-review/references/javascript-typescript.md')).toBe(false); + expect(ignored.ignores('src/builtin-skills/code-review/references/github-workflows.md')).toBe(false); expect(ignored.ignores('src/builtin-skills/code-review/references/python.md')).toBe(false); expect(ignored.ignores('skills/warden/SPEC.md')).toBe(false); expect(ignored.ignores('.warden/skills/security/SKILL.md')).toBe(true);