diff --git a/.github/workflows/sweep.yml b/.github/workflows/sweep.yml index 8568174b31..e2b699646f 100644 --- a/.github/workflows/sweep.yml +++ b/.github/workflows/sweep.yml @@ -785,6 +785,18 @@ jobs: permission-issues: write permission-pull-requests: read + - name: Create target Codex inspection token + id: codex-inspection-token + uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1 + with: + client-id: ${{ env.CLAWSWEEPER_APP_CLIENT_ID }} + private-key: ${{ secrets.CLAWSWEEPER_APP_PRIVATE_KEY }} + owner: ${{ needs.plan.outputs.target_repo_owner }} + repositories: ${{ needs.plan.outputs.target_repo_name }} + permission-contents: read + permission-issues: read + permission-pull-requests: read + - uses: actions/setup-node@v6 with: node-version: 24 @@ -838,6 +850,7 @@ jobs: working-directory: clawsweeper env: GH_TOKEN: ${{ steps.target-read-token.outputs.token }} + CLAWSWEEPER_PROOF_INSPECTION_TOKEN: ${{ steps.codex-inspection-token.outputs.token }} ADDITIONAL_PROMPT: ${{ github.event.inputs.additional_prompt || github.event.client_payload.additional_prompt || '' }} run: | hot_intake_arg=() diff --git a/CHANGELOG.md b/CHANGELOG.md index 669e932779..19a2762e10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ checkpoint, and status-only commits are intentionally omitted. ### Added +- Added agent-led real behavior proof judgement so ClawSweeper can inspect linked screenshots, videos, logs, and terminal output with a read-only GitHub token, explain the proof verdict in the review comment, tell contributors how to trigger a fresh review after adding proof, and sync `proof: sufficient` when the evidence is convincing. - Added a real behavior proof assessment to PR reviews so missing, mock-only, or insufficient contributor proof blocks pass/automerge markers and asks for screenshots, terminal output, redacted logs, recordings, linked artifacts, or copied live output instead. - Added `config/automation-limits.json` plus docs and a drift check so review, commit-review, repair, and issue-implementation capacity defaults have one diff --git a/prompts/review-item.md b/prompts/review-item.md index dd0cfad8ae..36913d945f 100644 --- a/prompts/review-item.md +++ b/prompts/review-item.md @@ -84,7 +84,7 @@ likely owner. For PRs, include a dedicated security review pass in addition to the functional review. Inspect whether the diff could introduce a security or supply-chain regression, especially when it touches CI workflows, GitHub Action refs, dependency sources, lockfiles, install/build/release scripts, package publishing metadata, secrets handling, permissions, downloaded artifacts, generated/vendor/minified files, or other code execution paths. Check whether those changes are consistent with the PR title, body, discussion, and stated purpose before deciding. Be cautious when a small or unrelated functional change also introduces new third-party code execution, broadens secret or permission access, changes package resolution, adds lifecycle hooks, downloads and executes artifacts, or mixes infrastructure changes into otherwise cosmetic work. Do not infer malicious intent without concrete evidence. Always summarize this pass in `securityReview`; set `status: "cleared"` when the diff has no concrete security or supply-chain concern, `status: "needs_attention"` when there is a concrete concern, and `status: "not_applicable"` for non-PR items without a security-sensitive report. Put concrete security concerns in `securityReview.concerns` with file/line when possible, and also include blocking concerns in `risks` and `evidence` when they affect the merge/close decision. -For PRs, include a dedicated `realBehaviorProof` assessment before any pass, automerge, or repair verdict. External PRs must show that the contributor ran the changed behavior after the fix in a real setup. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only; they are not real behavior proof by themselves. Treat screenshots, recordings, terminal screenshots, console output, copied live output, linked artifacts, and redacted runtime logs as valid proof, including for non-visual CLI, console, text, or error-message changes. Use `status: "sufficient"` only when the PR body or linked artifacts include after-fix real behavior evidence and an observed improved result. Use `status: "missing"` when proof is absent, `status: "mock_only"` when proof is only tests/mocks/CI, `status: "insufficient"` when the evidence does not show the changed real behavior after the fix, `status: "override"` when the PR has `proof: override`, and `status: "not_applicable"` for non-PR items or maintainer/bot PRs where the gate does not apply. When proof is missing, mock-only, or insufficient, set `needsContributorAction: true`, make the PR a human-only merge blocker, and do not request ClawSweeper repair markers because automation cannot prove the contributor's setup for them. +For PRs, include a dedicated `realBehaviorProof` assessment before any pass, automerge, or repair verdict. External PRs must show that the contributor ran the changed behavior after the fix in a real setup. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only; they are not real behavior proof by themselves. Treat screenshots, recordings, terminal screenshots, console output, copied live output, linked artifacts, and redacted runtime logs as valid proof, including for non-visual CLI, console, text, or error-message changes. Use your tools and best judgement: inspect the PR body, comments, links, screenshots, videos, logs, terminal output, and changed behavior context; you may download/open GitHub attachment links, generate stills or contact sheets from videos, inspect terminal screenshots and logs, and compare the proof against the PR diff. Use the provided scratch directory for downloaded artifacts and keep the target checkout read-only. Use `status: "sufficient"` only when the evidence convincingly shows after-fix real behavior and an observed improved result. Use `status: "missing"` when proof is absent, `status: "mock_only"` when proof is only tests/mocks/CI, `status: "insufficient"` when the evidence is unrelated, unviewable, too weak, or does not show the changed real behavior after the fix, `status: "override"` when the PR has `proof: override`, and `status: "not_applicable"` for non-PR items or maintainer/bot PRs where the gate does not apply. When proof is missing, mock-only, or insufficient, set `needsContributorAction: true`, make the PR a human-only merge blocker, and do not request ClawSweeper repair markers because automation cannot prove the contributor's setup for them. For PRs, also emit Codex `/review`-style findings in `reviewFindings`. Review the diff as another engineer's proposed patch and list every discrete, @@ -324,9 +324,14 @@ Always fill `realBehaviorProof`. For external PRs, this is a merge gate, not a nice-to-have. Missing, mock-only, or insufficient proof should appear near the top of the public review as "needs real behavior proof before merge"; tell the contributor that terminal screenshots, console output, copied live output, -linked artifacts, and redacted logs count. Use `evidenceKind: "none"` when proof -is absent or mock-only, and set `needsContributorAction: false` only for -`sufficient`, `override`, or `not_applicable`. +linked artifacts, recordings, and redacted logs count. If the proof links to +public or GitHub-hosted media, inspect it when possible before deciding. Also +tell contributors that after they add proof, updating the PR body should trigger +a fresh ClawSweeper review automatically; if it does not, they can ask a +maintainer to comment `@clawsweeper re-review`. Use `evidenceKind: "none"` when +proof is absent or mock-only, and set +`needsContributorAction: false` only for `sufficient`, `override`, or +`not_applicable`. Always fill the work-lane fields too. For non-candidates, use `workCandidate: "none"`, low confidence/priority, an empty `workPrompt`, and diff --git a/src/clawsweeper.ts b/src/clawsweeper.ts index 2e63385a0b..fc1f856122 100644 --- a/src/clawsweeper.ts +++ b/src/clawsweeper.ts @@ -361,6 +361,10 @@ interface ReviewPromptBuild { telemetry: ReviewPromptTelemetry; } +interface ReviewPromptRuntimeHints { + proofScratchDir?: string; +} + interface DashboardItem { repo: string; number: number; @@ -638,12 +642,13 @@ const RECENT_MISSING_OPEN_MS = DAY_MS; const DEFAULT_CODEX_MODEL = "gpt-5.5"; const DEFAULT_REASONING_EFFORT = "high"; const DEFAULT_SERVICE_TIER = ""; -const REVIEW_POLICY_VERSION = "2026-05-05-policy-v13"; +const REVIEW_POLICY_VERSION = "2026-05-05-policy-v14"; const REVIEW_COMMENT_MARKER_PREFIX = "