openclaw · pashpashpash · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/.github/workflows/sweep.yml b/.github/workflows/sweep.yml
@@ -785,6 +785,18 @@ jobs:
           permission-issues: write
           permission-pull-requests: read
 
+      - name: Create target Codex inspection token
+        id: codex-inspection-token
+        uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
+        with:
+          client-id: ${{ env.CLAWSWEEPER_APP_CLIENT_ID }}
+          private-key: ${{ secrets.CLAWSWEEPER_APP_PRIVATE_KEY }}
+          owner: ${{ needs.plan.outputs.target_repo_owner }}
+          repositories: ${{ needs.plan.outputs.target_repo_name }}
+          permission-contents: read
+          permission-issues: read
+          permission-pull-requests: read
+
       - uses: actions/setup-node@v6
         with:
           node-version: 24
@@ -838,6 +850,7 @@ jobs:
         working-directory: clawsweeper
         env:
           GH_TOKEN: ${{ steps.target-read-token.outputs.token }}
+          CLAWSWEEPER_PROOF_INSPECTION_TOKEN: ${{ steps.codex-inspection-token.outputs.token }}
           ADDITIONAL_PROMPT: ${{ github.event.inputs.additional_prompt || github.event.client_payload.additional_prompt || '' }}
         run: |
           hot_intake_arg=()

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ checkpoint, and status-only commits are intentionally omitted.
 
 ### Added
 
+- Added agent-led real behavior proof judgement so ClawSweeper can inspect linked screenshots, videos, logs, and terminal output with a read-only GitHub token, explain the proof verdict in the review comment, tell contributors how to trigger a fresh review after adding proof, and sync `proof: sufficient` when the evidence is convincing.
 - Added a real behavior proof assessment to PR reviews so missing, mock-only, or insufficient contributor proof blocks pass/automerge markers and asks for screenshots, terminal output, redacted logs, recordings, linked artifacts, or copied live output instead.
 - Added `config/automation-limits.json` plus docs and a drift check so review,
   commit-review, repair, and issue-implementation capacity defaults have one

diff --git a/prompts/review-item.md b/prompts/review-item.md
@@ -84,7 +84,7 @@ likely owner.
 
 For PRs, include a dedicated security review pass in addition to the functional review. Inspect whether the diff could introduce a security or supply-chain regression, especially when it touches CI workflows, GitHub Action refs, dependency sources, lockfiles, install/build/release scripts, package publishing metadata, secrets handling, permissions, downloaded artifacts, generated/vendor/minified files, or other code execution paths. Check whether those changes are consistent with the PR title, body, discussion, and stated purpose before deciding. Be cautious when a small or unrelated functional change also introduces new third-party code execution, broadens secret or permission access, changes package resolution, adds lifecycle hooks, downloads and executes artifacts, or mixes infrastructure changes into otherwise cosmetic work. Do not infer malicious intent without concrete evidence. Always summarize this pass in `securityReview`; set `status: "cleared"` when the diff has no concrete security or supply-chain concern, `status: "needs_attention"` when there is a concrete concern, and `status: "not_applicable"` for non-PR items without a security-sensitive report. Put concrete security concerns in `securityReview.concerns` with file/line when possible, and also include blocking concerns in `risks` and `evidence` when they affect the merge/close decision.
 
-For PRs, include a dedicated `realBehaviorProof` assessment before any pass, automerge, or repair verdict. External PRs must show that the contributor ran the changed behavior after the fix in a real setup. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only; they are not real behavior proof by themselves. Treat screenshots, recordings, terminal screenshots, console output, copied live output, linked artifacts, and redacted runtime logs as valid proof, including for non-visual CLI, console, text, or error-message changes. Use `status: "sufficient"` only when the PR body or linked artifacts include after-fix real behavior evidence and an observed improved result. Use `status: "missing"` when proof is absent, `status: "mock_only"` when proof is only tests/mocks/CI, `status: "insufficient"` when the evidence does not show the changed real behavior after the fix, `status: "override"` when the PR has `proof: override`, and `status: "not_applicable"` for non-PR items or maintainer/bot PRs where the gate does not apply. When proof is missing, mock-only, or insufficient, set `needsContributorAction: true`, make the PR a human-only merge blocker, and do not request ClawSweeper repair markers because automation cannot prove the contributor's setup for them.
+For PRs, include a dedicated `realBehaviorProof` assessment before any pass, automerge, or repair verdict. External PRs must show that the contributor ran the changed behavior after the fix in a real setup. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only; they are not real behavior proof by themselves. Treat screenshots, recordings, terminal screenshots, console output, copied live output, linked artifacts, and redacted runtime logs as valid proof, including for non-visual CLI, console, text, or error-message changes. Use your tools and best judgement: inspect the PR body, comments, links, screenshots, videos, logs, terminal output, and changed behavior context; you may download/open GitHub attachment links, generate stills or contact sheets from videos, inspect terminal screenshots and logs, and compare the proof against the PR diff. Use the provided scratch directory for downloaded artifacts and keep the target checkout read-only. Use `status: "sufficient"` only when the evidence convincingly shows after-fix real behavior and an observed improved result. Use `status: "missing"` when proof is absent, `status: "mock_only"` when proof is only tests/mocks/CI, `status: "insufficient"` when the evidence is unrelated, unviewable, too weak, or does not show the changed real behavior after the fix, `status: "override"` when the PR has `proof: override`, and `status: "not_applicable"` for non-PR items or maintainer/bot PRs where the gate does not apply. When proof is missing, mock-only, or insufficient, set `needsContributorAction: true`, make the PR a human-only merge blocker, and do not request ClawSweeper repair markers because automation cannot prove the contributor's setup for them.
 
 For PRs, also emit Codex `/review`-style findings in `reviewFindings`.
 Review the diff as another engineer's proposed patch and list every discrete,
@@ -324,9 +324,14 @@ Always fill `realBehaviorProof`. For external PRs, this is a merge gate, not a
 nice-to-have. Missing, mock-only, or insufficient proof should appear near the
 top of the public review as "needs real behavior proof before merge"; tell the
 contributor that terminal screenshots, console output, copied live output,
-linked artifacts, and redacted logs count. Use `evidenceKind: "none"` when proof
-is absent or mock-only, and set `needsContributorAction: false` only for
-`sufficient`, `override`, or `not_applicable`.
+linked artifacts, recordings, and redacted logs count. If the proof links to
+public or GitHub-hosted media, inspect it when possible before deciding. Also
+tell contributors that after they add proof, updating the PR body should trigger
+a fresh ClawSweeper review automatically; if it does not, they can ask a
+maintainer to comment `@clawsweeper re-review`. Use `evidenceKind: "none"` when
+proof is absent or mock-only, and set
+`needsContributorAction: false` only for `sufficient`, `override`, or
+`not_applicable`.
 
 Always fill the work-lane fields too. For non-candidates, use
 `workCandidate: "none"`, low confidence/priority, an empty `workPrompt`, and

diff --git a/src/clawsweeper.ts b/src/clawsweeper.ts
@@ -361,6 +361,10 @@ interface ReviewPromptBuild {
   telemetry: ReviewPromptTelemetry;
 }
 
+interface ReviewPromptRuntimeHints {
+  proofScratchDir?: string;
+}
+
 interface DashboardItem {
   repo: string;
   number: number;
@@ -638,12 +642,13 @@ const RECENT_MISSING_OPEN_MS = DAY_MS;
 const DEFAULT_CODEX_MODEL = "gpt-5.5";
 const DEFAULT_REASONING_EFFORT = "high";
 const DEFAULT_SERVICE_TIER = "";
-const REVIEW_POLICY_VERSION = "2026-05-05-policy-v13";
+const REVIEW_POLICY_VERSION = "2026-05-05-policy-v14";
 const REVIEW_COMMENT_MARKER_PREFIX = "<!-- clawsweeper-review";
 const REVIEW_START_STATUS_MARKER_PREFIX = "<!-- clawsweeper-review-status";
 const AUTOMERGE_LABEL = "clawsweeper:automerge";
 const AUTOFIX_LABEL = "clawsweeper:autofix";
 const PROOF_OVERRIDE_LABEL = "proof: override";
+const PROOF_SUFFICIENT_LABEL = "proof: sufficient";
 const PROTECTED_LABELS = new Set(["security", "beta-blocker", "release-blocker", "maintainer"]);
 const ALLOWED_REASONS = new Set<CloseReason>([
   "implemented_on_main",
@@ -3260,10 +3265,12 @@ function buildReviewPrompt(
   context: ItemContext,
   git: GitInfo,
   additionalPrompt = "",
+  runtimeHints: ReviewPromptRuntimeHints = {},
 ): ReviewPromptBuild {
   const prompt = reviewPromptTemplate();
   const contextJson = contextJsonForPrompt(context);
   const schema = reviewDecisionSchemaText();
+  const proofScratchDir = runtimeHints.proofScratchDir?.trim();
   const extra = additionalPrompt.trim()
     ? `
 
@@ -3289,6 +3296,12 @@ ${additionalPrompt.trim()}
 - Current main SHA: ${git.mainSha}
 - Latest release: ${git.latestRelease?.tagName ?? "unknown"} (${git.latestRelease?.sha ?? "unknown sha"})
 
+## Runtime Capabilities
+
+- You may use the available network and read-only GitHub token to inspect PR body links, comments, screenshots, videos, logs, terminal output, and target-repo artifacts.
+- Download proof artifacts into ${proofScratchDir ? `\`${proofScratchDir}\`` : "a temporary scratch directory"} before inspecting them.
+- The target checkout is read-only for review. Do not modify repository files; use the scratch directory or /tmp for downloaded evidence and generated video stills/contact sheets.
+
 ## GitHub Context
 
 \`\`\`json
@@ -3308,10 +3321,6 @@ ${extra}
   };
 }
 
-function promptFor(item: Item, context: ItemContext, git: GitInfo, additionalPrompt = ""): string {
-  return buildReviewPrompt(item, context, git, additionalPrompt).text;
-}
-
 function reviewPromptTelemetry(
   item: Item,
   context: ItemContext,
@@ -3434,14 +3443,20 @@ function runCodex(options: {
   timeoutMs: number;
   workDir: string;
   additionalPrompt?: string;
+  proofScratchDir?: string;
   prompt?: string;
 }): Decision {
   ensureDir(options.workDir);
+  const proofScratchDir =
+    options.proofScratchDir ?? join(options.workDir, "proof-scratch", String(options.item.number));
+  ensureDir(proofScratchDir);
   const promptPath = join(options.workDir, `${options.item.number}.prompt.md`);
   const outputPath = join(options.workDir, `${options.item.number}.json`);
   const prompt =
     options.prompt ??
-    promptFor(options.item, options.context, options.git, options.additionalPrompt);
+    buildReviewPrompt(options.item, options.context, options.git, options.additionalPrompt, {
+      proofScratchDir,
+    }).text;
   writeFileSync(promptPath, prompt, "utf8");
   const dirtyBefore = openclawDirtyStatus(options.openclawDir);
   if (dirtyBefore) {
@@ -3470,12 +3485,17 @@ function runCodex(options: {
       outputPath,
       "--sandbox",
       options.sandboxMode,
+      "--add-dir",
+      proofScratchDir,
       "-",
     ],
     {
       cwd: options.openclawDir,
       encoding: "utf8",
-      env: codexEnv(),
+      env: {
+        ...codexEnv({ ghToken: process.env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN }),
+        CLAWSWEEPER_PROOF_SCRATCH_DIR: proofScratchDir,
+      },
       input: prompt,
       maxBuffer: 128 * 1024 * 1024,
       timeout: options.timeoutMs,
@@ -4146,6 +4166,18 @@ function publicSecurityReviewLine(review: SecurityReview): string {
   return `${prefix}: ${sentence(review.summary)}`;
 }
 
+function realBehaviorProofReReviewGuidance(): string {
+  return "After adding proof, update the PR body; ClawSweeper should re-review automatically. If it does not, ask a maintainer to comment `@clawsweeper re-review`.";
+}
+
+/** Returns the blocker text (summary or fallback), adding re-review guidance when it is absent. */
+function realBehaviorProofBlockerSummary(summary: string, fallback: string): string {
+  const body = sentence(summary) || fallback;
+  // No leading `\b` on the `@` alternative: `\b@` only matches right after a word character.
+  const guidanceRe = /@clawsweeper re-review\b|\b(?:re-review automatically|update the PR body)\b/i;
+  return guidanceRe.test(body) ? body : `${body} ${realBehaviorProofReReviewGuidance()}`;
+}
+
 function publicRealBehaviorProofLine(proof: RealBehaviorProof): string {
   const summary = sentence(proof.summary);
   switch (proof.status) {
@@ -4154,20 +4186,20 @@ function publicRealBehaviorProofLine(proof: RealBehaviorProof): string {
     case "override":
       return `Override: ${summary || "A maintainer applied proof: override."}`;
     case "missing":
-      return `Needs real behavior proof before merge: ${
-        summary ||
-        "the PR must include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count."
-      }`;
+      return `Needs real behavior proof before merge: ${realBehaviorProofBlockerSummary(
+        summary,
+        "The PR must include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count.",
+      )}`;
     case "mock_only":
-      return `Needs real behavior proof before merge: ${
-        summary ||
-        "tests, mocks, snapshots, lint, typechecks, and CI are supplemental only. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count."
-      }`;
+      return `Needs real behavior proof before merge: ${realBehaviorProofBlockerSummary(
+        summary,
+        "Tests, mocks, snapshots, lint, typechecks, and CI are supplemental only. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count.",
+      )}`;
     case "insufficient":
-      return `Needs stronger real behavior proof before merge: ${
-        summary ||
-        "include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count."
-      }`;
+      return `Needs stronger real behavior proof before merge: ${realBehaviorProofBlockerSummary(
+        summary,
+        "Include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count.",
+      )}`;
     case "not_applicable":
       return summary ? `Not applicable: ${summary}` : "";
   }
@@ -4448,6 +4480,46 @@ function reportRealBehaviorProof(markdown: string): RealBehaviorProof {
   };
 }
 
+function nextRealBehaviorProofSufficientLabels(
+  labels: readonly string[],
+  proof: Pick<RealBehaviorProof, "status">,
+): string[] {
+  const nextLabels = labels.filter((label) => label !== PROOF_SUFFICIENT_LABEL);
+  if (proof.status === "sufficient") nextLabels.push(PROOF_SUFFICIENT_LABEL);
+  return nextLabels;
+}
+
+export function realBehaviorProofSufficientLabelsForTest(
+  labels: readonly string[],
+  status: string,
+): string[] {
+  const proofStatus = REAL_BEHAVIOR_PROOF_STATUSES.has(status as RealBehaviorProofStatus)
+    ? (status as RealBehaviorProofStatus)
+    : "not_applicable";
+  return nextRealBehaviorProofSufficientLabels(labels, { status: proofStatus });
+}
+
+function syncRealBehaviorProofSufficientLabel(options: {
+  number: number;
+  labels: readonly string[];
+  proof: Pick<RealBehaviorProof, "status">;
+  dryRun: boolean;
+}): string[] {
+  const nextLabels = nextRealBehaviorProofSufficientLabels(options.labels, options.proof);
+  const hadLabel = options.labels.includes(PROOF_SUFFICIENT_LABEL);
+  const wantsLabel = nextLabels.includes(PROOF_SUFFICIENT_LABEL);
+  if (hadLabel === wantsLabel) return nextLabels;
+  if (options.dryRun) return nextLabels;
+  ghWithRetry([
+    "issue",
+    "edit",
+    String(options.number),
+    wantsLabel ? "--add-label" : "--remove-label",
+    PROOF_SUFFICIENT_LABEL,
+  ]);
+  return nextLabels;
+}
+
 function isAutomationReportAuthor(author: string | undefined): boolean {
   return Boolean(author && (/\[bot\]$/i.test(author) || author.startsWith("app/")));
 }
@@ -5969,7 +6041,9 @@ function reviewCommand(args: Args): void {
     const contextStartedAt = Date.now();
     const context = collectItemContext(item);
     const contextElapsedMs = Date.now() - contextStartedAt;
-    const prompt = buildReviewPrompt(item, context, git, additionalPrompt);
+    const codexWorkDir = join(artifactDir, "codex");
+    const proofScratchDir = join(codexWorkDir, "proof-scratch", String(item.number));
+    const prompt = buildReviewPrompt(item, context, git, additionalPrompt, { proofScratchDir });
     const snapshotHash = itemSnapshotHash(item, context);
     try {
       const startComment = postReviewStartStatusComment({
@@ -6003,8 +6077,9 @@ function reviewCommand(args: Args): void {
         sandboxMode,
         serviceTier,
         timeoutMs,
-        workDir: join(artifactDir, "codex"),
+        workDir: codexWorkDir,
         additionalPrompt,
+        proofScratchDir,
         prompt: prompt.text,
       });
     } catch (error) {
@@ -6197,6 +6272,14 @@ function applyDecisionsCommand(args: Args): void {
       if (processedCount >= processedLimit) break;
       continue;
     }
+    if (state === "open" && item.kind === "pull_request") {
+      item.labels = syncRealBehaviorProofSufficientLabel({
+        number,
+        labels: item.labels,
+        proof: reportRealBehaviorProof(markdown),
+        dryRun,
+      });
+    }
     markdown = replaceFrontMatterValue(markdown, "labels", JSON.stringify(item.labels));
     const reviewComment = renderReviewCommentFromReport(markdown, closeReason ?? "none");
     const existingReviewComment = issueReviewComment(number, [

diff --git a/src/codex-env.ts b/src/codex-env.ts
@@ -8,6 +8,7 @@ export function codexEnv(options: CodexEnvOptions = {}): NodeJS.ProcessEnv {
   delete env.GH_TOKEN;
   delete env.GITHUB_TOKEN;
   delete env.COMMIT_SWEEPER_TARGET_GH_TOKEN;
+  delete env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN;
   delete env.CLAWSWEEPER_APP_ID;
   delete env.CLAWSWEEPER_APP_PRIVATE_KEY;
   delete env.OPENAI_API_KEY;