diff --git a/.github/workflows/sweep.yml b/.github/workflows/sweep.yml
index 8568174b31..e2b699646f 100644
--- a/.github/workflows/sweep.yml
+++ b/.github/workflows/sweep.yml
@@ -785,6 +785,18 @@ jobs:
           permission-issues: write
           permission-pull-requests: read
 
+      - name: Create target Codex inspection token
+        id: codex-inspection-token
+        uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
+        with:
+          client-id: ${{ env.CLAWSWEEPER_APP_CLIENT_ID }}
+          private-key: ${{ secrets.CLAWSWEEPER_APP_PRIVATE_KEY }}
+          owner: ${{ needs.plan.outputs.target_repo_owner }}
+          repositories: ${{ needs.plan.outputs.target_repo_name }}
+          permission-contents: read
+          permission-issues: read
+          permission-pull-requests: read
+
       - uses: actions/setup-node@v6
         with:
           node-version: 24
@@ -838,6 +850,7 @@ jobs:
         working-directory: clawsweeper
         env:
           GH_TOKEN: ${{ steps.target-read-token.outputs.token }}
+          CLAWSWEEPER_PROOF_INSPECTION_TOKEN: ${{ steps.codex-inspection-token.outputs.token }}
           ADDITIONAL_PROMPT: ${{ github.event.inputs.additional_prompt || github.event.client_payload.additional_prompt || '' }}
         run: |
           hot_intake_arg=()
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 669e932779..19a2762e10 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ checkpoint, and status-only commits are intentionally omitted.
 
 ### Added
 
+- Added agent-led real behavior proof judgement so ClawSweeper can inspect linked screenshots, videos, logs, and terminal output with a read-only GitHub token, explain the proof verdict in the review comment, tell contributors how to trigger a fresh review after adding proof, and sync `proof: sufficient` when the evidence is convincing.
 - Added a real behavior proof assessment to PR reviews so missing, mock-only, or insufficient contributor proof blocks pass/automerge markers and asks for screenshots, terminal output, redacted logs, recordings, linked artifacts, or copied live output instead.
 - Added `config/automation-limits.json` plus docs and a drift check so review,
   commit-review, repair, and issue-implementation capacity defaults have one
diff --git a/prompts/review-item.md b/prompts/review-item.md
index dd0cfad8ae..36913d945f 100644
--- a/prompts/review-item.md
+++ b/prompts/review-item.md
@@ -84,7 +84,7 @@ likely owner.
 
 For PRs, include a dedicated security review pass in addition to the functional review. Inspect whether the diff could introduce a security or supply-chain regression, especially when it touches CI workflows, GitHub Action refs, dependency sources, lockfiles, install/build/release scripts, package publishing metadata, secrets handling, permissions, downloaded artifacts, generated/vendor/minified files, or other code execution paths. Check whether those changes are consistent with the PR title, body, discussion, and stated purpose before deciding. Be cautious when a small or unrelated functional change also introduces new third-party code execution, broadens secret or permission access, changes package resolution, adds lifecycle hooks, downloads and executes artifacts, or mixes infrastructure changes into otherwise cosmetic work. Do not infer malicious intent without concrete evidence. Always summarize this pass in `securityReview`; set `status: "cleared"` when the diff has no concrete security or supply-chain concern, `status: "needs_attention"` when there is a concrete concern, and `status: "not_applicable"` for non-PR items without a security-sensitive report. Put concrete security concerns in `securityReview.concerns` with file/line when possible, and also include blocking concerns in `risks` and `evidence` when they affect the merge/close decision.
 
-For PRs, include a dedicated `realBehaviorProof` assessment before any pass, automerge, or repair verdict. External PRs must show that the contributor ran the changed behavior after the fix in a real setup. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only; they are not real behavior proof by themselves. Treat screenshots, recordings, terminal screenshots, console output, copied live output, linked artifacts, and redacted runtime logs as valid proof, including for non-visual CLI, console, text, or error-message changes. Use `status: "sufficient"` only when the PR body or linked artifacts include after-fix real behavior evidence and an observed improved result. Use `status: "missing"` when proof is absent, `status: "mock_only"` when proof is only tests/mocks/CI, `status: "insufficient"` when the evidence does not show the changed real behavior after the fix, `status: "override"` when the PR has `proof: override`, and `status: "not_applicable"` for non-PR items or maintainer/bot PRs where the gate does not apply. When proof is missing, mock-only, or insufficient, set `needsContributorAction: true`, make the PR a human-only merge blocker, and do not request ClawSweeper repair markers because automation cannot prove the contributor's setup for them.
+For PRs, include a dedicated `realBehaviorProof` assessment before any pass, automerge, or repair verdict. External PRs must show that the contributor ran the changed behavior after the fix in a real setup. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only; they are not real behavior proof by themselves. Treat screenshots, recordings, terminal screenshots, console output, copied live output, linked artifacts, and redacted runtime logs as valid proof, including for non-visual CLI, console, text, or error-message changes. Use your tools and best judgement: inspect the PR body, comments, links, screenshots, videos, logs, terminal output, and changed behavior context; you may download/open GitHub attachment links, generate stills or contact sheets from videos, inspect terminal screenshots and logs, and compare the proof against the PR diff. Use the provided scratch directory for downloaded artifacts and keep the target checkout read-only. Use `status: "sufficient"` only when the evidence convincingly shows after-fix real behavior and an observed improved result. Use `status: "missing"` when proof is absent, `status: "mock_only"` when proof is only tests/mocks/CI, `status: "insufficient"` when the evidence is unrelated, unviewable, too weak, or does not show the changed real behavior after the fix, `status: "override"` when the PR has `proof: override`, and `status: "not_applicable"` for non-PR items or maintainer/bot PRs where the gate does not apply. When proof is missing, mock-only, or insufficient, set `needsContributorAction: true`, make the PR a human-only merge blocker, and do not request ClawSweeper repair markers because automation cannot prove the contributor's setup for them.
 
 For PRs, also emit Codex `/review`-style findings in `reviewFindings`.
 Review the diff as another engineer's proposed patch and list every discrete,
@@ -324,9 +324,14 @@ Always fill `realBehaviorProof`. For external PRs, this is a merge gate, not a
 nice-to-have. Missing, mock-only, or insufficient proof should appear near the
 top of the public review as "needs real behavior proof before merge"; tell the
 contributor that terminal screenshots, console output, copied live output,
-linked artifacts, and redacted logs count. Use `evidenceKind: "none"` when proof
-is absent or mock-only, and set `needsContributorAction: false` only for
-`sufficient`, `override`, or `not_applicable`.
+linked artifacts, recordings, and redacted logs count. If the proof links to
+public or GitHub-hosted media, inspect it when possible before deciding. Also
+tell contributors that after they add proof, updating the PR body should trigger
+a fresh ClawSweeper review automatically; if it does not, they can ask a
+maintainer to comment `@clawsweeper re-review`. Use `evidenceKind: "none"` when
+proof is absent or mock-only, and set
+`needsContributorAction: false` only for `sufficient`, `override`, or
+`not_applicable`.
 
 Always fill the work-lane fields too. For non-candidates, use
 `workCandidate: "none"`, low confidence/priority, an empty `workPrompt`, and
diff --git a/src/clawsweeper.ts b/src/clawsweeper.ts
index 2e63385a0b..fc1f856122 100644
--- a/src/clawsweeper.ts
+++ b/src/clawsweeper.ts
@@ -361,6 +361,10 @@ interface ReviewPromptBuild {
   telemetry: ReviewPromptTelemetry;
 }
 
+interface ReviewPromptRuntimeHints {
+  proofScratchDir?: string; // Directory the review prompt names for downloaded proof artifacts.
+}
+
 interface DashboardItem {
   repo: string;
   number: number;
@@ -638,12 +642,13 @@ const RECENT_MISSING_OPEN_MS = DAY_MS;
 const DEFAULT_CODEX_MODEL = "gpt-5.5";
 const DEFAULT_REASONING_EFFORT = "high";
 const DEFAULT_SERVICE_TIER = "";
-const REVIEW_POLICY_VERSION = "2026-05-05-policy-v13";
+const REVIEW_POLICY_VERSION = "2026-05-05-policy-v14";
 const REVIEW_COMMENT_MARKER_PREFIX = "<!-- clawsweeper-review";
 const REVIEW_START_STATUS_MARKER_PREFIX = "<!-- clawsweeper-review-status";
 const AUTOMERGE_LABEL = "clawsweeper:automerge";
 const AUTOFIX_LABEL = "clawsweeper:autofix";
 const PROOF_OVERRIDE_LABEL = "proof: override";
+const PROOF_SUFFICIENT_LABEL = "proof: sufficient";
 const PROTECTED_LABELS = new Set(["security", "beta-blocker", "release-blocker", "maintainer"]);
 const ALLOWED_REASONS = new Set<CloseReason>([
   "implemented_on_main",
@@ -3260,10 +3265,12 @@ function buildReviewPrompt(
   context: ItemContext,
   git: GitInfo,
   additionalPrompt = "",
+  runtimeHints: ReviewPromptRuntimeHints = {},
 ): ReviewPromptBuild {
   const prompt = reviewPromptTemplate();
   const contextJson = contextJsonForPrompt(context);
   const schema = reviewDecisionSchemaText();
+  const proofScratchDir = runtimeHints.proofScratchDir?.trim();
   const extra = additionalPrompt.trim()
     ? `
 
@@ -3289,6 +3296,12 @@ ${additionalPrompt.trim()}
 - Current main SHA: ${git.mainSha}
 - Latest release: ${git.latestRelease?.tagName ?? "unknown"} (${git.latestRelease?.sha ?? "unknown sha"})
 
+## Runtime Capabilities
+
+- You may use the available network and read-only GitHub token to inspect PR body links, comments, screenshots, videos, logs, terminal output, and target-repo artifacts.
+- Download proof artifacts into ${proofScratchDir ? `\`${proofScratchDir}\`` : "a temporary scratch directory"} before inspecting them.
+- The target checkout is read-only for review. Do not modify repository files; use the scratch directory or /tmp for downloaded evidence and generated video stills/contact sheets.
+
 ## GitHub Context
 
 \`\`\`json
@@ -3308,10 +3321,6 @@ ${extra}
   };
 }
 
-function promptFor(item: Item, context: ItemContext, git: GitInfo, additionalPrompt = ""): string {
-  return buildReviewPrompt(item, context, git, additionalPrompt).text;
-}
-
 function reviewPromptTelemetry(
   item: Item,
   context: ItemContext,
@@ -3434,14 +3443,20 @@ function runCodex(options: {
   timeoutMs: number;
   workDir: string;
   additionalPrompt?: string;
+  proofScratchDir?: string;
   prompt?: string;
 }): Decision {
   ensureDir(options.workDir);
+  const proofScratchDir =
+    options.proofScratchDir ?? join(options.workDir, "proof-scratch", String(options.item.number));
+  ensureDir(proofScratchDir);
   const promptPath = join(options.workDir, `${options.item.number}.prompt.md`);
   const outputPath = join(options.workDir, `${options.item.number}.json`);
   const prompt =
     options.prompt ??
-    promptFor(options.item, options.context, options.git, options.additionalPrompt);
+    buildReviewPrompt(options.item, options.context, options.git, options.additionalPrompt, {
+      proofScratchDir,
+    }).text;
   writeFileSync(promptPath, prompt, "utf8");
   const dirtyBefore = openclawDirtyStatus(options.openclawDir);
   if (dirtyBefore) {
@@ -3470,12 +3485,17 @@ function runCodex(options: {
       outputPath,
       "--sandbox",
       options.sandboxMode,
+      "--add-dir",
+      proofScratchDir,
       "-",
     ],
     {
       cwd: options.openclawDir,
       encoding: "utf8",
-      env: codexEnv(),
+      env: {
+        ...codexEnv({ ghToken: process.env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN }),
+        CLAWSWEEPER_PROOF_SCRATCH_DIR: proofScratchDir,
+      },
       input: prompt,
       maxBuffer: 128 * 1024 * 1024,
       timeout: options.timeoutMs,
@@ -4146,6 +4166,18 @@ function publicSecurityReviewLine(review: SecurityReview): string {
   return `${prefix}: ${sentence(review.summary)}`;
 }
 
+function realBehaviorProofReReviewGuidance(): string {
+  return "After adding proof, update the PR body; ClawSweeper should re-review automatically. If it does not, ask a maintainer to comment `@clawsweeper re-review`.";
+}
+
+// Returns the blocker text (summary, else fallback) plus re-review guidance if not already there.
+function realBehaviorProofBlockerSummary(summary: string, fallback: string): string {
+  const body = sentence(summary) || fallback;
+  // No \b before "@": it is a non-word character, so \b only matches there after a word character.
+  const guided = /(?:@clawsweeper re-review|\bre-review automatically|\bupdate the PR body)\b/i;
+  return guided.test(body) ? body : `${body} ${realBehaviorProofReReviewGuidance()}`;
+}
+
 function publicRealBehaviorProofLine(proof: RealBehaviorProof): string {
   const summary = sentence(proof.summary);
   switch (proof.status) {
@@ -4154,20 +4186,20 @@ function publicRealBehaviorProofLine(proof: RealBehaviorProof): string {
     case "override":
       return `Override: ${summary || "A maintainer applied proof: override."}`;
     case "missing":
-      return `Needs real behavior proof before merge: ${
-        summary ||
-        "the PR must include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count."
-      }`;
+      return `Needs real behavior proof before merge: ${realBehaviorProofBlockerSummary(
+        summary,
+        "The PR must include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count.",
+      )}`;
     case "mock_only":
-      return `Needs real behavior proof before merge: ${
-        summary ||
-        "tests, mocks, snapshots, lint, typechecks, and CI are supplemental only. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count."
-      }`;
+      return `Needs real behavior proof before merge: ${realBehaviorProofBlockerSummary(
+        summary,
+        "Tests, mocks, snapshots, lint, typechecks, and CI are supplemental only. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count.",
+      )}`;
     case "insufficient":
-      return `Needs stronger real behavior proof before merge: ${
-        summary ||
-        "include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count."
-      }`;
+      return `Needs stronger real behavior proof before merge: ${realBehaviorProofBlockerSummary(
+        summary,
+        "Include after-fix evidence from a real setup. Terminal screenshots, console output, copied live output, linked artifacts, and redacted logs count.",
+      )}`;
     case "not_applicable":
       return summary ? `Not applicable: ${summary}` : "";
   }
@@ -4448,6 +4480,46 @@ function reportRealBehaviorProof(markdown: string): RealBehaviorProof {
   };
 }
 
+// Drops the proof-sufficient label, then appends it again only for a sufficient verdict.
+function nextRealBehaviorProofSufficientLabels(
+  labels: readonly string[],
+  proof: Pick<RealBehaviorProof, "status">,
+): string[] {
+  const others = labels.filter((name) => name !== PROOF_SUFFICIENT_LABEL);
+  return proof.status === "sufficient" ? [...others, PROOF_SUFFICIENT_LABEL] : others;
+}
+
+// Exported for tests: status strings outside the known set are treated as "not_applicable".
+export function realBehaviorProofSufficientLabelsForTest(
+  labels: readonly string[],
+  status: string,
+): string[] {
+  const known = REAL_BEHAVIOR_PROOF_STATUSES.has(status as RealBehaviorProofStatus);
+  const proofStatus = known ? (status as RealBehaviorProofStatus) : "not_applicable";
+  return nextRealBehaviorProofSufficientLabels(labels, { status: proofStatus });
+}
+
+/**
+ * Syncs the proof-sufficient label on an item with the proof verdict. Calls `gh issue edit`
+ * only when the label's presence must change and this is not a dry run.
+ */
+function syncRealBehaviorProofSufficientLabel(options: {
+  number: number;
+  labels: readonly string[];
+  proof: Pick<RealBehaviorProof, "status">;
+  dryRun: boolean;
+}): string[] {
+  const { number, labels, proof, dryRun } = options;
+  const nextLabels = nextRealBehaviorProofSufficientLabels(labels, proof);
+  const hadLabel = labels.includes(PROOF_SUFFICIENT_LABEL);
+  const wantsLabel = nextLabels.includes(PROOF_SUFFICIENT_LABEL);
+  // Nothing to send when the label already matches the verdict, or on a dry run.
+  if (hadLabel === wantsLabel || dryRun) return nextLabels;
+  const labelFlag = wantsLabel ? "--add-label" : "--remove-label";
+  ghWithRetry(["issue", "edit", String(number), labelFlag, PROOF_SUFFICIENT_LABEL]);
+  return nextLabels;
+}
+
 function isAutomationReportAuthor(author: string | undefined): boolean {
   return Boolean(author && (/\[bot\]$/i.test(author) || author.startsWith("app/")));
 }
@@ -5969,7 +6041,9 @@ function reviewCommand(args: Args): void {
     const contextStartedAt = Date.now();
     const context = collectItemContext(item);
     const contextElapsedMs = Date.now() - contextStartedAt;
-    const prompt = buildReviewPrompt(item, context, git, additionalPrompt);
+    const codexWorkDir = join(artifactDir, "codex");
+    const proofScratchDir = join(codexWorkDir, "proof-scratch", String(item.number));
+    const prompt = buildReviewPrompt(item, context, git, additionalPrompt, { proofScratchDir });
     const snapshotHash = itemSnapshotHash(item, context);
     try {
       const startComment = postReviewStartStatusComment({
@@ -6003,8 +6077,9 @@ function reviewCommand(args: Args): void {
         sandboxMode,
         serviceTier,
         timeoutMs,
-        workDir: join(artifactDir, "codex"),
+        workDir: codexWorkDir,
         additionalPrompt,
+        proofScratchDir,
         prompt: prompt.text,
       });
     } catch (error) {
@@ -6197,6 +6272,14 @@ function applyDecisionsCommand(args: Args): void {
       if (processedCount >= processedLimit) break;
       continue;
     }
+    if (state === "open" && item.kind === "pull_request") {
+      item.labels = syncRealBehaviorProofSufficientLabel({
+        number,
+        labels: item.labels,
+        proof: reportRealBehaviorProof(markdown),
+        dryRun,
+      });
+    }
     markdown = replaceFrontMatterValue(markdown, "labels", JSON.stringify(item.labels));
     const reviewComment = renderReviewCommentFromReport(markdown, closeReason ?? "none");
     const existingReviewComment = issueReviewComment(number, [
diff --git a/src/codex-env.ts b/src/codex-env.ts
index 1bc69ca052..adb62cf131 100644
--- a/src/codex-env.ts
+++ b/src/codex-env.ts
@@ -8,6 +8,7 @@ export function codexEnv(options: CodexEnvOptions = {}): NodeJS.ProcessEnv {
   delete env.GH_TOKEN;
   delete env.GITHUB_TOKEN;
   delete env.COMMIT_SWEEPER_TARGET_GH_TOKEN;
+  delete env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN;
   delete env.CLAWSWEEPER_APP_ID;
   delete env.CLAWSWEEPER_APP_PRIVATE_KEY;
   delete env.OPENAI_API_KEY;
diff --git a/test/clawsweeper.test.ts b/test/clawsweeper.test.ts
index a7835b1216..4a2ec88152 100644
--- a/test/clawsweeper.test.ts
+++ b/test/clawsweeper.test.ts
@@ -31,6 +31,7 @@ import {
   parseGhJsonLines,
   parseDecision,
   protectedLabels,
+  realBehaviorProofSufficientLabelsForTest,
   relatedTitleSearchTerms,
   renderReviewStartStatusComment,
   reviewArtifactDestination,
@@ -1875,6 +1876,8 @@ Full review comments:
   assert.match(comment, /Codex review: needs real behavior proof before merge\./);
   assert.match(comment, /\*\*Real behavior proof\*\*/);
   assert.match(comment, /terminal screenshots, console output, copied live output/);
+  assert.match(comment, /update the PR body; ClawSweeper should re-review automatically/);
+  assert.match(comment, /@clawsweeper re-review/);
   assert.match(markers, /clawsweeper-verdict:needs-human/);
   assert.doesNotMatch(markers, /clawsweeper-verdict:pass/);
   assert.doesNotMatch(markers, /clawsweeper-action:fix-required/);
@@ -2494,6 +2497,11 @@ test("review prompt requires real behavior proof for PR reviews", () => {
 
   assert.match(prompt, /realBehaviorProof/);
   assert.match(prompt, /Terminal screenshots|terminal screenshots/);
+  assert.match(prompt, /download\/open GitHub attachment links/);
+  assert.match(prompt, /generate stills or contact sheets from videos/);
+  assert.match(prompt, /compare the proof against the PR diff/);
+  assert.match(prompt, /scratch directory/);
+  assert.match(prompt, /@clawsweeper re-review/);
   assert.match(
     prompt,
     /Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only/,
@@ -2501,6 +2509,29 @@ test("review prompt requires real behavior proof for PR reviews", () => {
   assert.match(prompt, /do not request ClawSweeper repair markers/);
 });
 
+test("ClawSweeper proof judgement controls the sufficient proof label", () => {
+  assert.deepEqual(realBehaviorProofSufficientLabelsForTest(["proof: supplied"], "sufficient"), [
+    "proof: supplied",
+    "proof: sufficient",
+  ]);
+  assert.deepEqual(
+    realBehaviorProofSufficientLabelsForTest(
+      ["proof: supplied", "proof: sufficient"],
+      "insufficient",
+    ),
+    ["proof: supplied"],
+  );
+  assert.deepEqual(realBehaviorProofSufficientLabelsForTest(["proof: sufficient"], "missing"), []);
+});
+
+test("review workflow gives Codex a read-only inspection token", () => {
+  const workflow = readFileSync(".github/workflows/sweep.yml", "utf8");
+  // Whole-file text matches: they do not tie the permission line to the inspection-token step.
+  assert.match(workflow, /id: codex-inspection-token/);
+  assert.match(workflow, /permission-issues: read/);
+  assert.match(workflow, /CLAWSWEEPER_PROOF_INSPECTION_TOKEN/);
+});
+
 test("review prompt asks for concise public review fields", () => {
   const prompt = readFileSync("prompts/review-item.md", "utf8");
 
@@ -2608,6 +2639,7 @@ test("codex subprocess env strips GitHub and App credentials", () => {
     process.env.GH_TOKEN = "gh";
     process.env.GITHUB_TOKEN = "github";
     process.env.COMMIT_SWEEPER_TARGET_GH_TOKEN = "target";
+    process.env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN = "codex-target";
     process.env.CLAWSWEEPER_APP_ID = "123";
     process.env.CLAWSWEEPER_APP_PRIVATE_KEY = "private";
     process.env.OPENAI_API_KEY = "openai";
@@ -2618,6 +2650,7 @@ test("codex subprocess env strips GitHub and App credentials", () => {
     assert.equal(env.GH_TOKEN, undefined);
     assert.equal(env.GITHUB_TOKEN, undefined);
     assert.equal(env.COMMIT_SWEEPER_TARGET_GH_TOKEN, undefined);
+    assert.equal(env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN, undefined);
     assert.equal(env.CLAWSWEEPER_APP_ID, undefined);
     assert.equal(env.CLAWSWEEPER_APP_PRIVATE_KEY, undefined);
     assert.equal(env.OPENAI_API_KEY, undefined);
@@ -2634,12 +2667,14 @@ test("codex subprocess env can expose an explicit read-only GitHub token", () =>
     process.env.GH_TOKEN = "ambient";
     process.env.GITHUB_TOKEN = "github";
     process.env.COMMIT_SWEEPER_TARGET_GH_TOKEN = "hidden";
+    process.env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN = "hidden-codex";
 
     const env = codexEnv({ ghToken: "target-read" });
 
     assert.equal(env.GH_TOKEN, "target-read");
     assert.equal(env.GITHUB_TOKEN, undefined);
     assert.equal(env.COMMIT_SWEEPER_TARGET_GH_TOKEN, undefined);
+    assert.equal(env.CLAWSWEEPER_PROOF_INSPECTION_TOKEN, undefined);
     assert.equal(env.GIT_OPTIONAL_LOCKS, "0");
   } finally {
     process.env = originalEnv;