Skip to content

Commit 0ce807d

Browse files
feat: cross-agent diff review before auto-merge (#37)
* feat: cross-agent diff review before auto-merge When a coding agent completes a task successfully, spawn a different agent to review the git diff before merging to main. Claude codes → Codex reviews, Codex codes → Claude reviews. - Add reviewDiffWithAgent() to AgentRunner with structured JSON prompt - Add pickReviewAgent() for cross-agent selection logic - Add parseReviewResponse() with non-greedy regex for robust JSON extraction - Insert review gate in scheduler's executeAndRelease() between task success and pool.release(merge) - Review runs in /tmp to prevent reviewer from modifying the worktree - Falls back to heuristic reviewDiff() if agent fails or times out - Add ReviewResult.approve field to gate merge decisions - Add task.review field to persist review results - 14 new tests (299 total, 0 failures) Closes #37 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(review): address round-2 review issues - Remove unused `cwd` param from reviewDiffWithAgent() - Fix JSON extraction: use indexOf/lastIndexOf instead of regex (handles braces in strings correctly) - Add task.error message when review rejects merge - Add review_started event assertions to scheduler tests Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * refactor(review): extract normalizeReviewObj, add reviewAgent to result - Extract duplicated JSON normalisation into normalizeReviewObj() - Add reviewAgent field to ReviewResult so consumers know which agent reviewed - Update Task.review type to include reviewAgent Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * refactor(review): move ReviewResult to types.ts, add missing tests - Move ReviewResult interface to types.ts (single source of truth) - Task.review now references ReviewResult instead of inline duplicate - Re-export ReviewResult from agent-runner.ts for backward compat - Add test: normalizeReviewObj returns null for missing/wrong-type fields - Add test: scheduler catch block when reviewDiffWithAgent throws Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ded33ec commit 0ce807d

4 files changed

Lines changed: 442 additions & 9 deletions

File tree

src/__tests__/agent-runner.test.ts

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { describe, it, before, after } from "node:test";
22
import assert from "node:assert/strict";
3-
import { AgentRunner } from "../agent-runner.js";
3+
import { AgentRunner, type ReviewResult } from "../agent-runner.js";
44
import { createTask } from "../types.js";
55
import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs";
66
import { join } from "node:path";
@@ -345,4 +345,97 @@ describe("AgentRunner", () => {
345345
assert.strictEqual(task.events[0].type, "evt-5");
346346
assert.strictEqual(task.events[199].type, "evt-204");
347347
});
348+
349+
// ── pickReviewAgent ──
350+
351+
it("pickReviewAgent selects codex to review claude's work", () => {
352+
assert.strictEqual(AgentRunner.pickReviewAgent("claude"), "codex");
353+
});
354+
355+
it("pickReviewAgent selects claude to review codex's work", () => {
356+
assert.strictEqual(AgentRunner.pickReviewAgent("codex"), "claude");
357+
});
358+
359+
it("pickReviewAgent selects codex to review claude-sdk's work", () => {
360+
assert.strictEqual(AgentRunner.pickReviewAgent("claude-sdk"), "codex");
361+
});
362+
363+
it("pickReviewAgent selects claude for generic agents", () => {
364+
assert.strictEqual(AgentRunner.pickReviewAgent("aider --yes"), "claude");
365+
assert.strictEqual(AgentRunner.pickReviewAgent("custom-agent"), "claude");
366+
});
367+
368+
// ── parseReviewResponse ──
369+
370+
it("parseReviewResponse parses clean JSON", () => {
371+
const runner = new AgentRunner();
372+
const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner);
373+
const result = parse('{"approve": true, "score": 85, "issues": [], "suggestions": ["looks good"]}');
374+
assert.ok(result);
375+
assert.strictEqual(result.approve, true);
376+
assert.strictEqual(result.score, 85);
377+
assert.deepStrictEqual(result.suggestions, ["looks good"]);
378+
});
379+
380+
it("parseReviewResponse extracts JSON from surrounding text", () => {
381+
const runner = new AgentRunner();
382+
const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner);
383+
const result = parse('Here is my review:\n{"approve": false, "score": 30, "issues": ["bug found"], "suggestions": []}\nDone.');
384+
assert.ok(result);
385+
assert.strictEqual(result.approve, false);
386+
assert.strictEqual(result.score, 30);
387+
assert.deepStrictEqual(result.issues, ["bug found"]);
388+
});
389+
390+
it("parseReviewResponse returns null for unparseable output", () => {
391+
const runner = new AgentRunner();
392+
const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner);
393+
assert.strictEqual(parse("I can't produce JSON right now"), null);
394+
});
395+
396+
it("parseReviewResponse clamps score to 0-100", () => {
397+
const runner = new AgentRunner();
398+
const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner);
399+
const result = parse('{"approve": true, "score": 150, "issues": [], "suggestions": []}');
400+
assert.ok(result);
401+
assert.strictEqual(result.score, 100);
402+
});
403+
404+
it("parseReviewResponse returns null when approve or score is missing/wrong type", () => {
405+
const runner = new AgentRunner();
406+
const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner);
407+
assert.strictEqual(parse('{"score": 80, "issues": []}'), null); // missing approve
408+
assert.strictEqual(parse('{"approve": "yes", "score": 80}'), null); // approve is string
409+
assert.strictEqual(parse('{"approve": true, "issues": []}'), null); // missing score
410+
});
411+
412+
// ── reviewDiff approve field ──
413+
414+
it("reviewDiff includes approve field based on score threshold", () => {
415+
const runner = new AgentRunner();
416+
const clean = runner.reviewDiff("diff --git a/foo.ts\n+const x = 1;");
417+
assert.strictEqual(typeof clean.approve, "boolean");
418+
assert.strictEqual(clean.approve, true); // score=50 >= 40
419+
420+
const bad = runner.reviewDiff("diff --git a/foo.ts\n+console.log('x');\n+console.log('y');\n+console.log('z');");
421+
// console.log penalty takes score below 40
422+
// Actually score=50-10=40, still >=40
423+
assert.strictEqual(typeof bad.approve, "boolean");
424+
});
425+
426+
// ── reviewDiffWithAgent fallback ──
427+
428+
it("reviewDiffWithAgent falls back to heuristic when agent fails", async () => {
429+
const runner = new AgentRunner();
430+
// codex not available in test env → will fail → should fall back to heuristic
431+
const result = await runner.reviewDiffWithAgent(
432+
"diff --git a/foo.test.ts b/foo.test.ts\n+it('works', () => {});",
433+
"claude", // task was by claude → review by codex → codex fails → fallback
434+
2, // 2 second timeout to keep test fast
435+
);
436+
assert.strictEqual(typeof result.approve, "boolean");
437+
assert.strictEqual(typeof result.score, "number");
438+
assert.ok(Array.isArray(result.issues));
439+
assert.ok(Array.isArray(result.suggestions));
440+
});
348441
});

src/__tests__/scheduler.test.ts

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ function makeRunner(): AgentRunner {
2121
return {
2222
run: async (task: Task) => { task.status = "success"; task.durationMs = 100; return task; },
2323
getRunningTasks: () => [],
24+
reviewDiffWithAgent: async () => ({ approve: true, score: 80, issues: [], suggestions: [] }),
2425
} as unknown as AgentRunner;
2526
}
2627

@@ -845,4 +846,208 @@ describe("Scheduler", () => {
845846
assert.ok(result);
846847
});
847848
});
849+
850+
// ─── 11. Cross-agent review gate ───
851+
852+
describe("cross-agent review gate", () => {
853+
it("review gate blocks merge when review rejects", async () => {
854+
let mergeCalledWith = false;
855+
const pool = {
856+
...makePool(),
857+
release: async (_name: string, merge: boolean) => {
858+
mergeCalledWith = merge;
859+
return { merged: merge };
860+
},
861+
} as unknown as WorktreePool;
862+
863+
const runner = {
864+
run: async (task: Task) => {
865+
task.status = "success";
866+
task.durationMs = 100;
867+
// Simulate agent creating a git diff event
868+
task.events.push({
869+
type: "git_diff",
870+
timestamp: new Date().toISOString(),
871+
data: { diff: "diff --git a/foo.ts\n+console.log('debug');" },
872+
});
873+
return task;
874+
},
875+
getRunningTasks: () => [],
876+
reviewDiffWithAgent: async () => ({
877+
approve: false,
878+
score: 25,
879+
issues: ["console.log found"],
880+
suggestions: [],
881+
}),
882+
} as unknown as AgentRunner;
883+
884+
const events: Record<string, unknown>[] = [];
885+
const s = new Scheduler(pool, runner, makeStore(), (ev) => events.push(ev));
886+
887+
// Manually invoke executeAndRelease
888+
const task = s.submit("test review gate");
889+
const exec = (s as any).executeAndRelease.bind(s);
890+
await exec(task, "w0", "/tmp/w0");
891+
892+
// Task should still be "success" (review doesn't change status)
893+
assert.strictEqual(task.status, "success");
894+
// But merge should have been blocked
895+
assert.strictEqual(mergeCalledWith, false);
896+
// Review should be attached to task
897+
assert.ok(task.review);
898+
assert.strictEqual(task.review!.approve, false);
899+
assert.strictEqual(task.review!.score, 25);
900+
// Error should explain why merge was blocked
901+
assert.ok(task.error.includes("review rejected"), "error should explain rejection");
902+
// Should have emitted review_started and review_rejected events
903+
assert.ok(events.some(e => e.type === "review_started"), "should emit review_started");
904+
assert.ok(events.some(e => e.type === "review_rejected"), "should emit review_rejected");
905+
});
906+
907+
it("review gate allows merge when review approves", async () => {
908+
let mergeCalledWith = false;
909+
const pool = {
910+
...makePool(),
911+
release: async (_name: string, merge: boolean) => {
912+
mergeCalledWith = merge;
913+
return { merged: true };
914+
},
915+
} as unknown as WorktreePool;
916+
917+
const runner = {
918+
run: async (task: Task) => {
919+
task.status = "success";
920+
task.durationMs = 100;
921+
task.events.push({
922+
type: "git_diff",
923+
timestamp: new Date().toISOString(),
924+
data: { diff: "diff --git a/foo.ts\n+const x = 1;" },
925+
});
926+
return task;
927+
},
928+
getRunningTasks: () => [],
929+
reviewDiffWithAgent: async () => ({
930+
approve: true,
931+
score: 85,
932+
issues: [],
933+
suggestions: ["looks clean"],
934+
}),
935+
} as unknown as AgentRunner;
936+
937+
const events: Record<string, unknown>[] = [];
938+
const s = new Scheduler(pool, runner, makeStore(), (ev) => events.push(ev));
939+
940+
const task = s.submit("test approve");
941+
const exec = (s as any).executeAndRelease.bind(s);
942+
await exec(task, "w0", "/tmp/w0");
943+
944+
assert.strictEqual(task.status, "success");
945+
assert.strictEqual(mergeCalledWith, true);
946+
assert.ok(task.review);
947+
assert.strictEqual(task.review!.approve, true);
948+
assert.ok(events.some(e => e.type === "review_started"), "should emit review_started");
949+
assert.ok(events.some(e => e.type === "review_approved"), "should emit review_approved");
950+
});
951+
952+
it("skips review when task has no diff", async () => {
953+
let mergeCalledWith = false;
954+
const pool = {
955+
...makePool(),
956+
release: async (_name: string, merge: boolean) => {
957+
mergeCalledWith = merge;
958+
return { merged: true };
959+
},
960+
} as unknown as WorktreePool;
961+
962+
const runner = {
963+
run: async (task: Task) => {
964+
task.status = "success";
965+
task.durationMs = 100;
966+
// No git_diff event
967+
return task;
968+
},
969+
getRunningTasks: () => [],
970+
reviewDiffWithAgent: async () => { throw new Error("should not be called"); },
971+
} as unknown as AgentRunner;
972+
973+
const s = new Scheduler(pool, runner, makeStore());
974+
975+
const task = s.submit("no diff task");
976+
const exec = (s as any).executeAndRelease.bind(s);
977+
await exec(task, "w0", "/tmp/w0");
978+
979+
// Should merge without review since there's no diff
980+
assert.strictEqual(mergeCalledWith, true);
981+
assert.strictEqual(task.review, undefined);
982+
});
983+
984+
it("skips review when task failed", async () => {
985+
let mergeCalledWith = false;
986+
const pool = {
987+
...makePool(),
988+
release: async (_name: string, merge: boolean) => {
989+
mergeCalledWith = merge;
990+
return { merged: false };
991+
},
992+
} as unknown as WorktreePool;
993+
994+
const runner = {
995+
run: async (task: Task) => {
996+
task.status = "failed";
997+
task.error = "compilation error";
998+
task.durationMs = 50;
999+
return task;
1000+
},
1001+
getRunningTasks: () => [],
1002+
reviewDiffWithAgent: async () => { throw new Error("should not be called"); },
1003+
} as unknown as AgentRunner;
1004+
1005+
const s = new Scheduler(pool, runner, makeStore());
1006+
1007+
const task = s.submit("failed task");
1008+
const exec = (s as any).executeAndRelease.bind(s);
1009+
await exec(task, "w0", "/tmp/w0");
1010+
1011+
assert.strictEqual(mergeCalledWith, false);
1012+
assert.strictEqual(task.review, undefined);
1013+
});
1014+
1015+
it("falls back gracefully when reviewDiffWithAgent throws", async () => {
1016+
let mergeCalledWith = false;
1017+
const pool = {
1018+
...makePool(),
1019+
release: async (_name: string, merge: boolean) => {
1020+
mergeCalledWith = merge;
1021+
return { merged: false };
1022+
},
1023+
} as unknown as WorktreePool;
1024+
1025+
const runner = {
1026+
run: async (task: Task) => {
1027+
task.status = "success";
1028+
task.durationMs = 100;
1029+
task.events.push({
1030+
type: "git_diff",
1031+
timestamp: new Date().toISOString(),
1032+
data: { diff: "diff --git a/foo.ts\n+const x = 1;" },
1033+
});
1034+
return task;
1035+
},
1036+
getRunningTasks: () => [],
1037+
reviewDiffWithAgent: async () => { throw new Error("network error"); },
1038+
} as unknown as AgentRunner;
1039+
1040+
const s = new Scheduler(pool, runner, makeStore());
1041+
1042+
const task = s.submit("review throws");
1043+
task.maxRetries = 0; // prevent retry so we can assert final status
1044+
const exec = (s as any).executeAndRelease.bind(s);
1045+
await exec(task, "w0", "/tmp/w0");
1046+
1047+
// The outer catch should mark the task as failed
1048+
assert.strictEqual(task.status, "failed");
1049+
assert.ok(task.error.includes("network error"));
1050+
assert.strictEqual(mergeCalledWith, false);
1051+
});
1052+
});
8481053
});

0 commit comments

Comments
 (0)