Skip to content

Commit a999d82

Browse files
Adds cost reporting
1 parent 1d8da4f commit a999d82

File tree

2 files changed

+41
-16
lines changed

2 files changed

+41
-16
lines changed

tests/eval/DESIGN.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,8 @@ These are copied into the temp clone so that any local modifications to the revi
155155

156156
## Phase 2
157157

158-
### Cost Tracking
158+
### Cost Tracking ✅ IMPLEMENTED
159+
159160

160161
Use `--output-format json` to capture `total_cost_usd` from each Claude invocation. Accumulate across all calls (review + judge) and print the total in `AfterSuite`.
161162

@@ -254,4 +255,4 @@ The API review step is the slowest part of the eval suite. Options to improve:
254255

255256
3. **Parallel test execution** - Run golden tests in parallel (requires separate repo clones per test).
256257

257-
4. **Smaller/faster model for development** - Use Haiku for rapid iteration, Sonnet/Opus for CI validation.
258+
4. **Smaller/faster model for development** - Use Haiku for rapid iteration, Sonnet/Opus for CI validation.

tests/eval/eval_test.go

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,22 @@ or
6464
)
6565

6666
var (
67-
tempDir string
68-
localRepoRoot string
69-
testCases []string
70-
goldenModel string
71-
integrationModel string
72-
judgeModel string
67+
tempDir string
68+
localRepoRoot string
69+
testCases []string
70+
goldenModel string
71+
integrationModel string
72+
judgeModel string
73+
totalReviewerCost float64
74+
totalJudgeCost float64
7375
)
7476

77+
type claudeOutput struct {
78+
Type string `json:"type"`
79+
Result string `json:"result"`
80+
TotalCostUSD float64 `json:"total_cost_usd"`
81+
}
82+
7583
func TestEval(t *testing.T) {
7684
RegisterFailHandler(Fail)
7785
RunSpecs(t, "API Review Eval Suite")
@@ -123,6 +131,7 @@ var _ = AfterSuite(func() {
123131
By("cleaning up temp directory")
124132
os.RemoveAll(tempDir)
125133
}
134+
fmt.Printf("\nTotal Cost: $%.4f (Reviewer: $%.4f, Judge: $%.4f)\n", totalReviewerCost+totalJudgeCost, totalReviewerCost, totalJudgeCost)
126135
})
127136

128137
func copyLocalFiles() {
@@ -251,7 +260,7 @@ func readAndApplyPatch(patchPath string) {
251260
}
252261

253262
// runAPIReview and runJudge can probably share some common code.
254-
func runAPIReview(model string) string {
263+
func runAPIReview(model string) (string, float64) {
255264
By(fmt.Sprintf("running API review via Claude (%s)", model))
256265
ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout)
257266
defer cancel()
@@ -262,15 +271,22 @@ func runAPIReview(model string) string {
262271
"--model", model,
263272
"-p", "/api-review",
264273
"--allowedTools", "Bash,Read,Grep,Glob,Task",
274+
"--output-format", "json",
265275
)
266276
cmd.Dir = tempDir
267277

268278
output, err := cmd.CombinedOutput()
269279
Expect(err).NotTo(HaveOccurred(), "claude command failed: %s", string(output))
270-
return string(output)
280+
281+
var parsed claudeOutput
282+
err = json.Unmarshal(output, &parsed)
283+
Expect(err).NotTo(HaveOccurred(), "failed to parse claude output: %s", string(output))
284+
285+
totalReviewerCost += parsed.TotalCostUSD
286+
return parsed.Result, parsed.TotalCostUSD
271287
}
272288

273-
func runJudge(model, reviewOutput, expectedIssues string) evalResult {
289+
func runJudge(model, reviewOutput, expectedIssues string) (evalResult, float64) {
274290
By(fmt.Sprintf("comparing results with Claude judge (%s)", model))
275291
ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout)
276292
defer cancel()
@@ -281,17 +297,24 @@ func runJudge(model, reviewOutput, expectedIssues string) evalResult {
281297
"--dangerously-skip-permissions",
282298
"--model", model,
283299
"-p", prompt,
300+
"--output-format", "json",
284301
)
285302
cmd.Dir = tempDir
286303

287304
output, err := cmd.CombinedOutput()
288305
Expect(err).NotTo(HaveOccurred(), "claude judge command failed: %s", string(output))
289306

307+
var parsed claudeOutput
308+
err = json.Unmarshal(output, &parsed)
309+
Expect(err).NotTo(HaveOccurred(), "failed to parse judge output: %s", string(output))
310+
311+
totalJudgeCost += parsed.TotalCostUSD
312+
290313
var result evalResult
291-
jsonStr := stripMarkdownCodeBlock(string(output))
314+
jsonStr := stripMarkdownCodeBlock(parsed.Result)
292315
err = json.Unmarshal([]byte(jsonStr), &result)
293-
Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", string(output))
294-
return result
316+
Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", parsed.Result)
317+
return result, parsed.TotalCostUSD
295318
}
296319

297320
func runTestCase(tier, tc, reviewModel, judgeModelName string) {
@@ -304,9 +327,10 @@ func runTestCase(tier, tc, reviewModel, judgeModelName string) {
304327
Expect(err).NotTo(HaveOccurred())
305328
expectedIssues := strings.TrimSpace(string(expectedContent))
306329

307-
reviewOutput := runAPIReview(reviewModel)
308-
result := runJudge(judgeModelName, reviewOutput, expectedIssues)
330+
reviewOutput, reviewCost := runAPIReview(reviewModel)
331+
result, judgeCost := runJudge(judgeModelName, reviewOutput, expectedIssues)
309332

333+
GinkgoWriter.Printf("Cost: Reviewer=$%.4f, Judge=$%.4f, Total=$%.4f\n", reviewCost, judgeCost, reviewCost+judgeCost)
310334
GinkgoWriter.Printf("Judge result: pass=%v, reason=%s\n", result.Pass, result.Reason)
311335
Expect(result.Pass).To(BeTrue(), "API review did not match expected issues.\nJudge reason: %s\nReview output:\n%s\nExpected issues:\n%s", result.Reason, reviewOutput, expectedIssues)
312336
}

0 commit comments

Comments
 (0)