Skip to content

Commit b69d5eb

Browse files
Adds cost reporting
1 parent f326165 commit b69d5eb

File tree

1 file changed

+38
-14
lines changed

1 file changed

+38
-14
lines changed

tests/eval/eval_test.go

Lines changed: 38 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -63,14 +63,22 @@ or
6363
)
6464

6565
var (
66-
tempDir string
67-
localRepoRoot string
68-
testCases []string
69-
goldenModel string
70-
integrationModel string
71-
judgeModel string
66+
tempDir string
67+
localRepoRoot string
68+
testCases []string
69+
goldenModel string
70+
integrationModel string
71+
judgeModel string
72+
totalReviewerCost float64
73+
totalJudgeCost float64
7274
)
7375

76+
type claudeOutput struct {
77+
Type string `json:"type"`
78+
Result string `json:"result"`
79+
TotalCostUSD float64 `json:"total_cost_usd"`
80+
}
81+
7482
func TestEval(t *testing.T) {
7583
RegisterFailHandler(Fail)
7684
RunSpecs(t, "API Review Eval Suite")
@@ -122,6 +130,7 @@ var _ = AfterSuite(func() {
122130
By("cleaning up temp directory")
123131
os.RemoveAll(tempDir)
124132
}
133+
fmt.Printf("\nTotal Cost: $%.4f (Reviewer: $%.4f, Judge: $%.4f)\n", totalReviewerCost+totalJudgeCost, totalReviewerCost, totalJudgeCost)
125134
})
126135

127136
func copyLocalFiles() {
@@ -250,7 +259,7 @@ func readAndApplyPatch(patchPath string) {
250259
}
251260

252261
// runAPIReview and runJudge can probably share some common code.
253-
func runAPIReview(model string) string {
262+
func runAPIReview(model string) (string, float64) {
254263
By(fmt.Sprintf("running API review via Claude (%s)", model))
255264
ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout)
256265
defer cancel()
@@ -261,15 +270,22 @@ func runAPIReview(model string) string {
261270
"--model", model,
262271
"-p", "/api-review",
263272
"--allowedTools", "Bash,Read,Grep,Glob,Task",
273+
"--output-format", "json",
264274
)
265275
cmd.Dir = tempDir
266276

267277
output, err := cmd.CombinedOutput()
268278
Expect(err).NotTo(HaveOccurred(), "claude command failed: %s", string(output))
269-
return string(output)
279+
280+
var parsed claudeOutput
281+
err = json.Unmarshal(output, &parsed)
282+
Expect(err).NotTo(HaveOccurred(), "failed to parse claude output: %s", string(output))
283+
284+
totalReviewerCost += parsed.TotalCostUSD
285+
return parsed.Result, parsed.TotalCostUSD
270286
}
271287

272-
func runJudge(model, reviewOutput, expectedIssues string) evalResult {
288+
func runJudge(model, reviewOutput, expectedIssues string) (evalResult, float64) {
273289
By(fmt.Sprintf("comparing results with Claude judge (%s)", model))
274290
ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout)
275291
defer cancel()
@@ -280,17 +296,24 @@ func runJudge(model, reviewOutput, expectedIssues string) evalResult {
280296
"--dangerously-skip-permissions",
281297
"--model", model,
282298
"-p", prompt,
299+
"--output-format", "json",
283300
)
284301
cmd.Dir = tempDir
285302

286303
output, err := cmd.CombinedOutput()
287304
Expect(err).NotTo(HaveOccurred(), "claude judge command failed: %s", string(output))
288305

306+
var parsed claudeOutput
307+
err = json.Unmarshal(output, &parsed)
308+
Expect(err).NotTo(HaveOccurred(), "failed to parse judge output: %s", string(output))
309+
310+
totalJudgeCost += parsed.TotalCostUSD
311+
289312
var result evalResult
290-
jsonStr := stripMarkdownCodeBlock(string(output))
313+
jsonStr := stripMarkdownCodeBlock(parsed.Result)
291314
err = json.Unmarshal([]byte(jsonStr), &result)
292-
Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", string(output))
293-
return result
315+
Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", parsed.Result)
316+
return result, parsed.TotalCostUSD
294317
}
295318

296319
func runTestCase(tier, tc, reviewModel, judgeModelName string) {
@@ -303,9 +326,10 @@ func runTestCase(tier, tc, reviewModel, judgeModelName string) {
303326
Expect(err).NotTo(HaveOccurred())
304327
expectedIssues := strings.TrimSpace(string(expectedContent))
305328

306-
reviewOutput := runAPIReview(reviewModel)
307-
result := runJudge(judgeModelName, reviewOutput, expectedIssues)
329+
reviewOutput, reviewCost := runAPIReview(reviewModel)
330+
result, judgeCost := runJudge(judgeModelName, reviewOutput, expectedIssues)
308331

332+
GinkgoWriter.Printf("Cost: Reviewer=$%.4f, Judge=$%.4f, Total=$%.4f\n", reviewCost, judgeCost, reviewCost+judgeCost)
309333
GinkgoWriter.Printf("Judge result: pass=%v, reason=%s\n", result.Pass, result.Reason)
310334
Expect(result.Pass).To(BeTrue(), "API review did not match expected issues.\nJudge reason: %s\nReview output:\n%s\nExpected issues:\n%s", result.Reason, reviewOutput, expectedIssues)
311335
}

0 commit comments

Comments
 (0)