6363)
6464
// Suite-wide state shared by the specs and the suite-level hooks.
var (
	// tempDir is the working directory the claude commands run in; it is
	// removed again in AfterSuite.
	tempDir          string
	localRepoRoot    string
	testCases        []string
	goldenModel      string
	integrationModel string
	judgeModel       string

	// Running cost totals in USD. runAPIReview adds to totalReviewerCost and
	// runJudge adds to totalJudgeCost; AfterSuite prints both plus their sum.
	// NOTE(review): the totals are updated without synchronization, which
	// assumes specs in one process run serially - confirm before parallelizing.
	totalReviewerCost float64
	totalJudgeCost    float64
)
7375
// claudeOutput holds the fields this suite decodes from the JSON document the
// claude CLI prints when it is invoked with "--output-format json".
type claudeOutput struct {
	// Type is decoded from the "type" key; none of the visible code reads it.
	Type string `json:"type"`
	// Result is the response text: the review itself in runAPIReview, and the
	// judge's verdict (JSON, possibly wrapped in a markdown code block) in
	// runJudge.
	Result string `json:"result"`
	// TotalCostUSD is the cost reported for the invocation; callers add it to
	// the suite-wide reviewer/judge totals.
	TotalCostUSD float64 `json:"total_cost_usd"`
}
81+
7482func TestEval (t * testing.T ) {
7583 RegisterFailHandler (Fail )
7684 RunSpecs (t , "API Review Eval Suite" )
@@ -122,6 +130,7 @@ var _ = AfterSuite(func() {
122130 By ("cleaning up temp directory" )
123131 os .RemoveAll (tempDir )
124132 }
133+ fmt .Printf ("\n Total Cost: $%.4f (Reviewer: $%.4f, Judge: $%.4f)\n " , totalReviewerCost + totalJudgeCost , totalReviewerCost , totalJudgeCost )
125134})
126135
127136func copyLocalFiles () {
@@ -250,7 +259,7 @@ func readAndApplyPatch(patchPath string) {
250259}
251260
252261// runAPIReview and runJudge can probably share some common code.
253- func runAPIReview (model string ) string {
262+ func runAPIReview (model string ) ( string , float64 ) {
254263 By (fmt .Sprintf ("running API review via Claude (%s)" , model ))
255264 ctx , cancel := context .WithTimeout (context .Background (), claudeTimeout )
256265 defer cancel ()
@@ -261,15 +270,22 @@ func runAPIReview(model string) string {
261270 "--model" , model ,
262271 "-p" , "/api-review" ,
263272 "--allowedTools" , "Bash,Read,Grep,Glob,Task" ,
273+ "--output-format" , "json" ,
264274 )
265275 cmd .Dir = tempDir
266276
267277 output , err := cmd .CombinedOutput ()
268278 Expect (err ).NotTo (HaveOccurred (), "claude command failed: %s" , string (output ))
269- return string (output )
279+
280+ var parsed claudeOutput
281+ err = json .Unmarshal (output , & parsed )
282+ Expect (err ).NotTo (HaveOccurred (), "failed to parse claude output: %s" , string (output ))
283+
284+ totalReviewerCost += parsed .TotalCostUSD
285+ return parsed .Result , parsed .TotalCostUSD
270286}
271287
272- func runJudge (model , reviewOutput , expectedIssues string ) evalResult {
288+ func runJudge (model , reviewOutput , expectedIssues string ) ( evalResult , float64 ) {
273289 By (fmt .Sprintf ("comparing results with Claude judge (%s)" , model ))
274290 ctx , cancel := context .WithTimeout (context .Background (), claudeTimeout )
275291 defer cancel ()
@@ -280,17 +296,24 @@ func runJudge(model, reviewOutput, expectedIssues string) evalResult {
280296 "--dangerously-skip-permissions" ,
281297 "--model" , model ,
282298 "-p" , prompt ,
299+ "--output-format" , "json" ,
283300 )
284301 cmd .Dir = tempDir
285302
286303 output , err := cmd .CombinedOutput ()
287304 Expect (err ).NotTo (HaveOccurred (), "claude judge command failed: %s" , string (output ))
288305
306+ var parsed claudeOutput
307+ err = json .Unmarshal (output , & parsed )
308+ Expect (err ).NotTo (HaveOccurred (), "failed to parse judge output: %s" , string (output ))
309+
310+ totalJudgeCost += parsed .TotalCostUSD
311+
289312 var result evalResult
290- jsonStr := stripMarkdownCodeBlock (string ( output ) )
313+ jsonStr := stripMarkdownCodeBlock (parsed . Result )
291314 err = json .Unmarshal ([]byte (jsonStr ), & result )
292- Expect (err ).NotTo (HaveOccurred (), "failed to parse judge response as JSON: %s" , string ( output ) )
293- return result
315+ Expect (err ).NotTo (HaveOccurred (), "failed to parse judge response as JSON: %s" , parsed . Result )
316+ return result , parsed . TotalCostUSD
294317}
295318
296319func runTestCase (tier , tc , reviewModel , judgeModelName string ) {
@@ -303,9 +326,10 @@ func runTestCase(tier, tc, reviewModel, judgeModelName string) {
303326 Expect (err ).NotTo (HaveOccurred ())
304327 expectedIssues := strings .TrimSpace (string (expectedContent ))
305328
306- reviewOutput := runAPIReview (reviewModel )
307- result := runJudge (judgeModelName , reviewOutput , expectedIssues )
329+ reviewOutput , reviewCost := runAPIReview (reviewModel )
330+ result , judgeCost := runJudge (judgeModelName , reviewOutput , expectedIssues )
308331
332+ GinkgoWriter .Printf ("Cost: Reviewer=$%.4f, Judge=$%.4f, Total=$%.4f\n " , reviewCost , judgeCost , reviewCost + judgeCost )
309333 GinkgoWriter .Printf ("Judge result: pass=%v, reason=%s\n " , result .Pass , result .Reason )
310334 Expect (result .Pass ).To (BeTrue (), "API review did not match expected issues.\n Judge reason: %s\n Review output:\n %s\n Expected issues:\n %s" , result .Reason , reviewOutput , expectedIssues )
311335}
0 commit comments