1717import os
1818import sys
1919import time
20- from datetime import datetime
20+ from datetime import datetime , timezone
2121from typing import Any , Dict , List , Optional
2222
2323try :
2626 print ("ERROR: 'requests' package required. Install with: pip install requests" )
2727 sys .exit (2 )
2828
# Metric-name substrings whose scores are "inverted": lower is better, so
# such metrics pass when score <= threshold (consumed by is_inverted()).
INVERTED_KEYWORDS = ("bias", "toxicity", "hallucination", "conversationsafety")
30+
2931
3032def parse_args () -> argparse .Namespace :
3133 p = argparse .ArgumentParser (description = "VerifyWise CI/CD Evaluation Runner" )
@@ -95,8 +97,8 @@ def create_experiment(
9597 dataset_name = dataset_info .get ("name" , f"dataset-{ dataset_id } " )
9698 print (f"Resolved dataset '{ dataset_name } ' -> { dataset_path } " )
9799
98- now = datetime .now (tz = __import__ ( 'datetime' ). timezone .utc )
99- experiment_name = name or f"CI Eval — { now .strftime ('%Y-%m-%d %H:%M' )} "
100+ now = datetime .now (tz = timezone .utc )
101+ experiment_name = name or f"CI Eval -- { now .strftime ('%Y-%m-%d %H:%M' )} "
100102
101103 payload = {
102104 "project_id" : project_id ,
@@ -170,13 +172,18 @@ def poll_experiment(
170172 raise TimeoutError (f"Experiment did not complete within { timeout_minutes } minutes" )
171173
172174
def is_inverted(name: str) -> bool:
    """Return True when *name* denotes an inverted metric (lower score = better).

    A metric is inverted when its lowercased name contains any of the
    substrings listed in the module-level INVERTED_KEYWORDS tuple.
    """
    lowered = name.lower()
    for keyword in INVERTED_KEYWORDS:
        if keyword in lowered:
            return True
    return False
177+
178+
173179def parse_results (experiment : Dict [str , Any ], threshold : float ) -> Dict [str , Any ]:
174180 results = experiment .get ("results" , {})
175181 if isinstance (results , str ):
176182 results = json .loads (results )
177183
178184 avg_scores = results .get ("avg_scores" , {})
179185 metric_thresholds_raw = results .get ("metric_thresholds" , {})
186+ detailed_results = results .get ("detailed_results" , [])
180187
181188 config = experiment .get ("config" , {})
182189 if isinstance (config , str ):
@@ -189,7 +196,7 @@ def parse_results(experiment: Dict[str, Any], threshold: float) -> Dict[str, Any
189196 score = float (score )
190197 mt = metric_thresholds_raw .get (name )
191198 mt = float (mt ) if mt is not None else threshold
192- inverted = any ( k in name . lower () for k in [ "bias" , "toxicity" , "hallucination" , "conversationsafety" ] )
199+ inverted = is_inverted ( name )
193200 passed = (score <= mt ) if inverted else (score >= mt )
194201 if not passed :
195202 all_passed = False
@@ -201,6 +208,31 @@ def parse_results(experiment: Dict[str, Any], threshold: float) -> Dict[str, Any
201208 "inverted" : inverted ,
202209 })
203210
211+ samples = []
212+ for i , sample in enumerate (detailed_results ):
213+ sample_entry = {
214+ "index" : i + 1 ,
215+ "input" : sample .get ("input" , "" ),
216+ "output" : sample .get ("output" , "" ),
217+ "expected" : sample .get ("expected" , "" ),
218+ "metric_scores" : {},
219+ }
220+ raw_scores = sample .get ("metric_scores" , {})
221+ for metric_name , metric_data in raw_scores .items ():
222+ if isinstance (metric_data , dict ):
223+ sample_entry ["metric_scores" ][metric_name ] = {
224+ "score" : metric_data .get ("score" ),
225+ "passed" : metric_data .get ("passed" ),
226+ "reason" : metric_data .get ("reason" , "" ),
227+ }
228+ else :
229+ sample_entry ["metric_scores" ][metric_name ] = {
230+ "score" : metric_data ,
231+ "passed" : None ,
232+ "reason" : "" ,
233+ }
234+ samples .append (sample_entry )
235+
204236 return {
205237 "experiment_id" : experiment .get ("id" , "" ),
206238 "name" : experiment .get ("name" , "" ),
@@ -210,47 +242,118 @@ def parse_results(experiment: Dict[str, Any], threshold: float) -> Dict[str, Any
210242 "duration_ms" : results .get ("duration" ),
211243 "passed" : all_passed ,
212244 "metrics" : metrics_out ,
245+ "samples" : samples ,
213246 }
214247
215248
249+ def _truncate (text : str , max_len : int = 200 ) -> str :
250+ if not text :
251+ return "(empty)"
252+ text = text .replace ("\n " , " " ).strip ()
253+ if len (text ) <= max_len :
254+ return text
255+ return text [:max_len ] + "..."
256+
257+
def generate_markdown(results: Dict[str, Any]) -> str:
    """Render parsed evaluation *results* as a Markdown report.

    Produces a summary header, a metric/threshold table, an optional
    per-sample failure-details section (only for metrics that did not meet
    their threshold), and a generated-at footer.

    Args:
        results: dict produced by parse_results() — expects keys "name",
            "model", "status", "total_prompts", "duration_ms", "passed",
            "metrics" and "samples".

    Returns:
        The full Markdown document as a single newline-joined string.
    """

    def esc(cell: Any) -> str:
        # Pipe characters inside a table cell would terminate the cell and
        # corrupt the Markdown table, so escape them.
        return str(cell).replace("|", "\\|")

    lines = [
        "## VerifyWise LLM Evaluation Results",
        "",
        f"**Experiment:** {results['name']}",
        f"**Model:** {results['model']}",
        f"**Status:** {results['status']}",
        f"**Samples:** {results['total_prompts']}",
    ]

    if results.get("duration_ms"):
        lines.append(f"**Duration:** {results['duration_ms'] / 1000:.1f}s")

    overall = "PASS" if results["passed"] else "FAIL"
    lines.extend([
        "",
        f"### Overall: **{overall}**",
        "",
        "| Metric | Score | Threshold | Result |",
        "|--------|------:|----------:|--------|",
    ])

    for m in results["metrics"]:
        inv = " (inverted)" if m["inverted"] else ""
        result = "PASS" if m["passed"] else "FAIL"
        lines.append(
            f"| {esc(m['name'])}{inv} | {m['score'] * 100:.1f}% | {m['threshold'] * 100:.0f}% | {result} |"
        )

    # Per-sample breakdown, shown only for metrics that failed overall.
    failing_metrics = {m["name"] for m in results["metrics"] if not m["passed"]}
    samples = results.get("samples", [])

    if failing_metrics and samples:
        lines.extend(["", "---", "", "### Failure Details", ""])
        lines.append(
            "Showing per-sample breakdown for metrics that did not meet the threshold."
        )

        for sample in samples:
            sample_scores = sample.get("metric_scores", {})
            # Skip samples that carry no score for any failing metric.
            has_failing = any(
                _metric_name_matches(name, failing_metrics)
                for name in sample_scores
            )
            if not has_failing:
                continue

            lines.extend([
                "",
                f"#### Sample {sample['index']}",
                "",
                f"> **Input:** {_truncate(sample['input'], 300)}",
                "",
                f"> **Response:** {_truncate(sample['output'], 300)}",
            ])

            if sample.get("expected"):
                lines.append(f"> **Expected:** {_truncate(sample['expected'], 300)}")

            lines.extend([
                "",
                "| Metric | Score | Result | Reason |",
                "|--------|------:|--------|--------|",
            ])
            for metric_name, score_data in sample_scores.items():
                score_val = score_data.get("score")
                passed = score_data.get("passed")
                reason = score_data.get("reason", "")

                # Floats are percentages; anything else is shown verbatim.
                if score_val is not None:
                    score_str = f"{score_val * 100:.1f}%" if isinstance(score_val, float) else str(score_val)
                else:
                    score_str = "N/A"

                if passed is True:
                    result_str = "PASS"
                elif passed is False:
                    result_str = "FAIL"
                else:
                    result_str = "-"

                reason_str = esc(_truncate(reason, 120)) if reason else "-"
                lines.append(f"| {esc(metric_name)} | {score_str} | {result_str} | {reason_str} |")

    lines.extend([
        "",
        "---",
        f"*Generated by [VerifyWise](https://verifywise.ai) at {datetime.now(tz=timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
    ])

    return "\n".join(lines)
252346
253347
348+ def _metric_name_matches (name : str , targets : set ) -> bool :
349+ """Check if a metric name matches any target, case-insensitively."""
350+ lower = name .lower ()
351+ for t in targets :
352+ if t .lower () == lower or t .lower ().replace ("_" , "" ) == lower .replace ("_" , "" ):
353+ return True
354+ return False
355+
356+
254357def main ():
255358 args = parse_args ()
256359
@@ -319,10 +422,10 @@ def main():
319422 print (f" [{ icon } ] { m ['name' ]} : { m ['score' ]* 100 :.1f} % (threshold: { m ['threshold' ]* 100 :.0f} %)" )
320423
321424 if not results ["passed" ]:
322- print ("\n Evaluation FAILED — one or more metrics below threshold" )
425+ print ("\n Evaluation FAILED -- one or more metrics below threshold" )
323426 sys .exit (1 )
324427 else :
325- print ("\n Evaluation PASSED — all metrics within threshold" )
428+ print ("\n Evaluation PASSED -- all metrics within threshold" )
326429 sys .exit (0 )
327430
328431 except TimeoutError as e :
0 commit comments