1717import os
1818import sys
1919import time
20- from datetime import datetime
20+ from datetime import datetime , timezone
2121from typing import Any , Dict , List , Optional
2222
2323try :
2626 print ("ERROR: 'requests' package required. Install with: pip install requests" )
2727 sys .exit (2 )
2828
# Metric-name substrings whose scores are "inverted": lower is better, so
# such metrics pass when score <= threshold (consumed by is_inverted()).
INVERTED_KEYWORDS = ("bias", "toxicity", "hallucination", "conversationsafety")
30+
2931
3032def parse_args () -> argparse .Namespace :
3133 p = argparse .ArgumentParser (description = "VerifyWise CI/CD Evaluation Runner" )
@@ -95,8 +97,8 @@ def create_experiment(
9597 dataset_name = dataset_info .get ("name" , f"dataset-{ dataset_id } " )
9698 print (f"Resolved dataset '{ dataset_name } ' -> { dataset_path } " )
9799
98- now = datetime .now (tz = __import__ ( 'datetime' ). timezone .utc )
99- experiment_name = name or f"CI Eval — { now .strftime ('%Y-%m-%d %H:%M' )} "
100+ now = datetime .now (tz = timezone .utc )
101+ experiment_name = name or f"CI Eval -- { now .strftime ('%Y-%m-%d %H:%M' )} "
100102
101103 payload = {
102104 "project_id" : project_id ,
@@ -170,13 +172,18 @@ def poll_experiment(
170172 raise TimeoutError (f"Experiment did not complete within { timeout_minutes } minutes" )
171173
172174
def is_inverted(name: str) -> bool:
    """Return True when *name* denotes an inverted metric (lower score = better).

    A metric is inverted when its lowercased name contains any of the
    substrings listed in the module-level INVERTED_KEYWORDS tuple.
    """
    lowered = name.lower()
    for keyword in INVERTED_KEYWORDS:
        if keyword in lowered:
            return True
    return False
177+
178+
173179def parse_results (experiment : Dict [str , Any ], threshold : float ) -> Dict [str , Any ]:
174180 results = experiment .get ("results" , {})
175181 if isinstance (results , str ):
176182 results = json .loads (results )
177183
178184 avg_scores = results .get ("avg_scores" , {})
179185 metric_thresholds_raw = results .get ("metric_thresholds" , {})
186+ detailed_results = results .get ("detailed_results" , [])
180187
181188 config = experiment .get ("config" , {})
182189 if isinstance (config , str ):
@@ -189,7 +196,7 @@ def parse_results(experiment: Dict[str, Any], threshold: float) -> Dict[str, Any
189196 score = float (score )
190197 mt = metric_thresholds_raw .get (name )
191198 mt = float (mt ) if mt is not None else threshold
192- inverted = any ( k in name . lower () for k in [ "bias" , "toxicity" , "hallucination" , "conversationsafety" ] )
199+ inverted = is_inverted ( name )
193200 passed = (score <= mt ) if inverted else (score >= mt )
194201 if not passed :
195202 all_passed = False
@@ -201,6 +208,31 @@ def parse_results(experiment: Dict[str, Any], threshold: float) -> Dict[str, Any
201208 "inverted" : inverted ,
202209 })
203210
211+ samples = []
212+ for i , sample in enumerate (detailed_results ):
213+ sample_entry = {
214+ "index" : i + 1 ,
215+ "input" : sample .get ("input" , "" ),
216+ "output" : sample .get ("output" , "" ),
217+ "expected" : sample .get ("expected" , "" ),
218+ "metric_scores" : {},
219+ }
220+ raw_scores = sample .get ("metric_scores" , {})
221+ for metric_name , metric_data in raw_scores .items ():
222+ if isinstance (metric_data , dict ):
223+ sample_entry ["metric_scores" ][metric_name ] = {
224+ "score" : metric_data .get ("score" ),
225+ "passed" : metric_data .get ("passed" ),
226+ "reason" : metric_data .get ("reason" , "" ),
227+ }
228+ else :
229+ sample_entry ["metric_scores" ][metric_name ] = {
230+ "score" : metric_data ,
231+ "passed" : None ,
232+ "reason" : "" ,
233+ }
234+ samples .append (sample_entry )
235+
204236 return {
205237 "experiment_id" : experiment .get ("id" , "" ),
206238 "name" : experiment .get ("name" , "" ),
@@ -210,47 +242,118 @@ def parse_results(experiment: Dict[str, Any], threshold: float) -> Dict[str, Any
210242 "duration_ms" : results .get ("duration" ),
211243 "passed" : all_passed ,
212244 "metrics" : metrics_out ,
245+ "samples" : samples ,
213246 }
214247
215248
249+ def _truncate (text : str , max_len : int = 200 ) -> str :
250+ if not text :
251+ return "(empty)"
252+ text = text .replace ("\n " , " " ).strip ()
253+ if len (text ) <= max_len :
254+ return text
255+ return text [:max_len ] + "..."
256+
257+
def generate_markdown(results: Dict[str, Any]) -> str:
    """Render parsed evaluation *results* as a Markdown report.

    Produces a summary header, a metric/threshold table, an optional
    per-sample failure-details section (only for metrics that did not meet
    their threshold), and a generated-at footer.

    Args:
        results: dict produced by parse_results() — expects keys "name",
            "model", "status", "total_prompts", "duration_ms", "passed",
            "metrics" and "samples".

    Returns:
        The full Markdown document as a single newline-joined string.
    """

    def esc(cell: Any) -> str:
        # Pipe characters inside a table cell would terminate the cell and
        # corrupt the Markdown table, so escape them.
        return str(cell).replace("|", "\\|")

    lines = [
        "## VerifyWise LLM Evaluation Results",
        "",
        f"**Experiment:** {results['name']}",
        f"**Model:** {results['model']}",
        f"**Status:** {results['status']}",
        f"**Samples:** {results['total_prompts']}",
    ]

    if results.get("duration_ms"):
        lines.append(f"**Duration:** {results['duration_ms'] / 1000:.1f}s")

    overall = "PASS" if results["passed"] else "FAIL"
    lines.extend([
        "",
        f"### Overall: **{overall}**",
        "",
        "| Metric | Score | Threshold | Result |",
        "|--------|------:|----------:|--------|",
    ])

    for m in results["metrics"]:
        inv = " (inverted)" if m["inverted"] else ""
        result = "PASS" if m["passed"] else "FAIL"
        lines.append(
            f"| {esc(m['name'])}{inv} | {m['score'] * 100:.1f}% | {m['threshold'] * 100:.0f}% | {result} |"
        )

    # Per-sample breakdown, shown only for metrics that failed overall.
    failing_metrics = {m["name"] for m in results["metrics"] if not m["passed"]}
    samples = results.get("samples", [])

    if failing_metrics and samples:
        lines.extend(["", "---", "", "### Failure Details", ""])
        lines.append(
            "Showing per-sample breakdown for metrics that did not meet the threshold."
        )

        for sample in samples:
            sample_scores = sample.get("metric_scores", {})
            # Skip samples that carry no score for any failing metric.
            has_failing = any(
                _metric_name_matches(name, failing_metrics)
                for name in sample_scores
            )
            if not has_failing:
                continue

            lines.extend([
                "",
                f"#### Sample {sample['index']}",
                "",
                f"> **Input:** {_truncate(sample['input'], 300)}",
                "",
                f"> **Response:** {_truncate(sample['output'], 300)}",
            ])

            if sample.get("expected"):
                lines.append(f"> **Expected:** {_truncate(sample['expected'], 300)}")

            lines.extend([
                "",
                "| Metric | Score | Result | Reason |",
                "|--------|------:|--------|--------|",
            ])
            for metric_name, score_data in sample_scores.items():
                score_val = score_data.get("score")
                passed = score_data.get("passed")
                reason = score_data.get("reason", "")

                # Floats are percentages; anything else is shown verbatim.
                if score_val is not None:
                    score_str = f"{score_val * 100:.1f}%" if isinstance(score_val, float) else str(score_val)
                else:
                    score_str = "N/A"

                if passed is True:
                    result_str = "PASS"
                elif passed is False:
                    result_str = "FAIL"
                else:
                    result_str = "-"

                reason_str = esc(_truncate(reason, 120)) if reason else "-"
                lines.append(f"| {esc(metric_name)} | {score_str} | {result_str} | {reason_str} |")

    lines.extend([
        "",
        "---",
        f"*Generated by [VerifyWise](https://verifywise.ai) at {datetime.now(tz=timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
    ])

    return "\n".join(lines)
252346
253347
348+ def _metric_name_matches (name : str , targets : set ) -> bool :
349+ """Check if a metric name matches any target, case-insensitively."""
350+ lower = name .lower ()
351+ for t in targets :
352+ if t .lower () == lower or t .lower ().replace ("_" , "" ) == lower .replace ("_" , "" ):
353+ return True
354+ return False
355+
356+
254357def main ():
255358 args = parse_args ()
256359
@@ -319,10 +422,10 @@ def main():
319422 print (f" [{ icon } ] { m ['name' ]} : { m ['score' ]* 100 :.1f} % (threshold: { m ['threshold' ]* 100 :.0f} %)" )
320423
321424 if not results ["passed" ]:
322- print ("\n Evaluation FAILED — one or more metrics below threshold" )
425+ print ("\n Evaluation FAILED -- one or more metrics below threshold" )
323426 sys .exit (1 )
324427 else :
325- print ("\n Evaluation PASSED — all metrics within threshold" )
428+ print ("\n Evaluation PASSED -- all metrics within threshold" )
326429 sys .exit (0 )
327430
328431 except TimeoutError as e :
0 commit comments