1 | 1 | name: 'VerifyWise LLM Evaluation' |
2 | 2 | description: > |
3 | 3 | Run LLM evaluations against a VerifyWise instance and gate CI on quality |
4 | | - thresholds. Creates an experiment, polls until completion, checks metrics, |
5 | | - and uploads results as build artifacts. |
| 4 | + thresholds. Supports chatbot, RAG, and agent use cases. |
| 5 | +
|
6 | 6 | branding: |
7 | 7 | icon: 'check-circle' |
8 | 8 | color: 'green' |
9 | 9 |
|
| 10 | +# ─── Available metrics ────────────────────────────────────────────────── |
| 11 | +# |
| 12 | +# Universal: answer_relevancy, correctness, completeness, |
| 13 | +# hallucination*, toxicity*, bias*, instruction_following |
| 14 | +# |
| 15 | +# RAG: faithfulness, contextual_relevancy, |
| 16 | +# context_precision, context_recall |
| 17 | +# |
| 18 | +# Agent: tool_correctness, argument_correctness, |
| 19 | +# task_completion, step_efficiency, |
| 20 | +# plan_quality, plan_adherence |
| 21 | +# |
| 22 | +# (* = inverted — lower is better) |
| 23 | +# ──────────────────────────────────────────────────────────────────────── |
| 24 | + |
10 | 25 | inputs: |
11 | 26 | api_url: |
12 | | - description: 'Base URL of the VerifyWise instance (e.g. https://app.verifywise.ai)' |
| 27 | + description: 'VerifyWise instance URL' |
13 | 28 | required: true |
14 | 29 | project_id: |
15 | | - description: 'VerifyWise project ID' |
| 30 | + description: 'Project ID' |
16 | 31 | required: true |
17 | 32 | dataset_id: |
18 | 33 | description: 'Dataset ID to evaluate against' |
19 | 34 | required: true |
20 | 35 | metrics: |
21 | | - description: 'Comma-separated metric names (e.g. answerRelevancy,faithfulness,hallucination)' |
| 36 | + description: 'Comma-separated metric names (e.g. answer_relevancy,bias,toxicity)' |
22 | 37 | required: true |
23 | 38 | model_name: |
24 | | - description: 'Model to evaluate (e.g. gpt-4o, claude-3.5-sonnet)' |
| 39 | + description: 'Model to evaluate (e.g. gpt-4o-mini, claude-3.5-sonnet)' |
25 | 40 | required: true |
26 | 41 | model_provider: |
27 | | - description: 'Model provider (openai, anthropic, google, mistral, xai, self-hosted)' |
| 42 | + description: 'Provider: openai | anthropic | google | mistral | xai | self-hosted' |
28 | 43 | required: true |
29 | 44 | judge_model: |
30 | | - description: 'Judge LLM model name' |
| 45 | + description: 'Judge LLM for metric scoring' |
31 | 46 | required: false |
32 | 47 | default: 'gpt-4o' |
33 | 48 | judge_provider: |
34 | 49 | description: 'Judge LLM provider' |
35 | 50 | required: false |
36 | 51 | default: 'openai' |
37 | 52 | threshold: |
38 | | - description: 'Global pass threshold (0-1). Metrics below this fail.' |
| 53 | + description: 'Pass/fail threshold (0.0–1.0)' |
39 | 54 | required: false |
40 | 55 | default: '0.7' |
41 | 56 | timeout_minutes: |
42 | | - description: 'Max minutes to wait for evaluation to complete' |
| 57 | + description: 'Max wait time in minutes' |
43 | 58 | required: false |
44 | 59 | default: '30' |
45 | 60 | poll_interval_seconds: |
46 | 61 | description: 'Seconds between status polls' |
47 | 62 | required: false |
48 | 63 | default: '15' |
49 | 64 | experiment_name: |
50 | | - description: 'Custom experiment name (defaults to timestamp-based name)' |
| 65 | + description: 'Custom experiment name (auto-generated if empty)' |
51 | 66 | required: false |
52 | 67 | default: '' |
53 | 68 | fail_on_threshold: |
54 | | - description: 'Whether to fail the step when thresholds are not met (true/false)' |
| 69 | + description: 'Fail the step when thresholds are not met' |
55 | 70 | required: false |
56 | 71 | default: 'true' |
57 | 72 | vw_api_token: |
58 | | - description: 'VerifyWise JWT token or API key' |
| 73 | + description: 'VerifyWise API token' |
59 | 74 | required: true |
60 | 75 | llm_api_key: |
61 | | - description: 'API key for the LLM provider being evaluated' |
| 76 | + description: 'LLM provider API key' |
62 | 77 | required: true |
63 | 78 |
|
64 | 79 | outputs: |
65 | 80 | passed: |
66 | | - description: 'Whether all metrics passed their thresholds (true/false)' |
| 81 | + description: 'Whether all metrics passed (true/false)' |
67 | 82 | value: ${{ steps.check.outputs.passed }} |
68 | 83 | results_path: |
69 | | - description: 'Path to the results JSON file' |
| 84 | + description: 'Path to results JSON' |
70 | 85 | value: ${{ steps.eval.outputs.results_path }} |
71 | 86 | summary_path: |
72 | | - description: 'Path to the Markdown summary file' |
| 87 | + description: 'Path to Markdown summary' |
73 | 88 | value: ${{ steps.eval.outputs.summary_path }} |
74 | 89 | experiment_id: |
75 | 90 | description: 'ID of the created experiment' |
@@ -106,74 +121,54 @@ runs: |
106 | 121 | VW_EXPERIMENT_NAME: ${{ inputs.experiment_name }} |
107 | 122 | LLM_API_KEY: ${{ inputs.llm_api_key }} |
108 | 123 | run: | |
109 | | - RESULTS_PATH="${{ runner.temp }}/verifywise-results.json" |
110 | | - SUMMARY_PATH="${{ runner.temp }}/verifywise-summary.md" |
| 124 | + RESULTS="${{ runner.temp }}/vw-results.json" |
| 125 | + SUMMARY="${{ runner.temp }}/vw-summary.md" |
111 | 126 |
|
112 | 127 | python "${{ github.action_path }}/ci_eval_runner.py" \ |
113 | | - --api-url "$VW_API_URL" \ |
114 | | - --token "$VW_API_TOKEN" \ |
115 | | - --project-id "$VW_PROJECT_ID" \ |
116 | | - --dataset-id "$VW_DATASET_ID" \ |
117 | | - --metrics "$VW_METRICS" \ |
118 | | - --model-name "$VW_MODEL_NAME" \ |
119 | | - --model-provider "$VW_MODEL_PROVIDER" \ |
120 | | - --judge-model "$VW_JUDGE_MODEL" \ |
121 | | - --judge-provider "$VW_JUDGE_PROVIDER" \ |
122 | | - --threshold "$VW_THRESHOLD" \ |
123 | | - --timeout "$VW_TIMEOUT_MINUTES" \ |
124 | | - --poll-interval "$VW_POLL_INTERVAL" \ |
125 | | - --output "$RESULTS_PATH" \ |
126 | | - --markdown-output "$SUMMARY_PATH" \ |
127 | | - ${VW_EXPERIMENT_NAME:+--name "$VW_EXPERIMENT_NAME"} \ |
| 128 | + --output "$RESULTS" \ |
| 129 | + --markdown-output "$SUMMARY" \ |
128 | 130 | || true |
129 | 131 |
|
130 | | - echo "results_path=$RESULTS_PATH" >> "$GITHUB_OUTPUT" |
131 | | - echo "summary_path=$SUMMARY_PATH" >> "$GITHUB_OUTPUT" |
| 132 | + echo "results_path=$RESULTS" >> "$GITHUB_OUTPUT" |
| 133 | + echo "summary_path=$SUMMARY" >> "$GITHUB_OUTPUT" |
132 | 134 |
|
133 | | - if [ -f "$RESULTS_PATH" ]; then |
134 | | - EXP_ID=$(python -c "import json; print(json.load(open('$RESULTS_PATH')).get('experiment_id',''))" 2>/dev/null || echo "") |
| 135 | + if [ -f "$RESULTS" ]; then |
| 136 | + EXP_ID=$(python -c "import json; print(json.load(open('$RESULTS')).get('experiment_id',''))" 2>/dev/null || echo "") |
135 | 137 | echo "experiment_id=$EXP_ID" >> "$GITHUB_OUTPUT" |
136 | 138 | fi |
137 | 139 |
|
138 | 140 | - name: Check thresholds |
139 | 141 | id: check |
140 | 142 | shell: bash |
141 | 143 | run: | |
142 | | - RESULTS_PATH="${{ steps.eval.outputs.results_path }}" |
143 | | - if [ ! -f "$RESULTS_PATH" ]; then |
144 | | - echo "::error::No results file found — evaluation may have failed to start." |
| 144 | + RESULTS="${{ steps.eval.outputs.results_path }}" |
| 145 | + if [ ! -f "$RESULTS" ]; then |
| 146 | + echo "::error::No results file — evaluation may have failed." |
145 | 147 | echo "passed=false" >> "$GITHUB_OUTPUT" |
146 | | - if [ "${{ inputs.fail_on_threshold }}" = "true" ]; then |
147 | | - exit 1 |
148 | | - fi |
| 148 | + [ "${{ inputs.fail_on_threshold }}" = "true" ] && exit 1 |
149 | 149 | exit 0 |
150 | 150 | fi |
151 | 151 |
|
152 | | - PASSED=$(python -c "import json; print(json.load(open('$RESULTS_PATH')).get('passed', False))" 2>/dev/null || echo "False") |
| 152 | + PASSED=$(python -c "import json; print(json.load(open('$RESULTS')).get('passed', False))" 2>/dev/null || echo "False") |
153 | 153 |
|
154 | 154 | if [ "$PASSED" = "True" ]; then |
155 | 155 | echo "passed=true" >> "$GITHUB_OUTPUT" |
156 | | - echo "All metrics passed!" |
| 156 | + echo "All metrics passed." |
157 | 157 | else |
158 | 158 | echo "passed=false" >> "$GITHUB_OUTPUT" |
159 | | -
|
160 | 159 | python -c " |
161 | | - import json, sys |
162 | | - with open('$RESULTS_PATH') as f: |
163 | | - data = json.load(f) |
| 160 | + import json |
| 161 | + data = json.load(open('$RESULTS')) |
164 | 162 | failed = [m for m in data.get('metrics', []) if not m.get('passed')] |
165 | 163 | if failed: |
166 | | - print('::error::LLM evaluation failed. Metrics below threshold:') |
| 164 | + print('::error::Metrics below threshold:') |
167 | 165 | for m in failed: |
168 | | - print(f' - {m[\"name\"]}: {m[\"score\"]*100:.1f}% (threshold: {m[\"threshold\"]*100:.0f}%)') |
| 166 | + print(f' {m[\"name\"]}: {m[\"score\"]*100:.1f}% (threshold: {m[\"threshold\"]*100:.0f}%)') |
169 | 167 | " || true |
170 | | -
|
171 | | - if [ "${{ inputs.fail_on_threshold }}" = "true" ]; then |
172 | | - exit 1 |
173 | | - fi |
| 168 | + [ "${{ inputs.fail_on_threshold }}" = "true" ] && exit 1 |
174 | 169 | fi |
175 | 170 |
|
176 | | - - name: Upload results artifact |
| 171 | + - name: Upload results |
177 | 172 | if: always() |
178 | 173 | uses: actions/upload-artifact@v4 |
179 | 174 | with: |
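
For anyone reviewing the new input set, here is a minimal sketch of a workflow job that calls this action with the snake_case metric names used above. The `uses:` path, the project and dataset IDs, and the secret names are illustrative placeholders, not values taken from this change.

```yaml
name: llm-quality-gate
on: pull_request

jobs:
  llm-eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Placeholder action path and IDs; substitute your own repository reference.
      - name: Run VerifyWise LLM evaluation
        id: vw
        uses: verifywise/verifywise-llm-eval-action@v1
        with:
          api_url: 'https://app.verifywise.ai'
          project_id: '42'
          dataset_id: '7'
          metrics: 'answer_relevancy,faithfulness,contextual_relevancy,hallucination'
          model_name: 'gpt-4o-mini'
          model_provider: 'openai'
          judge_model: 'gpt-4o'
          judge_provider: 'openai'
          threshold: '0.8'
          fail_on_threshold: 'true'
          vw_api_token: ${{ secrets.VERIFYWISE_API_TOKEN }}
          llm_api_key: ${{ secrets.OPENAI_API_KEY }}
```

The RAG metrics in this example assume the dataset rows carry retrieval context; a plain chatbot run would stick to the universal metrics, and an agent run would use the tool and plan metrics from the comment block instead.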
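Since the action now exposes `passed`, `results_path`, and `summary_path` as outputs, a later step in the same `steps:` list can surface the Markdown summary even when the quality gate fails. A sketch, assuming the evaluation step above uses `id: vw`; the `if: always()` guard mirrors the artifact upload in this diff.

```yaml
      # Runs even when the gate fails, so reviewers still see the scores.
      - name: Publish evaluation summary
        if: always()
        shell: bash
        run: |
          SUMMARY="${{ steps.vw.outputs.summary_path }}"
          if [ -f "$SUMMARY" ]; then
            cat "$SUMMARY" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "Quality gate passed: ${{ steps.vw.outputs.passed }}"
```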
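The "Check thresholds" step only relies on a few fields from the results file. Purely to illustrate that contract, here is the rough shape it expects, written in YAML for readability even though ci_eval_runner.py writes JSON; the values are invented and the runner may emit additional keys.

```yaml
# Illustrative only: these are the fields the gate reads, with made-up values.
experiment_id: "exp-123"
passed: false
metrics:
  - name: answer_relevancy
    score: 0.83
    threshold: 0.80
    passed: true
  - name: faithfulness
    score: 0.62
    threshold: 0.80
    passed: false
```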