Skip to content

Commit 4361b13

Browse files
committed
refactor: simplify action.yml, document all metric categories
- Clean up action.yml: env-based arg passing, concise threshold check
- Document available metrics for chatbot, RAG, and agent in comments
- Mark inverted metrics (bias, toxicity, hallucination)

Made-with: Cursor
1 parent 3012d14 commit 4361b13

1 file changed

Lines changed: 52 additions & 57 deletions

File tree

action.yml

Lines changed: 52 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,90 @@
11
name: 'VerifyWise LLM Evaluation'
22
description: >
33
Run LLM evaluations against a VerifyWise instance and gate CI on quality
4-
thresholds. Creates an experiment, polls until completion, checks metrics,
5-
and uploads results as build artifacts.
4+
thresholds. Supports chatbot, RAG, and agent use cases.
5+
66
branding:
77
icon: 'check-circle'
88
color: 'green'
99

10+
# ─── Available metrics ──────────────────────────────────────────────────
11+
#
12+
# Universal: answer_relevancy, correctness, completeness,
13+
# hallucination*, toxicity*, bias*, instruction_following
14+
#
15+
# RAG: faithfulness, contextual_relevancy,
16+
# context_precision, context_recall
17+
#
18+
# Agent: tool_correctness, argument_correctness,
19+
# task_completion, step_efficiency,
20+
# plan_quality, plan_adherence
21+
#
22+
# (* = inverted — lower is better)
23+
# ────────────────────────────────────────────────────────────────────────
24+
1025
inputs:
1126
api_url:
12-
description: 'Base URL of the VerifyWise instance (e.g. https://app.verifywise.ai)'
27+
description: 'VerifyWise instance URL'
1328
required: true
1429
project_id:
15-
description: 'VerifyWise project ID'
30+
description: 'Project ID'
1631
required: true
1732
dataset_id:
1833
description: 'Dataset ID to evaluate against'
1934
required: true
2035
metrics:
21-
description: 'Comma-separated metric names (e.g. answerRelevancy,faithfulness,hallucination)'
36+
description: 'Comma-separated metric names (e.g. answer_relevancy,bias,toxicity)'
2237
required: true
2338
model_name:
24-
description: 'Model to evaluate (e.g. gpt-4o, claude-3.5-sonnet)'
39+
description: 'Model to evaluate (e.g. gpt-4o-mini, claude-3.5-sonnet)'
2540
required: true
2641
model_provider:
27-
description: 'Model provider (openai, anthropic, google, mistral, xai, self-hosted)'
42+
description: 'Provider: openai | anthropic | google | mistral | xai | self-hosted'
2843
required: true
2944
judge_model:
30-
description: 'Judge LLM model name'
45+
description: 'Judge LLM for metric scoring'
3146
required: false
3247
default: 'gpt-4o'
3348
judge_provider:
3449
description: 'Judge LLM provider'
3550
required: false
3651
default: 'openai'
3752
threshold:
38-
description: 'Global pass threshold (0-1). Metrics below this fail.'
53+
description: 'Pass/fail threshold (0.0–1.0)'
3954
required: false
4055
default: '0.7'
4156
timeout_minutes:
42-
description: 'Max minutes to wait for evaluation to complete'
57+
description: 'Max wait time in minutes'
4358
required: false
4459
default: '30'
4560
poll_interval_seconds:
4661
description: 'Seconds between status polls'
4762
required: false
4863
default: '15'
4964
experiment_name:
50-
description: 'Custom experiment name (defaults to timestamp-based name)'
65+
description: 'Custom experiment name (auto-generated if empty)'
5166
required: false
5267
default: ''
5368
fail_on_threshold:
54-
description: 'Whether to fail the step when thresholds are not met (true/false)'
69+
description: 'Fail the step when thresholds are not met'
5570
required: false
5671
default: 'true'
5772
vw_api_token:
58-
description: 'VerifyWise JWT token or API key'
73+
description: 'VerifyWise API token'
5974
required: true
6075
llm_api_key:
61-
description: 'API key for the LLM provider being evaluated'
76+
description: 'LLM provider API key'
6277
required: true
6378

6479
outputs:
6580
passed:
66-
description: 'Whether all metrics passed their thresholds (true/false)'
81+
description: 'Whether all metrics passed (true/false)'
6782
value: ${{ steps.check.outputs.passed }}
6883
results_path:
69-
description: 'Path to the results JSON file'
84+
description: 'Path to results JSON'
7085
value: ${{ steps.eval.outputs.results_path }}
7186
summary_path:
72-
description: 'Path to the Markdown summary file'
87+
description: 'Path to Markdown summary'
7388
value: ${{ steps.eval.outputs.summary_path }}
7489
experiment_id:
7590
description: 'ID of the created experiment'
@@ -106,74 +121,54 @@ runs:
106121
VW_EXPERIMENT_NAME: ${{ inputs.experiment_name }}
107122
LLM_API_KEY: ${{ inputs.llm_api_key }}
108123
run: |
109-
RESULTS_PATH="${{ runner.temp }}/verifywise-results.json"
110-
SUMMARY_PATH="${{ runner.temp }}/verifywise-summary.md"
124+
RESULTS="${{ runner.temp }}/vw-results.json"
125+
SUMMARY="${{ runner.temp }}/vw-summary.md"
111126
112127
python "${{ github.action_path }}/ci_eval_runner.py" \
113-
--api-url "$VW_API_URL" \
114-
--token "$VW_API_TOKEN" \
115-
--project-id "$VW_PROJECT_ID" \
116-
--dataset-id "$VW_DATASET_ID" \
117-
--metrics "$VW_METRICS" \
118-
--model-name "$VW_MODEL_NAME" \
119-
--model-provider "$VW_MODEL_PROVIDER" \
120-
--judge-model "$VW_JUDGE_MODEL" \
121-
--judge-provider "$VW_JUDGE_PROVIDER" \
122-
--threshold "$VW_THRESHOLD" \
123-
--timeout "$VW_TIMEOUT_MINUTES" \
124-
--poll-interval "$VW_POLL_INTERVAL" \
125-
--output "$RESULTS_PATH" \
126-
--markdown-output "$SUMMARY_PATH" \
127-
${VW_EXPERIMENT_NAME:+--name "$VW_EXPERIMENT_NAME"} \
128+
--output "$RESULTS" \
129+
--markdown-output "$SUMMARY" \
128130
|| true
129131
130-
echo "results_path=$RESULTS_PATH" >> "$GITHUB_OUTPUT"
131-
echo "summary_path=$SUMMARY_PATH" >> "$GITHUB_OUTPUT"
132+
echo "results_path=$RESULTS" >> "$GITHUB_OUTPUT"
133+
echo "summary_path=$SUMMARY" >> "$GITHUB_OUTPUT"
132134
133-
if [ -f "$RESULTS_PATH" ]; then
134-
EXP_ID=$(python -c "import json; print(json.load(open('$RESULTS_PATH')).get('experiment_id',''))" 2>/dev/null || echo "")
135+
if [ -f "$RESULTS" ]; then
136+
EXP_ID=$(python -c "import json; print(json.load(open('$RESULTS')).get('experiment_id',''))" 2>/dev/null || echo "")
135137
echo "experiment_id=$EXP_ID" >> "$GITHUB_OUTPUT"
136138
fi
137139
138140
- name: Check thresholds
139141
id: check
140142
shell: bash
141143
run: |
142-
RESULTS_PATH="${{ steps.eval.outputs.results_path }}"
143-
if [ ! -f "$RESULTS_PATH" ]; then
144-
echo "::error::No results file found — evaluation may have failed to start."
144+
RESULTS="${{ steps.eval.outputs.results_path }}"
145+
if [ ! -f "$RESULTS" ]; then
146+
echo "::error::No results file — evaluation may have failed."
145147
echo "passed=false" >> "$GITHUB_OUTPUT"
146-
if [ "${{ inputs.fail_on_threshold }}" = "true" ]; then
147-
exit 1
148-
fi
148+
[ "${{ inputs.fail_on_threshold }}" = "true" ] && exit 1
149149
exit 0
150150
fi
151151
152-
PASSED=$(python -c "import json; print(json.load(open('$RESULTS_PATH')).get('passed', False))" 2>/dev/null || echo "False")
152+
PASSED=$(python -c "import json; print(json.load(open('$RESULTS')).get('passed', False))" 2>/dev/null || echo "False")
153153
154154
if [ "$PASSED" = "True" ]; then
155155
echo "passed=true" >> "$GITHUB_OUTPUT"
156-
echo "All metrics passed!"
156+
echo "All metrics passed."
157157
else
158158
echo "passed=false" >> "$GITHUB_OUTPUT"
159-
160159
python -c "
161-
import json, sys
162-
with open('$RESULTS_PATH') as f:
163-
data = json.load(f)
160+
import json
161+
data = json.load(open('$RESULTS'))
164162
failed = [m for m in data.get('metrics', []) if not m.get('passed')]
165163
if failed:
166-
print('::error::LLM evaluation failed. Metrics below threshold:')
164+
print('::error::Metrics below threshold:')
167165
for m in failed:
168-
print(f' - {m[\"name\"]}: {m[\"score\"]*100:.1f}% (threshold: {m[\"threshold\"]*100:.0f}%)')
166+
print(f' {m[\"name\"]}: {m[\"score\"]*100:.1f}% (threshold: {m[\"threshold\"]*100:.0f}%)')
169167
" || true
170-
171-
if [ "${{ inputs.fail_on_threshold }}" = "true" ]; then
172-
exit 1
173-
fi
168+
[ "${{ inputs.fail_on_threshold }}" = "true" ] && exit 1
174169
fi
175170
176-
- name: Upload results artifact
171+
- name: Upload results
177172
if: always()
178173
uses: actions/upload-artifact@v7
179174
with:

0 commit comments

Comments (0)