Skip to content

Commit 2280fd4

Browse files
committed
Merge branch 'copilot/ci-integration-balance-validation'
2 parents d535618 + d306d2e commit 2280fd4

5 files changed

Lines changed: 2171 additions & 0 deletions

File tree

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
# Balance Validation workflow.
#
# Runs nightly, on demand, and for pushes / pull requests against main that
# touch simulation code, AI player code, content, or the sweep/analysis
# scripts.
name: Balance Validation

on:
  schedule:
    # Run nightly at 3:00 AM UTC (offset from ai-tournament at 2:00 AM)
    - cron: '0 3 * * *'
  workflow_dispatch:
    # Inputs are typed so the "Run workflow" form validates them. The jobs
    # read them through `github.event.inputs.*`, which always yields strings
    # ('true' / 'false' / '5'), so the string comparisons there are unaffected.
    inputs:
      compare_baseline:
        description: 'Compare against baseline (true/false)'
        type: boolean
        required: false
        default: true
      fail_on_regression:
        description: 'Fail workflow on regression detection (true/false)'
        type: boolean
        required: false
        default: true
      stability_threshold:
        description: 'Stability delta threshold for regression (percentage)'
        type: number
        required: false
        default: 5
  push:
    branches:
      - main
    paths:
      # Trigger on changes to simulation code, content, or AI systems
      - 'src/gengine/echoes/sim/**'
      - 'src/gengine/echoes/core/**'
      - 'src/gengine/ai_player/**'
      - 'content/worlds/**'
      - 'content/config/**'
      - 'scripts/run_batch_sweeps.py'
      - 'scripts/analyze_balance.py'
  pull_request:
    branches:
      - main
    paths:
      # Keep this list in sync with the `push` path filter above.
      - 'src/gengine/echoes/sim/**'
      - 'src/gengine/echoes/core/**'
      - 'src/gengine/ai_player/**'
      - 'content/worlds/**'
      - 'content/config/**'
      - 'scripts/run_batch_sweeps.py'
      - 'scripts/analyze_balance.py'

# Workflow-wide default token permissions.
permissions:
  contents: read
  # Needed by the step that comments on pull requests.
  pull-requests: write
48+
49+
jobs:
50+
# Runs the reduced CI sweep grid, aggregates and reports on the results,
# compares them with the committed baseline, uploads the results as an
# artifact, comments on pull requests, and optionally fails on regression.
balance-validation:
  runs-on: ubuntu-latest
  outputs:
    regression_detected: ${{ steps.compare.outputs.regression_detected }}
    regression_summary: ${{ steps.compare.outputs.regression_summary }}

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python 3.12
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e ".[dev]"

    - name: Create build directory
      run: mkdir -p build

    - name: Run CI balance sweeps
      id: sweep
      run: |
        echo "Running reduced parameter grid balance sweeps..."
        python scripts/run_batch_sweeps.py \
          --config content/config/ci_balance_sweeps.yml \
          --output-dir build/ci_sweeps \
          --verbose

        echo "sweep_completed=true" >> "$GITHUB_OUTPUT"

    - name: Aggregate sweep results
      run: |
        python scripts/aggregate_sweep_results.py \
          --database build/ci_sweep_results.db \
          ingest build/ci_sweeps \
          --verbose

    # NOTE(review): no output path is passed here, yet later steps read
    # build/balance_report.json and upload build/balance_report.html.
    # Confirm that `report` writes those files by default.
    - name: Generate balance report
      run: |
        python scripts/analyze_balance.py \
          --database build/ci_sweep_results.db \
          report

    - name: Compare against baseline
      id: compare
      env:
        # Manual-dispatch inputs; other triggers fall back to the defaults.
        COMPARE_BASELINE: ${{ github.event.inputs.compare_baseline || 'true' }}
        STABILITY_THRESHOLD: ${{ github.event.inputs.stability_threshold || '5' }}
      run: |
        if [[ "$COMPARE_BASELINE" != "true" ]]; then
          echo "Baseline comparison skipped by user input"
          {
            echo "regression_detected=false"
            echo "regression_summary=Baseline comparison skipped"
          } >> "$GITHUB_OUTPUT"
          exit 0
        fi

        python scripts/manage_balance_baseline.py compare \
          --current build/ci_sweeps/batch_sweep_summary.json \
          --baseline content/baselines/balance_baseline.json \
          --output build/regression_summary.json \
          --stability-threshold "$STABILITY_THRESHOLD"

        # Check if regressions were detected
        if [[ -f build/regression_summary.json ]]; then
          REGRESSION_COUNT=$(python -c "import json; d=json.load(open('build/regression_summary.json')); print(len(d.get('regressions', [])))")
          if [[ "$REGRESSION_COUNT" -gt 0 ]]; then
            echo "regression_detected=true" >> "$GITHUB_OUTPUT"
            SUMMARY=$(python -c "import json; d=json.load(open('build/regression_summary.json')); print(d.get('summary', 'Regressions detected'))")
            # The summary is free-form text and may span several lines, which
            # the plain key=value form cannot carry. Use the delimiter form
            # with a random delimiter so the text cannot terminate it early.
            DELIMITER="summary_$(python -c 'import uuid; print(uuid.uuid4().hex)')"
            {
              echo "regression_summary<<$DELIMITER"
              echo "$SUMMARY"
              echo "$DELIMITER"
            } >> "$GITHUB_OUTPUT"
          else
            {
              echo "regression_detected=false"
              echo "regression_summary=No regressions detected"
            } >> "$GITHUB_OUTPUT"
          fi
        else
          {
            echo "regression_detected=false"
            echo "regression_summary=No baseline comparison performed"
          } >> "$GITHUB_OUTPUT"
        fi

    # Runs even after a failed step; the `|| echo` keeps a chart failure from
    # failing the job.
    - name: Generate comparison charts
      if: always()
      run: |
        python scripts/manage_balance_baseline.py chart \
          --current build/ci_sweeps/batch_sweep_summary.json \
          --baseline content/baselines/balance_baseline.json \
          --output build/comparison_charts.png || echo "Chart generation skipped (missing baseline or data)"

    # Upload even when an earlier step failed, so partial results (and the
    # chart generated by the always-run step above) are available for
    # debugging. Missing files only produce a warning.
    - name: Archive balance validation results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: balance-validation-${{ github.run_id }}
        path: |
          build/ci_sweeps/batch_sweep_summary.json
          build/ci_sweep_results.db
          build/balance_report.html
          build/balance_report.json
          build/regression_summary.json
          build/comparison_charts.png
        retention-days: 30

    # Pull requests from forks receive a read-only GITHUB_TOKEN, so creating
    # a comment would fail; only comment on same-repository pull requests.
    - name: Comment on PR with balance summary
      if: >-
        github.event_name == 'pull_request' &&
        github.event.pull_request.head.repo.full_name == github.repository
      uses: actions/github-script@v7
      with:
        script: |
          const fs = require('fs');

          // The balance report is only used as a precondition: without it
          // no comment is posted.
          let report = {};
          try {
            report = JSON.parse(fs.readFileSync('build/balance_report.json', 'utf8'));
          } catch (e) {
            console.log('Could not read balance report');
            return;
          }

          // A missing or unreadable regression summary is treated as
          // "no baseline comparison".
          let regressionSummary = {};
          try {
            regressionSummary = JSON.parse(fs.readFileSync('build/regression_summary.json', 'utf8'));
          } catch (e) {
            regressionSummary = { regressions: [], summary: 'No baseline comparison' };
          }

          const regressions = regressionSummary.regressions || [];
          const hasRegressions = regressions.length > 0;

          let body = `## 📊 Balance Validation Results\n\n`;

          if (hasRegressions) {
            body += `### ⚠️ Regressions Detected\n\n`;
            for (const reg of regressions) {
              const icon = reg.severity === 'failure' ? '❌' : '⚠️';
              body += `- ${icon} **${reg.metric_name}**: ${reg.description}\n`;
            }
            body += `\n`;
          } else {
            body += `### ✅ No Regressions Detected\n\n`;
          }

          // Strategy stats
          // NOTE(review): the "Win Rate" column is derived from
          // avg_stability (>= 0.5), not from a win-rate field. Confirm
          // this is intended.
          if (regressionSummary.current_stats && regressionSummary.current_stats.strategy_stats) {
            body += `### Strategy Win Rates\n\n`;
            body += `| Strategy | Avg Stability | Win Rate |\n`;
            body += `|----------|---------------|----------|\n`;
            for (const [strategy, stats] of Object.entries(regressionSummary.current_stats.strategy_stats)) {
              const avgStab = stats.avg_stability?.toFixed(3) || 'N/A';
              const winRate = ((stats.avg_stability || 0) >= 0.5 ? '✅' : '⚠️');
              body += `| ${strategy} | ${avgStab} | ${winRate} |\n`;
            }
            body += `\n`;
          }

          body += `\n📁 [Download full report](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID})\n`;

          // Await the request so an API failure fails this step instead of
          // surfacing as an unhandled promise rejection.
          await github.rest.issues.createComment({
            issue_number: context.issue.number,
            owner: context.repo.owner,
            repo: context.repo.repo,
            body: body
          });

    - name: Fail on regression (if configured)
      if: steps.compare.outputs.regression_detected == 'true'
      env:
        FAIL_ON_REGRESSION: ${{ github.event.inputs.fail_on_regression || 'true' }}
        # Passed through the environment rather than interpolated into the
        # script, so the summary text can never be executed as shell code.
        REGRESSION_SUMMARY: ${{ steps.compare.outputs.regression_summary }}
      run: |
        # %0A is the workflow-command escape for a newline in annotations.
        echo "::error::Balance regression detected: ${REGRESSION_SUMMARY//$'\n'/%0A}"
        if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
          echo "Failing workflow due to regression detection"
          exit 1
        else
          echo "::warning::Regression detected but workflow configured to continue"
        fi
224+
225+
# Regenerates the committed balance baseline from this run's sweep results
# and proposes it through an automated pull request.
update-baseline:
  needs: balance-validation
  runs-on: ubuntu-latest
  # Only update baseline on successful main branch runs (scheduled or push)
  if: |
    github.ref == 'refs/heads/main' &&
    (github.event_name == 'schedule' || github.event_name == 'push') &&
    needs.balance-validation.outputs.regression_detected != 'true'
  # The workflow-level default grants only `contents: read`, but the
  # create-pull-request step below must push a branch and open a PR with
  # GITHUB_TOKEN, so this job needs write access to both scopes.
  permissions:
    contents: write
    pull-requests: write

  steps:
    - uses: actions/checkout@v4

    - name: Set up Python 3.12
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e ".[dev]"

    # Fetch this run's balance-validation artifact into build/, where the
    # update step below looks for the sweep summary.
    - name: Download sweep results
      uses: actions/download-artifact@v4.1.8
      with:
        name: balance-validation-${{ github.run_id }}
        path: build

    - name: Update baseline
      run: |
        python scripts/manage_balance_baseline.py update \
          --source build/ci_sweeps/batch_sweep_summary.json \
          --output content/baselines/balance_baseline.json

    - name: Create baseline update PR
      uses: peter-evans/create-pull-request@v6
      with:
        token: ${{ secrets.GITHUB_TOKEN }}
        commit-message: "chore: update balance validation baseline"
        branch: automated/update-balance-baseline
        delete-branch: true
        title: "[Automated] Update Balance Validation Baseline"
        body: |
          This PR updates the balance validation baseline from the latest successful CI run.

          **Run ID:** ${{ github.run_id }}
          **Commit:** ${{ github.sha }}

          The baseline is updated automatically when:
          - The balance validation workflow passes on main branch
          - No regressions are detected against the previous baseline

          Please review the changes before merging.
        labels: |
          automated
          balance
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
{
2+
"version": "1.0",
3+
"created_at": "2025-01-01T00:00:00Z",
4+
"updated_at": "2025-01-01T00:00:00Z",
5+
"git_commit": null,
6+
"description": "Initial balance baseline - placeholder for first CI run",
7+
"strategy_stats": {
8+
"balanced": {
9+
"avg_stability": 0.7,
10+
"min_stability": 0.5,
11+
"max_stability": 0.9,
12+
"win_rate": 0.8,
13+
"avg_actions": 10,
14+
"total_actions": 50,
15+
"count": 5,
16+
"completed": 5,
17+
"failed": 0
18+
},
19+
"aggressive": {
20+
"avg_stability": 0.5,
21+
"min_stability": 0.3,
22+
"max_stability": 0.7,
23+
"win_rate": 0.5,
24+
"avg_actions": 15,
25+
"total_actions": 75,
26+
"count": 5,
27+
"completed": 5,
28+
"failed": 0
29+
},
30+
"diplomatic": {
31+
"avg_stability": 0.65,
32+
"min_stability": 0.45,
33+
"max_stability": 0.85,
34+
"win_rate": 0.6,
35+
"avg_actions": 8,
36+
"total_actions": 40,
37+
"count": 5,
38+
"completed": 5,
39+
"failed": 0
40+
}
41+
},
42+
"difficulty_stats": {
43+
"easy": {
44+
"avg_stability": 0.75,
45+
"count": 5
46+
},
47+
"normal": {
48+
"avg_stability": 0.6,
49+
"count": 5
50+
},
51+
"hard": {
52+
"avg_stability": 0.5,
53+
"count": 5
54+
}
55+
},
56+
"total_sweeps": 45,
57+
"completed_sweeps": 45,
58+
"failed_sweeps": 0,
59+
"total_duration_seconds": 0,
60+
"thresholds": {
61+
"stability_delta_warning": 5.0,
62+
"stability_delta_failure": 10.0,
63+
"win_rate_delta_warning": 5.0,
64+
"win_rate_delta_failure": 10.0,
65+
"unused_content_warning": true
66+
}
67+
}

0 commit comments

Comments
 (0)