diff --git a/.github/workflows/skill-evaluation.yml b/.github/workflows/skill-evaluation.yml index 00e9157..f2338d4 100644 --- a/.github/workflows/skill-evaluation.yml +++ b/.github/workflows/skill-evaluation.yml @@ -126,6 +126,8 @@ jobs: path: | tests/results.md tests/results.json + retention-days: 7 + if-no-files-found: warn - name: Check evaluation results if: always() @@ -133,3 +135,34 @@ jobs: if [ "${{ steps.harness.outputs.exit_code }}" != "0" ]; then echo "::warning::Some skills have failing scenarios in real SDK evaluation" fi + + - name: Add detailed failure annotations + if: always() + working-directory: tests + run: | + if [ "${{ steps.harness.outputs.exit_code }}" != "0" ] && [ -f results.json ]; then + node - <<'EOF' + const fs = require('fs'); + + const raw = fs.readFileSync('results.json', 'utf-8'); + const data = JSON.parse(raw); + const skills = data.skills ?? []; + + for (const skill of skills) { + const results = skill.results ?? []; + for (const result of results) { + if (result.passed) continue; + const findings = result.findings ?? []; + const failures = findings.filter(f => f.severity === 'error'); + if (failures.length === 0) continue; + + const top = failures.slice(0, 3); + const details = top.map(f => `- ${f.message}${f.suggestion ? ` (💡 ${f.suggestion})` : ''}`).join(' '); + const summary = `${skill.skill_name} / ${result.scenario} failed (score: ${Number(result.score).toFixed(1)})`; + const message = `${summary} ${details}`; + + console.log(`::error::${message}`); + } + } + EOF + fi diff --git a/tests/harness/runner.ts b/tests/harness/runner.ts index 68be2dc..4d4c4f8 100644 --- a/tests/harness/runner.ts +++ b/tests/harness/runner.ts @@ -296,6 +296,7 @@ export class SkillEvaluationRunner { if (this.verbose) { const status = evalResult.passed ? chalk.green("✓") : chalk.red("✗"); console.log(` ${status} Score: ${evalResult.score.toFixed(1)}`); + this.printVerboseScenarioResult(evalResult, scenario); } } @@ -372,6 +373,132 @@ export class SkillEvaluationRunner { return Math.max(0, Math.min(100, score)); } + private getSeverityStyle(severity: Severity | string): (text: string) => string { + const severityValue = typeof severity === "string" ? severity : severity; + switch (severityValue) { + case "error": + return chalk.red; + case "warning": + return chalk.yellow; + case "info": + return chalk.blue; + default: + return chalk.white; + } + } + + private printFinding(finding: Finding): void { + const severityStyle = this.getSeverityStyle(finding.severity); + const severityLabel = finding.severity.toUpperCase(); + + console.log( + ` ${severityStyle(`[${severityLabel}]`)} ${finding.rule}: ${finding.message}` + ); + + if (finding.suggestion) { + console.log(` 💡 ${finding.suggestion}`); + } + + if (finding.codeSnippet) { + console.log(chalk.dim(` ${finding.codeSnippet}`)); + } + } + + private printFindings(findings: Finding[]): void { + if (findings.length === 0) { + return; + } + + console.log(" Findings:"); + for (const finding of findings) { + this.printFinding(finding); + } + } + + private printScenarioPatternChecks(code: string, scenario: TestScenario): void { + const expectedPatterns = scenario.expectedPatterns ?? []; + const forbiddenPatterns = scenario.forbiddenPatterns ?? []; + + if (expectedPatterns.length === 0 && forbiddenPatterns.length === 0) { + return; + } + + console.log(" Scenario checks:"); + + for (const pattern of expectedPatterns) { + const found = code.includes(pattern); + const status = found ? chalk.green("✓") : chalk.red("✗"); + console.log(` ${status} Expected: ${pattern}`); + } + + for (const pattern of forbiddenPatterns) { + const found = code.includes(pattern); + const status = found ? chalk.red("✗") : chalk.green("✓"); + console.log(` ${status} Forbidden: ${pattern}`); + } + } + + private printAcceptanceCriteriaMatches( + matchedCorrect: string[], + matchedIncorrect: string[] + ): void { + const uniqueCorrect = Array.from(new Set(matchedCorrect)); + const uniqueIncorrect = Array.from(new Set(matchedIncorrect)); + + if (uniqueCorrect.length === 0 && uniqueIncorrect.length === 0) { + return; + } + + console.log(" Acceptance criteria:"); + if (uniqueCorrect.length > 0) { + console.log( + ` ${chalk.green("✓")} Matched sections: ${uniqueCorrect.join(", ")}` + ); + } + if (uniqueIncorrect.length > 0) { + console.log( + ` ${chalk.red("✗")} Incorrect sections: ${uniqueIncorrect.join(", ")}` + ); + } + } + + private printVerboseScenarioResult( + result: EvaluationResult, + scenario: TestScenario + ): void { + this.printScenarioPatternChecks(result.generatedCode, scenario); + this.printAcceptanceCriteriaMatches( + result.matchedCorrect, + result.matchedIncorrect + ); + this.printFindings(result.findings); + } + + private printVerboseRalphResult( + result: RalphLoopResult, + scenario: TestScenario + ): void { + if (result.iterations.length === 0) { + return; + } + + const scoreTrail = result.iterations + .map((iteration) => `#${iteration.iteration} ${iteration.score.toFixed(1)}`) + .join(" → "); + + console.log(` Iterations: ${scoreTrail}`); + console.log( + ` Improvement: ${result.improvement >= 0 ? "+" : ""}${result.improvement.toFixed(1)} pts` + ); + + const lastIteration = result.iterations[result.iterations.length - 1]; + if (!lastIteration) { + return; + } + this.printScenarioPatternChecks(lastIteration.generatedCode, scenario); + this.printFindings(lastIteration.findings); + } + async runWithLoop( skillName: string, scenarioFilter?: string, @@ -427,6 +554,7 @@ export class SkillEvaluationRunner { console.log( ` ${status} Score: ${result.finalScore.toFixed(1)} (${result.iterations.length} iterations, ${result.stopReason})` ); + this.printVerboseRalphResult(result, scenario); } } @@ -562,20 +690,41 @@ function formatAllSkillsMarkdown(summary: AllSkillsSummary): string { lines.push("## Failed Scenarios"); lines.push(""); - for (const skill of failedSkills) { - lines.push(`### ${skill.skillName}`); - lines.push(""); - for (const result of skill.results) { - if (!result.passed) { - lines.push(`- **${result.scenario}** (score: ${result.score.toFixed(1)})`); - const errors = result.findings.filter(f => f.severity === Severity.ERROR); - for (const err of errors.slice(0, 3)) { // Limit to 3 errors per scenario - lines.push(` - ${err.message}`); + for (const skill of failedSkills) { + lines.push(`### ${skill.skillName}`); + lines.push(""); + for (const result of skill.results) { + if (!result.passed) { + lines.push(`- **${result.scenario}** (score: ${result.score.toFixed(1)})`); + if (result.findings.length > 0) { + const errors = result.findings.filter(f => f.severity === Severity.ERROR); + const warnings = result.findings.filter(f => f.severity === Severity.WARNING); + const infos = result.findings.filter(f => f.severity === Severity.INFO); + const ordered = [...errors, ...warnings, ...infos].slice(0, 5); + for (const finding of ordered) { + const severity = finding.severity.toUpperCase(); + lines.push(` - [${severity}] ${finding.message}`); + if (finding.suggestion) { + lines.push(` - 💡 ${finding.suggestion}`); + } + } + } + if (result.matchedIncorrect.length > 0) { + lines.push(" - Incorrect sections:"); + for (const section of result.matchedIncorrect) { + lines.push(` - ${section}`); + } + } + if (result.matchedCorrect.length > 0) { + lines.push(" - Matched sections:"); + for (const section of result.matchedCorrect) { + lines.push(` - ${section}`); + } + } } } + lines.push(""); } - lines.push(""); - } } return lines.join("\n"); @@ -734,6 +883,23 @@ async function main(): Promise { } } else { failedSkills++; + if (options.verbose) { + console.log(` Failed scenarios: ${summary.failed}/${summary.totalScenarios}`); + for (const result of summary.results) { + if (!result.passed) { + console.log(` - ${result.scenario} (score: ${result.score.toFixed(1)})`); + const errors = result.findings.filter( + (finding) => finding.severity === Severity.ERROR + ); + for (const error of errors.slice(0, 3)) { + console.log(` ${chalk.red("[ERROR]")} ${error.message}`); + if (error.suggestion) { + console.log(` 💡 ${error.suggestion}`); + } + } + } + } + } if (!options.verbose) { console.log(chalk.red(`✗ ${summary.passed}/${summary.totalScenarios}`)); }