Skip to content

Commit afdf792

Browse files
cameroncookecodex
andcommitted
fix(benchmarks): Address Claude UI review feedback
Harden the Claude UI benchmark harness so preflight retries transient UI inspection failures, temporary simulators are cleaned up after partial setup failures, and suite config validation fails before expensive runs. Also report unsettled post-action runtime snapshots as recoverable UI automation warnings instead of returning unstable refs. Co-Authored-By: OpenAI Codex <codex@openai.com>
1 parent bbab5f7 commit afdf792

9 files changed

Lines changed: 396 additions & 94 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222

2323
### Fixed
2424

25+
- Fixed Claude UI benchmark preflight so transient malformed or still-loading UI snapshots no longer crash the harness or finish before app UI is observable.
26+
- Fixed Claude UI benchmark config handling so invalid `failurePatterns` regexes fail before a suite starts and partial `allowedVariance` overrides preserve defaults for omitted metrics.
27+
- Fixed Claude UI benchmark temporary simulator cleanup so simulators created by the harness are deleted even when post-creation setup fails.
28+
- Fixed UI action snapshot refreshes so timeout while waiting for a settled post-action snapshot returns a recoverable warning instead of unstable element refs.
2529
- Fixed Claude UI benchmark suite runs so temporary simulators are applied through an isolated per-run MCP config instead of being overridden by repo or example-project config defaults.
2630
- Fixed simulator launch failures before simulator-name resolution so they are not reported as macOS launch failures.
2731
- Fixed CLI JSON output so simulator-name resolution failures return the structured error envelope instead of plain stderr.

src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { tmpdir } from 'node:os';
33
import path from 'node:path';
44
import { fileURLToPath } from 'node:url';
55
import { compareBenchmark, diffToolSequence } from '../compare.ts';
6+
import { readConfig } from '../config.ts';
67
import { resolveParserPath } from '../harness.ts';
78
import { analyzeClaudeJsonl } from '../transcript.ts';
89
import type { BenchmarkConfig, BenchmarkRunMetadata } from '../types.ts';
@@ -199,6 +200,19 @@ describe('Claude UI benchmark analysis', () => {
199200
expect(audit.patternFailures).toHaveLength(1);
200201
});
201202

203+
it('rejects malformed failure pattern regexes when loading config', () => {
204+
expect(() =>
205+
readConfig(
206+
{
207+
name: 'weather',
208+
prompt: 'prompt.md',
209+
failurePatterns: ['stale element ref', '[unclosed'],
210+
},
211+
'weather.yml',
212+
),
213+
).toThrow('weather.yml.failurePatterns[1]: invalid regular expression');
214+
});
215+
202216
it('warns by default when tool sequences drift', () => {
203217
const config: BenchmarkConfig = {
204218
name: 'weather',
@@ -251,6 +265,57 @@ describe('Claude UI benchmark analysis', () => {
251265
expect(result.pass).toBe(false);
252266
});
253267

268+
it('preserves default allowed variance when config only overrides some keys', () => {
269+
const config: BenchmarkConfig = readConfig(
270+
{
271+
name: 'weather',
272+
prompt: 'prompt.md',
273+
baseline: {
274+
totalToolCalls: 3,
275+
wallClockSeconds: 120,
276+
},
277+
allowedVariance: {
278+
wallClockSeconds: 30,
279+
},
280+
},
281+
'weather.yml',
282+
);
283+
const audit = analyzeClaudeJsonl(
284+
[
285+
line({
286+
type: 'assistant',
287+
message: {
288+
content: [
289+
{ type: 'tool_use', id: 'tool-1', name: 'Read', input: {} },
290+
{ type: 'tool_use', id: 'tool-2', name: 'Edit', input: {} },
291+
{ type: 'tool_use', id: 'tool-3', name: 'Write', input: {} },
292+
],
293+
},
294+
}),
295+
].join('\n'),
296+
{ mcpToolPrefix: toolPrefix },
297+
);
298+
299+
const result = compareBenchmark(config, audit, runMetadata(145));
300+
301+
expect(result.metrics).toEqual([
302+
{
303+
name: 'totalToolCalls',
304+
actual: 3,
305+
expected: 3,
306+
allowedVariance: 0,
307+
pass: true,
308+
},
309+
{
310+
name: 'wallClockSeconds',
311+
actual: 145,
312+
expected: 120,
313+
allowedVariance: 30,
314+
pass: true,
315+
},
316+
]);
317+
});
318+
254319
it('fails on tool sequence drift when strict mode is enabled', () => {
255320
const config: BenchmarkConfig = {
256321
name: 'weather',

src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ function describeUiWithLabel(label: string): string {
3838
}
3939

4040
const emptyDescribeUi = JSON.stringify({ elements: [] });
41+
const loadedDescribeUi = describeUiWithLabel('Application Ready');
4142

4243
describe('Claude UI first-run prompt preflight', () => {
4344
it('launches the app, dismisses configured first-run prompts, and terminates before Claude runs', async () => {
@@ -47,7 +48,7 @@ describe('Claude UI first-run prompt preflight', () => {
4748
const describeOutputs = [
4849
describeUiWithLabel('Continue'),
4950
describeUiWithLabel('Not Now'),
50-
emptyDescribeUi,
51+
loadedDescribeUi,
5152
];
5253
const executor: LifecycleCommandExecutor = async (opts) => {
5354
commands.push(opts);
@@ -120,7 +121,7 @@ describe('Claude UI first-run prompt preflight', () => {
120121
{ exitCode: 1, stdout: '' },
121122
{ exitCode: 1, stdout: '' },
122123
{ exitCode: 0, stdout: describeUiWithLabel('Continue') },
123-
{ exitCode: 0, stdout: emptyDescribeUi },
124+
{ exitCode: 0, stdout: loadedDescribeUi },
124125
];
125126
const executor: LifecycleCommandExecutor = async (opts) => {
126127
commands.push(opts);
@@ -175,7 +176,7 @@ describe('Claude UI first-run prompt preflight', () => {
175176
const commands: LifecycleCommandOptions[] = [];
176177
const describeResults = [
177178
{ exitCode: 1, stdout: '' },
178-
{ exitCode: 0, stdout: emptyDescribeUi },
179+
{ exitCode: 0, stdout: loadedDescribeUi },
179180
];
180181
let now = 1_000;
181182
const executor: LifecycleCommandExecutor = async (opts) => {
@@ -220,7 +221,7 @@ describe('Claude UI first-run prompt preflight', () => {
220221
const executor: LifecycleCommandExecutor = async (opts) => {
221222
commands.push(opts);
222223
if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') {
223-
return { exitCode: 0, stdout: emptyDescribeUi, stderr: '', durationSeconds: 0.01 };
224+
return { exitCode: 0, stdout: loadedDescribeUi, stderr: '', durationSeconds: 0.01 };
224225
}
225226
return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
226227
};
@@ -251,6 +252,88 @@ describe('Claude UI first-run prompt preflight', () => {
251252
expect(log).toContain('First-run prompt preflight: complete');
252253
});
253254

255+
it('waits for observable UI before treating missing prompt labels as complete', async () => {
256+
const logPath = await tempLogPath();
257+
const commands: LifecycleCommandOptions[] = [];
258+
const describeResults = [
259+
{ exitCode: 0, stdout: emptyDescribeUi },
260+
{ exitCode: 0, stdout: loadedDescribeUi },
261+
];
262+
const executor: LifecycleCommandExecutor = async (opts) => {
263+
commands.push(opts);
264+
if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') {
265+
const result = describeResults.shift() ?? { exitCode: 0, stdout: loadedDescribeUi };
266+
return { ...result, stderr: '', durationSeconds: 0.01 };
267+
}
268+
return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
269+
};
270+
let now = 1_000;
271+
272+
await dismissFirstRunPrompts({
273+
config: config(),
274+
simulatorId: 'TEMP-SIM-123',
275+
cwd: '/repo',
276+
logPath,
277+
executor,
278+
axePath: '/mock/axe',
279+
timing: {
280+
now: () => now,
281+
sleep: async (milliseconds) => {
282+
now += milliseconds;
283+
},
284+
},
285+
});
286+
287+
expect(commands.map((item) => [item.command, ...item.args])).toEqual([
288+
['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'],
289+
['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
290+
['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
291+
['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'],
292+
]);
293+
});
294+
295+
it('retries malformed describe-ui output as transiently unavailable', async () => {
296+
const logPath = await tempLogPath();
297+
const commands: LifecycleCommandOptions[] = [];
298+
const describeResults = [
299+
{ exitCode: 0, stdout: 'not json' },
300+
{ exitCode: 0, stdout: loadedDescribeUi },
301+
];
302+
const executor: LifecycleCommandExecutor = async (opts) => {
303+
commands.push(opts);
304+
if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') {
305+
const result = describeResults.shift() ?? { exitCode: 0, stdout: loadedDescribeUi };
306+
return { ...result, stderr: '', durationSeconds: 0.01 };
307+
}
308+
return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
309+
};
310+
let now = 1_000;
311+
312+
await dismissFirstRunPrompts({
313+
config: config(),
314+
simulatorId: 'TEMP-SIM-123',
315+
cwd: '/repo',
316+
logPath,
317+
executor,
318+
axePath: '/mock/axe',
319+
timing: {
320+
now: () => now,
321+
sleep: async (milliseconds) => {
322+
now += milliseconds;
323+
},
324+
},
325+
});
326+
327+
expect(commands.map((item) => [item.command, ...item.args])).toEqual([
328+
['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'],
329+
['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
330+
['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
331+
['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'],
332+
]);
333+
const log = await readFile(logPath, 'utf8');
334+
expect(log).toContain('First-run prompt preflight: UI unavailable; retrying (exit null)');
335+
});
336+
254337
it('does nothing when a suite has no configured first-run prompt dismissals', async () => {
255338
const logPath = await tempLogPath();
256339
const commands: LifecycleCommandOptions[] = [];

src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,6 @@ function config(overrides: Partial<BenchmarkConfig> = {}): BenchmarkConfig {
4242
};
4343
}
4444

45-
async function tempLogPath(): Promise<string> {
46-
const directory = await mkdtemp(path.join(os.tmpdir(), 'claude-ui-lifecycle-'));
47-
return path.join(directory, 'simulator-lifecycle.log');
48-
}
49-
5045
function inMemoryLifecycleLog() {
5146
const messages: string[] = [];
5247
return {
@@ -225,6 +220,51 @@ describe('Claude UI temporary simulator lifecycle', () => {
225220
);
226221
});
227222

223+
it('deletes the harness-created simulator when setup fails after creation', async () => {
224+
const logPath = '/tmp/simulator-lifecycle.log';
225+
const log = inMemoryLifecycleLog();
226+
const commands: LifecycleCommandOptions[] = [];
227+
const executor: LifecycleCommandExecutor = async (opts) => {
228+
commands.push(opts);
229+
if (opts.args[1] === 'create') {
230+
return { exitCode: 0, stdout: 'TEMP-SIM-SETUP-FAIL\n', stderr: '', durationSeconds: 0.01 };
231+
}
232+
if (opts.args[1] === 'bootstatus') {
233+
return { exitCode: 1, stdout: '', stderr: 'not ready', durationSeconds: 0.01 };
234+
}
235+
return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
236+
};
237+
238+
await expect(
239+
prepareTemporarySimulator({
240+
config: config(),
241+
suiteSlug: 'weather',
242+
timestamp: '20260522T120000Z',
243+
cwd: '/repo',
244+
logPath,
245+
executor,
246+
logWriter: log.writer,
247+
readinessDelayMs: 0,
248+
}),
249+
).rejects.toThrow('temporary simulator did not reach bootstatus');
250+
251+
expect(commands.map((item) => [item.command, ...item.args])).toEqual([
252+
[
253+
'xcrun',
254+
'simctl',
255+
'create',
256+
'XcodeBuildMCP Claude UI weather 20260522T120000Z',
257+
'iPhone 17 Pro Max',
258+
],
259+
['xcrun', 'simctl', 'boot', 'TEMP-SIM-SETUP-FAIL'],
260+
['xcrun', 'simctl', 'bootstatus', 'TEMP-SIM-SETUP-FAIL', '-b'],
261+
['xcrun', 'simctl', 'delete', 'TEMP-SIM-SETUP-FAIL'],
262+
]);
263+
expect(log.messages.join('\n')).toContain(
264+
'Setup failed, cleaning up simulator TEMP-SIM-SETUP-FAIL',
265+
);
266+
});
267+
228268
it('logs deletion failures as best effort instead of throwing', async () => {
229269
const logPath = '/tmp/simulator-lifecycle.log';
230270
const log = inMemoryLifecycleLog();

src/benchmarks/claude-ui/config.ts

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,37 @@ function readAllowedVariance(raw: unknown, source: string): Partial<AllowedVaria
108108
if (raw === undefined) return undefined;
109109
if (!isRecord(raw)) throw new Error(`${source}: expected object`);
110110

111-
return {
112-
totalToolCalls: readOptionalNumber(raw, 'totalToolCalls', source),
113-
mcpToolCalls: readOptionalNumber(raw, 'mcpToolCalls', source),
114-
uiAutomationCalls: readOptionalNumber(raw, 'uiAutomationCalls', source),
115-
wallClockSeconds: readOptionalNumber(raw, 'wallClockSeconds', source),
116-
toolCalls: readOptionalNumber(raw, 'toolCalls', source),
117-
};
111+
const variance: Partial<AllowedVariance> = {};
112+
const totalToolCalls = readOptionalNumber(raw, 'totalToolCalls', source);
113+
if (totalToolCalls !== undefined) variance.totalToolCalls = totalToolCalls;
114+
const mcpToolCalls = readOptionalNumber(raw, 'mcpToolCalls', source);
115+
if (mcpToolCalls !== undefined) variance.mcpToolCalls = mcpToolCalls;
116+
const uiAutomationCalls = readOptionalNumber(raw, 'uiAutomationCalls', source);
117+
if (uiAutomationCalls !== undefined) variance.uiAutomationCalls = uiAutomationCalls;
118+
const wallClockSeconds = readOptionalNumber(raw, 'wallClockSeconds', source);
119+
if (wallClockSeconds !== undefined) variance.wallClockSeconds = wallClockSeconds;
120+
const toolCalls = readOptionalNumber(raw, 'toolCalls', source);
121+
if (toolCalls !== undefined) variance.toolCalls = toolCalls;
122+
return variance;
123+
}
124+
125+
function readFailurePatterns(raw: unknown, source: string): string[] | undefined {
126+
const patterns = readOptionalStringArray(
127+
raw as Record<string, unknown>,
128+
'failurePatterns',
129+
source,
130+
);
131+
for (const [index, pattern] of (patterns ?? []).entries()) {
132+
try {
133+
new RegExp(pattern, 'i');
134+
} catch (error) {
135+
const message = error instanceof Error ? error.message : String(error);
136+
throw new Error(
137+
`${source}.failurePatterns[${index}]: invalid regular expression: ${message}`,
138+
);
139+
}
140+
}
141+
return patterns;
118142
}
119143

120144
function readFirstRunPromptDismissals(
@@ -153,7 +177,7 @@ export function readConfig(raw: unknown, source: string): BenchmarkConfig {
153177
workingDirectory: readOptionalString(raw, 'workingDirectory', source),
154178
expectedToolSequence: readOptionalStringArray(raw, 'expectedToolSequence', source),
155179
sequence: readSequenceConfig(raw.sequence, `${source}.sequence`),
156-
failurePatterns: readOptionalStringArray(raw, 'failurePatterns', source),
180+
failurePatterns: readFailurePatterns(raw, source),
157181
temporarySimulator: readOptionalBoolean(raw, 'temporarySimulator', source),
158182
firstRunPromptDismissals: readFirstRunPromptDismissals(
159183
raw.firstRunPromptDismissals,
@@ -186,16 +210,3 @@ export async function loadSuite(suitePath: string): Promise<BenchmarkConfig> {
186210
const raw = parseYaml(await readFile(suitePath, 'utf8')) as unknown;
187211
return readConfig(raw, suitePath);
188212
}
189-
190-
export function sessionDefaultsEnv(
191-
sessionDefaults: Record<string, unknown> | undefined,
192-
): Record<string, string> {
193-
const validated = validateSessionDefaults(sessionDefaults);
194-
if (!validated) return {};
195-
196-
const env: Record<string, string> = {};
197-
for (const [key, value] of Object.entries(validated)) {
198-
env[sessionDefaultEnvNames[key]] = String(value);
199-
}
200-
return env;
201-
}

0 commit comments

Comments
 (0)