getsentry
diff --git a/‎CHANGELOG.md‎
Lines changed: 4 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts‎
Lines changed: 65 additions & 0 deletions b/‎src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts‎
Lines changed: 87 additions & 4 deletions b/‎src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts‎
Lines changed: 87 additions & 4 deletions
diff --git a/‎src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts‎
Lines changed: 45 additions & 5 deletions b/‎src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts‎
Lines changed: 45 additions & 5 deletions
diff --git a/‎src/benchmarks/claude-ui/config.ts‎
Lines changed: 32 additions & 21 deletions b/‎src/benchmarks/claude-ui/config.ts‎
Lines changed: 32 additions & 21 deletions
@@ -22,6 +22,10 @@
 
 ### Fixed
 
+- Fixed Claude UI benchmark preflight so transient malformed or still-loading UI snapshots no longer crash the harness or let preflight finish before the app UI is observable.
+- Fixed Claude UI benchmark config handling so invalid `failurePatterns` regexes fail before a suite starts and partial `allowedVariance` overrides preserve defaults for omitted metrics.
+- Fixed Claude UI benchmark temporary simulator cleanup so simulators created by the harness are deleted even when post-creation setup fails.
+- Fixed UI action snapshot refreshes so a timeout while waiting for a settled post-action snapshot returns a recoverable warning instead of unstable element refs.
 - Fixed Claude UI benchmark suite runs so temporary simulators are applied through an isolated per-run MCP config instead of being overridden by repo or example-project config defaults.
 - Fixed simulator launch failures before simulator-name resolution so they are not reported as macOS launch failures.
 - Fixed CLI JSON output so simulator-name resolution failures return the structured error envelope instead of plain stderr.
 
@@ -3,6 +3,7 @@ import { tmpdir } from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { compareBenchmark, diffToolSequence } from '../compare.ts';
+import { readConfig } from '../config.ts';
 import { resolveParserPath } from '../harness.ts';
 import { analyzeClaudeJsonl } from '../transcript.ts';
 import type { BenchmarkConfig, BenchmarkRunMetadata } from '../types.ts';
@@ -199,6 +200,19 @@ describe('Claude UI benchmark analysis', () => {
     expect(audit.patternFailures).toHaveLength(1);
   });
 
+  it('rejects malformed failure pattern regexes when loading config', () => {
+    expect(() =>
+      readConfig(
+        {
+          name: 'weather',
+          prompt: 'prompt.md',
+          failurePatterns: ['stale element ref', '[unclosed'],
+        },
+        'weather.yml',
+      ),
+    ).toThrow('weather.yml.failurePatterns[1]: invalid regular expression');
+  });
+
   it('warns by default when tool sequences drift', () => {
     const config: BenchmarkConfig = {
       name: 'weather',
@@ -251,6 +265,57 @@ describe('Claude UI benchmark analysis', () => {
     expect(result.pass).toBe(false);
   });
 
+  it('preserves default allowed variance when config only overrides some keys', () => {
+    const config: BenchmarkConfig = readConfig(
+      {
+        name: 'weather',
+        prompt: 'prompt.md',
+        baseline: {
+          totalToolCalls: 3,
+          wallClockSeconds: 120,
+        },
+        allowedVariance: {
+          wallClockSeconds: 30,
+        },
+      },
+      'weather.yml',
+    );
+    const audit = analyzeClaudeJsonl(
+      [
+        line({
+          type: 'assistant',
+          message: {
+            content: [
+              { type: 'tool_use', id: 'tool-1', name: 'Read', input: {} },
+              { type: 'tool_use', id: 'tool-2', name: 'Edit', input: {} },
+              { type: 'tool_use', id: 'tool-3', name: 'Write', input: {} },
+            ],
+          },
+        }),
+      ].join('\n'),
+      { mcpToolPrefix: toolPrefix },
+    );
+
+    const result = compareBenchmark(config, audit, runMetadata(145));
+
+    expect(result.metrics).toEqual([
+      {
+        name: 'totalToolCalls',
+        actual: 3,
+        expected: 3,
+        allowedVariance: 0,
+        pass: true,
+      },
+      {
+        name: 'wallClockSeconds',
+        actual: 145,
+        expected: 120,
+        allowedVariance: 30,
+        pass: true,
+      },
+    ]);
+  });
+
   it('fails on tool sequence drift when strict mode is enabled', () => {
     const config: BenchmarkConfig = {
       name: 'weather',
 
@@ -38,6 +38,7 @@ function describeUiWithLabel(label: string): string {
 }
 
 const emptyDescribeUi = JSON.stringify({ elements: [] });
+const loadedDescribeUi = describeUiWithLabel('Application Ready');
 
 describe('Claude UI first-run prompt preflight', () => {
   it('launches the app, dismisses configured first-run prompts, and terminates before Claude runs', async () => {
@@ -47,7 +48,7 @@ describe('Claude UI first-run prompt preflight', () => {
     const describeOutputs = [
       describeUiWithLabel('Continue'),
       describeUiWithLabel('Not Now'),
-      emptyDescribeUi,
+      loadedDescribeUi,
     ];
     const executor: LifecycleCommandExecutor = async (opts) => {
       commands.push(opts);
@@ -120,7 +121,7 @@ describe('Claude UI first-run prompt preflight', () => {
       { exitCode: 1, stdout: '' },
       { exitCode: 1, stdout: '' },
       { exitCode: 0, stdout: describeUiWithLabel('Continue') },
-      { exitCode: 0, stdout: emptyDescribeUi },
+      { exitCode: 0, stdout: loadedDescribeUi },
     ];
     const executor: LifecycleCommandExecutor = async (opts) => {
       commands.push(opts);
@@ -175,7 +176,7 @@ describe('Claude UI first-run prompt preflight', () => {
     const commands: LifecycleCommandOptions[] = [];
     const describeResults = [
       { exitCode: 1, stdout: '' },
-      { exitCode: 0, stdout: emptyDescribeUi },
+      { exitCode: 0, stdout: loadedDescribeUi },
     ];
     let now = 1_000;
     const executor: LifecycleCommandExecutor = async (opts) => {
@@ -220,7 +221,7 @@ describe('Claude UI first-run prompt preflight', () => {
     const executor: LifecycleCommandExecutor = async (opts) => {
       commands.push(opts);
       if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') {
-        return { exitCode: 0, stdout: emptyDescribeUi, stderr: '', durationSeconds: 0.01 };
+        return { exitCode: 0, stdout: loadedDescribeUi, stderr: '', durationSeconds: 0.01 };
       }
       return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
     };
@@ -251,6 +252,88 @@ describe('Claude UI first-run prompt preflight', () => {
     expect(log).toContain('First-run prompt preflight: complete');
   });
 
+  it('waits for observable UI before treating missing prompt labels as complete', async () => {
+    const logPath = await tempLogPath();
+    const commands: LifecycleCommandOptions[] = [];
+    const describeResults = [
+      { exitCode: 0, stdout: emptyDescribeUi },
+      { exitCode: 0, stdout: loadedDescribeUi },
+    ];
+    const executor: LifecycleCommandExecutor = async (opts) => {
+      commands.push(opts);
+      if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') {
+        const result = describeResults.shift() ?? { exitCode: 0, stdout: loadedDescribeUi };
+        return { ...result, stderr: '', durationSeconds: 0.01 };
+      }
+      return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
+    };
+    let now = 1_000;
+
+    await dismissFirstRunPrompts({
+      config: config(),
+      simulatorId: 'TEMP-SIM-123',
+      cwd: '/repo',
+      logPath,
+      executor,
+      axePath: '/mock/axe',
+      timing: {
+        now: () => now,
+        sleep: async (milliseconds) => {
+          now += milliseconds;
+        },
+      },
+    });
+
+    expect(commands.map((item) => [item.command, ...item.args])).toEqual([
+      ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'],
+      ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
+      ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
+      ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'],
+    ]);
+  });
+
+  it('retries malformed describe-ui output as transiently unavailable', async () => {
+    const logPath = await tempLogPath();
+    const commands: LifecycleCommandOptions[] = [];
+    const describeResults = [
+      { exitCode: 0, stdout: 'not json' },
+      { exitCode: 0, stdout: loadedDescribeUi },
+    ];
+    const executor: LifecycleCommandExecutor = async (opts) => {
+      commands.push(opts);
+      if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') {
+        const result = describeResults.shift() ?? { exitCode: 0, stdout: loadedDescribeUi };
+        return { ...result, stderr: '', durationSeconds: 0.01 };
+      }
+      return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
+    };
+    let now = 1_000;
+
+    await dismissFirstRunPrompts({
+      config: config(),
+      simulatorId: 'TEMP-SIM-123',
+      cwd: '/repo',
+      logPath,
+      executor,
+      axePath: '/mock/axe',
+      timing: {
+        now: () => now,
+        sleep: async (milliseconds) => {
+          now += milliseconds;
+        },
+      },
+    });
+
+    expect(commands.map((item) => [item.command, ...item.args])).toEqual([
+      ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'],
+      ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
+      ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'],
+      ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'],
+    ]);
+    const log = await readFile(logPath, 'utf8');
+    expect(log).toContain('First-run prompt preflight: UI unavailable; retrying (exit null)');
+  });
+
   it('does nothing when a suite has no configured first-run prompt dismissals', async () => {
     const logPath = await tempLogPath();
     const commands: LifecycleCommandOptions[] = [];
 
@@ -42,11 +42,6 @@ function config(overrides: Partial<BenchmarkConfig> = {}): BenchmarkConfig {
   };
 }
 
-async function tempLogPath(): Promise<string> {
-  const directory = await mkdtemp(path.join(os.tmpdir(), 'claude-ui-lifecycle-'));
-  return path.join(directory, 'simulator-lifecycle.log');
-}
-
 function inMemoryLifecycleLog() {
   const messages: string[] = [];
   return {
@@ -225,6 +220,51 @@ describe('Claude UI temporary simulator lifecycle', () => {
     );
   });
 
+  it('deletes the harness-created simulator when setup fails after creation', async () => {
+    const logPath = '/tmp/simulator-lifecycle.log';
+    const log = inMemoryLifecycleLog();
+    const commands: LifecycleCommandOptions[] = [];
+    const executor: LifecycleCommandExecutor = async (opts) => {
+      commands.push(opts);
+      if (opts.args[1] === 'create') {
+        return { exitCode: 0, stdout: 'TEMP-SIM-SETUP-FAIL\n', stderr: '', durationSeconds: 0.01 };
+      }
+      if (opts.args[1] === 'bootstatus') {
+        return { exitCode: 1, stdout: '', stderr: 'not ready', durationSeconds: 0.01 };
+      }
+      return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 };
+    };
+
+    await expect(
+      prepareTemporarySimulator({
+        config: config(),
+        suiteSlug: 'weather',
+        timestamp: '20260522T120000Z',
+        cwd: '/repo',
+        logPath,
+        executor,
+        logWriter: log.writer,
+        readinessDelayMs: 0,
+      }),
+    ).rejects.toThrow('temporary simulator did not reach bootstatus');
+
+    expect(commands.map((item) => [item.command, ...item.args])).toEqual([
+      [
+        'xcrun',
+        'simctl',
+        'create',
+        'XcodeBuildMCP Claude UI weather 20260522T120000Z',
+        'iPhone 17 Pro Max',
+      ],
+      ['xcrun', 'simctl', 'boot', 'TEMP-SIM-SETUP-FAIL'],
+      ['xcrun', 'simctl', 'bootstatus', 'TEMP-SIM-SETUP-FAIL', '-b'],
+      ['xcrun', 'simctl', 'delete', 'TEMP-SIM-SETUP-FAIL'],
+    ]);
+    expect(log.messages.join('\n')).toContain(
+      'Setup failed, cleaning up simulator TEMP-SIM-SETUP-FAIL',
+    );
+  });
+
   it('logs deletion failures as best effort instead of throwing', async () => {
     const logPath = '/tmp/simulator-lifecycle.log';
     const log = inMemoryLifecycleLog();
 
@@ -108,13 +108,37 @@ function readAllowedVariance(raw: unknown, source: string): Partial<AllowedVaria
   if (raw === undefined) return undefined;
   if (!isRecord(raw)) throw new Error(`${source}: expected object`);
 
-  return {
-    totalToolCalls: readOptionalNumber(raw, 'totalToolCalls', source),
-    mcpToolCalls: readOptionalNumber(raw, 'mcpToolCalls', source),
-    uiAutomationCalls: readOptionalNumber(raw, 'uiAutomationCalls', source),
-    wallClockSeconds: readOptionalNumber(raw, 'wallClockSeconds', source),
-    toolCalls: readOptionalNumber(raw, 'toolCalls', source),
-  };
+  const variance: Partial<AllowedVariance> = {};
+  const totalToolCalls = readOptionalNumber(raw, 'totalToolCalls', source);
+  if (totalToolCalls !== undefined) variance.totalToolCalls = totalToolCalls;
+  const mcpToolCalls = readOptionalNumber(raw, 'mcpToolCalls', source);
+  if (mcpToolCalls !== undefined) variance.mcpToolCalls = mcpToolCalls;
+  const uiAutomationCalls = readOptionalNumber(raw, 'uiAutomationCalls', source);
+  if (uiAutomationCalls !== undefined) variance.uiAutomationCalls = uiAutomationCalls;
+  const wallClockSeconds = readOptionalNumber(raw, 'wallClockSeconds', source);
+  if (wallClockSeconds !== undefined) variance.wallClockSeconds = wallClockSeconds;
+  const toolCalls = readOptionalNumber(raw, 'toolCalls', source);
+  if (toolCalls !== undefined) variance.toolCalls = toolCalls;
+  return variance;
+}
+
+function readFailurePatterns(raw: unknown, source: string): string[] | undefined { // Reads the failurePatterns string array and rejects entries that do not compile as regular expressions.
+  const patterns = readOptionalStringArray(
+    raw as Record<string, unknown>, // NOTE(review): unchecked cast; assumes the caller already verified raw is an object. Confirm.
+    'failurePatterns',
+    source,
+  );
+  for (const [index, pattern] of (patterns ?? []).entries()) { // The empty-array fallback skips validation when no patterns were returned.
+    try {
+      new RegExp(pattern, 'i'); // Constructed only so that bad syntax throws here; the RegExp object itself is discarded.
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error); // The caught value may not be an Error, so fall back to String().
+      throw new Error( // Re-thrown with the config source and the index of the offending entry.
+        `${source}.failurePatterns[${index}]: invalid regular expression: ${message}`,
+      );
+    }
+  }
+  return patterns; // Returned exactly as read; may be undefined per the declared return type.
 }
 
 function readFirstRunPromptDismissals(
@@ -153,7 +177,7 @@ export function readConfig(raw: unknown, source: string): BenchmarkConfig {
     workingDirectory: readOptionalString(raw, 'workingDirectory', source),
     expectedToolSequence: readOptionalStringArray(raw, 'expectedToolSequence', source),
     sequence: readSequenceConfig(raw.sequence, `${source}.sequence`),
-    failurePatterns: readOptionalStringArray(raw, 'failurePatterns', source),
+    failurePatterns: readFailurePatterns(raw, source),
     temporarySimulator: readOptionalBoolean(raw, 'temporarySimulator', source),
     firstRunPromptDismissals: readFirstRunPromptDismissals(
       raw.firstRunPromptDismissals,
@@ -186,16 +210,3 @@ export async function loadSuite(suitePath: string): Promise<BenchmarkConfig> {
   const raw = parseYaml(await readFile(suitePath, 'utf8')) as unknown;
   return readConfig(raw, suitePath);
 }
-
-export function sessionDefaultsEnv(
-  sessionDefaults: Record<string, unknown> | undefined,
-): Record<string, string> {
-  const validated = validateSessionDefaults(sessionDefaults);
-  if (!validated) return {};
-
-  const env: Record<string, string> = {};
-  for (const [key, value] of Object.entries(validated)) {
-    env[sessionDefaultEnvNames[key]] = String(value);
-  }
-  return env;
-}