-
Notifications
You must be signed in to change notification settings - Fork 37
feat(harness): task envelope budgets for browser work (#1034) #1082
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7cff774
008706b
3856455
627829e
02b3a7b
a4f05ee
a9be0f0
347d250
fdde3d0
8a7f392
d285e28
c1db641
1a224a6
a71c663
af6dd95
a7a4dac
81bd126
4931234
e27b3dc
569865f
e26da31
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,243 @@ | ||
| import type { | ||
| BudgetStatus, | ||
| RecordedToolCall, | ||
| TaskBudgetDecision, | ||
| TaskCounters, | ||
| TaskEnvelopePolicy, | ||
| TaskMeta, | ||
| TaskPhase, | ||
| TaskRecentEvent, | ||
| } from './types'; | ||
|
|
||
// Fallback limits applied by normalizeTaskPolicy when a policy field is
// missing or is not a positive finite number.

/** Default limit for the `consecutiveSameTool` counter. */
const DEFAULT_MAX_CONSECUTIVE_SAME_TOOL = 5;
/** Default limit for the `observationStreak` counter. */
const DEFAULT_MAX_OBSERVATION_STREAK = 6;
/** Default limit for the `failureStreak` counter. */
const DEFAULT_MAX_FAILURE_STREAK = 4;
/** Default limit for the per-URL navigation count. */
const DEFAULT_MAX_SAME_URL_NAVIGATIONS = 3;
/** Number of most recent tool-call summaries kept in `recent_events`. */
const RECENT_EVENT_LIMIT = 10;
|
|
||
/** Tool names that are always counted as observations, regardless of their arguments. */
const OBSERVATION_TOOLS: ReadonlySet<string> = new Set<string>([
  'read_page',
  'find',
  'tabs_context',
  'tabs_list',
  'tabs_get',
  'inspect',
  'page_screenshot',
  'vision_find',
  'oc_assert',
]);

/**
 * Decide whether a tool call counts as an observation rather than an action.
 *
 * @param tool - Name of the tool that was invoked.
 * @param args - Arguments of the call; only `action` is inspected, and only
 *   for the `computer` tool.
 * @returns `true` for any tool listed in `OBSERVATION_TOOLS`, and for the
 *   `computer` tool when its `action` is `'screenshot'`; `false` otherwise.
 */
export function isObservationTool(tool: string, args: Record<string, unknown>): boolean {
  const isComputerScreenshot = tool === 'computer' && args.action === 'screenshot';
  return isComputerScreenshot || OBSERVATION_TOOLS.has(tool);
}
|
|
||
/**
 * Coerce an arbitrary value into a task phase.
 *
 * @param value - Candidate phase, typically read from untyped input.
 * @returns `value` itself when it is exactly one of `'explore'`, `'act'`,
 *   `'verify'`, `'recover'` or `'done'`; otherwise `'explore'`.
 */
export function normalizeTaskPhase(value: unknown): TaskPhase {
  if (
    value === 'explore' ||
    value === 'act' ||
    value === 'verify' ||
    value === 'recover' ||
    value === 'done'
  ) {
    return value;
  }
  // Anything unrecognised (including undefined) falls back to the first phase.
  return 'explore';
}
|
|
||
/**
 * Build a well-formed envelope policy from untyped input.
 *
 * Numeric fields are passed through `positiveInt`, so anything that is not a
 * positive finite number is dropped. The four streak/repeat limits then fall
 * back to their `DEFAULT_*` constants, while `maxToolCalls`, `maxWallMs` and
 * `checkpointEveryCalls` stay `undefined` when not supplied.
 *
 * @param input - Raw policy value; non-objects are treated as an empty policy.
 * @returns The normalized policy. `allowedDomains` is `undefined` unless the
 *   input provided an array, in which case only its non-empty string entries
 *   are kept.
 */
export function normalizeTaskPolicy(input: unknown): TaskEnvelopePolicy {
  const raw: Record<string, unknown> =
    typeof input === 'object' && input !== null ? (input as Record<string, unknown>) : {};

  const maxToolCalls = positiveInt(raw.maxToolCalls);
  const maxWallMs = positiveInt(raw.maxWallMs);
  const maxConsecutiveSameTool =
    positiveInt(raw.maxConsecutiveSameTool) ?? DEFAULT_MAX_CONSECUTIVE_SAME_TOOL;
  const maxObservationStreak =
    positiveInt(raw.maxObservationStreak) ?? DEFAULT_MAX_OBSERVATION_STREAK;
  const maxFailureStreak = positiveInt(raw.maxFailureStreak) ?? DEFAULT_MAX_FAILURE_STREAK;
  const maxSameUrlNavigations =
    positiveInt(raw.maxSameUrlNavigations) ?? DEFAULT_MAX_SAME_URL_NAVIGATIONS;

  const domainEntries = raw.allowedDomains;
  const allowedDomains = Array.isArray(domainEntries)
    ? domainEntries.filter(
        (entry): entry is string => typeof entry === 'string' && entry.length > 0,
      )
    : undefined;

  const checkpointEveryCalls = positiveInt(raw.checkpointEveryCalls);

  return {
    maxToolCalls,
    maxWallMs,
    maxConsecutiveSameTool,
    maxObservationStreak,
    maxFailureStreak,
    maxSameUrlNavigations,
    allowedDomains,
    checkpointEveryCalls,
  };
}
|
|
||
/**
 * Convert a value to a positive integer, if possible.
 *
 * @param value - Candidate value of any type.
 * @returns The value rounded down to an integer when it is a finite number
 *   whose floor is at least 1; `undefined` for everything else (non-numbers,
 *   NaN, infinities, zero, negatives and fractions below 1).
 */
function positiveInt(value: unknown): number | undefined {
  if (typeof value === 'number' && Number.isFinite(value)) {
    const whole = Math.floor(value);
    if (whole >= 1) return whole;
  }
  return undefined;
}
|
|
||
/**
 * Create a fresh counter record for a task that has not seen any tool calls.
 *
 * @returns A new object on every call, with every numeric counter at zero and
 *   an empty per-URL navigation map.
 */
export function initialCounters(): TaskCounters {
  const zeroed: TaskCounters = {
    toolCalls: 0,
    actionCalls: 0,
    observationCalls: 0,
    failureCalls: 0,
    consecutiveSameTool: 0,
    observationStreak: 0,
    failureStreak: 0,
    sameUrlNavigations: {},
  };
  return zeroed;
}
|
|
||
/**
 * Fold one recorded tool call into a task's metadata.
 *
 * The input `meta` is not mutated: counters (including the per-URL navigation
 * map) are copied before being updated, and a new meta object is returned.
 *
 * @param meta - Current task metadata; missing counters start from
 *   `initialCounters()` and the policy is re-normalized.
 * @param call - The tool call that just finished.
 * @returns Updated metadata carrying the new counters, the budget decision
 *   (`budget_status`, `budget_exceeded`, `recommended_next`), the trimmed
 *   `recent_events` list and the last-tool / last-activity markers.
 */
export function applyToolCallToTask(meta: TaskMeta, call: RecordedToolCall): TaskMeta {
  const policy = normalizeTaskPolicy(meta.policy);
  const prior = meta.counters ?? initialCounters();
  // Spread over a fresh baseline so counters missing from stored data start at
  // zero, and clone the navigation map so the caller's object is left untouched.
  const next: TaskCounters = {
    ...initialCounters(),
    ...prior,
    sameUrlNavigations: { ...(prior.sameUrlNavigations ?? {}) },
  };

  const isObservation = isObservationTool(call.tool, call.args);
  const failed = !call.ok;
  const repeatsPreviousTool = meta.last_tool_name === call.tool;

  next.toolCalls += 1;

  if (isObservation) {
    next.observationCalls += 1;
    next.observationStreak += 1;
  } else {
    next.actionCalls += 1;
    next.observationStreak = 0;
  }

  if (failed) {
    next.failureCalls += 1;
    next.failureStreak += 1;
  } else {
    next.failureStreak = 0;
  }

  next.consecutiveSameTool = repeatsPreviousTool ? next.consecutiveSameTool + 1 : 1;

  // Only `navigate` calls contribute to the per-URL navigation tally.
  if (call.tool === 'navigate') {
    const target = extractUrl(call.args);
    if (target) {
      next.sameUrlNavigations[target] = (next.sameUrlNavigations[target] ?? 0) + 1;
    }
  }

  const decision = evaluateBudget(meta, next, policy, call);

  const latest: TaskRecentEvent = {
    ts: call.ts,
    tool: call.tool,
    ok: call.ok,
    summary: summarizeCall(call, decision),
  };
  const history = [...(meta.recent_events ?? []), latest];
  const recent_events = history.slice(-RECENT_EVENT_LIMIT);

  const budget_exceeded = decision.exceeded.length > 0 ? decision.exceeded : undefined;

  return {
    ...meta,
    phase: normalizeTaskPhase(meta.phase),
    policy,
    counters: next,
    budget_status: decision.status,
    budget_exceeded,
    recommended_next: decision.recommended_next,
    recent_events,
    last_tool_name: call.tool,
    last_activity_at: call.ts,
  };
}
|
|
||
/**
 * Evaluate every budget rule against the counters produced by the latest call.
 *
 * Rules run in a fixed order (tool calls, same-tool repeats, observation
 * streak, failure streak, same-URL navigations, allowed domains, checkpoint
 * cadence, wall-clock), which is also the order of the keys in the returned
 * `exceeded` / `warnings` lists.
 *
 * @param meta - Task metadata; only `created_at` is read, for the wall-clock rule.
 * @param counters - Counters already updated for `call`.
 * @param policy - Normalized policy holding the limits.
 * @param call - The call being evaluated; its URL argument feeds the domain rule.
 * @returns The decision: `status` is `'exceeded'` if any rule was exceeded,
 *   else `'warning'` if any rule warned, else `'ok'`; `recommended_next` is
 *   set for the first two and `undefined` for `'ok'`.
 */
function evaluateBudget(
  meta: TaskMeta,
  counters: TaskCounters,
  policy: TaskEnvelopePolicy,
  call: RecordedToolCall,
): TaskBudgetDecision {
  const exceeded: string[] = [];
  const warnings: string[] = [];

  const counterRules: Array<[string, number, number | undefined]> = [
    ['maxToolCalls', counters.toolCalls, policy.maxToolCalls],
    ['maxConsecutiveSameTool', counters.consecutiveSameTool, policy.maxConsecutiveSameTool],
    ['maxObservationStreak', counters.observationStreak, policy.maxObservationStreak],
    ['maxFailureStreak', counters.failureStreak, policy.maxFailureStreak],
  ];
  for (const [key, value, limit] of counterRules) {
    checkLimit(key, value, limit, exceeded, warnings);
  }

  checkSameUrlNavigationLimit(
    counters.sameUrlNavigations,
    policy.maxSameUrlNavigations,
    exceeded,
    warnings,
  );
  checkAllowedDomain(extractUrl(call.args), policy.allowedDomains, exceeded);
  checkCheckpointCadence(counters.toolCalls, policy.checkpointEveryCalls, warnings);

  // The wall-clock rule is measured against the current time, not the call timestamp.
  if (policy.maxWallMs) {
    const elapsedMs = Date.now() - meta.created_at;
    checkLimit('maxWallMs', elapsedMs, policy.maxWallMs, exceeded, warnings);
  }

  let status: BudgetStatus = 'ok';
  if (exceeded.length > 0) {
    status = 'exceeded';
  } else if (warnings.length > 0) {
    status = 'warning';
  }

  let recommendedNext: TaskBudgetDecision['recommended_next'] = undefined;
  if (status === 'exceeded') {
    recommendedNext = 'change_strategy_or_verify';
  } else if (status === 'warning') {
    recommendedNext = 'checkpoint_or_verify';
  }

  return {
    status,
    exceeded,
    warnings,
    recommended_next: recommendedNext,
  };
}
|
|
||
/**
 * Apply the same-URL navigation limit across every URL seen so far.
 *
 * Pushes `'maxSameUrlNavigations'` at most once: onto `exceeded` when any
 * URL's count is above `limit`, otherwise onto `warnings` when any count has
 * reached 75% of the limit (rounded up).
 *
 * @param sameUrlNavigations - Navigation count per URL.
 * @param limit - Maximum navigations per URL; a falsy limit disables the rule.
 * @param exceeded - Collector for exceeded rule keys (mutated).
 * @param warnings - Collector for warning rule keys (mutated).
 */
function checkSameUrlNavigationLimit(
  sameUrlNavigations: Record<string, number>,
  limit: number | undefined,
  exceeded: string[],
  warnings: string[],
): void {
  if (!limit) return;

  const counts = Object.values(sameUrlNavigations);
  if (counts.some((count) => count > limit)) {
    exceeded.push('maxSameUrlNavigations');
    return;
  }

  const warnAt = Math.ceil(limit * 0.75);
  if (counts.some((count) => count >= warnAt)) {
    warnings.push('maxSameUrlNavigations');
  }
}
|
|
||
/**
 * Flag a URL whose host is outside the task's domain allow-list.
 *
 * Pushes `'allowedDomains'` onto `exceeded` when `url` cannot be parsed or
 * its hostname neither equals an allowed domain nor is a subdomain of one.
 * Comparison is case-insensitive. A leading dot on an allow-list entry
 * (`.example.com`) and a trailing root dot on either side (`example.com.`)
 * are ignored, so the fully-qualified spelling of an allowed host is not
 * rejected.
 *
 * @param url - URL taken from the tool call, if any. No URL means nothing to check.
 * @param allowedDomains - Allow-list; `undefined` or empty disables the rule.
 * @param exceeded - Collector for exceeded rule keys (mutated).
 */
function checkAllowedDomain(
  url: string | undefined,
  allowedDomains: string[] | undefined,
  exceeded: string[],
): void {
  if (!url || !allowedDomains || allowedDomains.length === 0) return;
  let host: string;
  try {
    // URL keeps a trailing root dot in `hostname`; strip it so the absolute
    // form of a name compares equal to its usual spelling.
    host = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
  } catch {
    // An unparseable URL cannot be shown to be inside the allow-list.
    exceeded.push('allowedDomains');
    return;
  }
  const allowed = allowedDomains.some((domain) => {
    const normalized = domain.trim().toLowerCase().replace(/^\./, '').replace(/\.$/, '');
    // The `.` prefix in the suffix test stops `evil-example.com` from matching `example.com`.
    return normalized.length > 0 && (host === normalized || host.endsWith(`.${normalized}`));
  });
  if (!allowed) exceeded.push('allowedDomains');
}
|
|
||
/**
 * Emit a checkpoint reminder every `checkpointEveryCalls` tool calls.
 *
 * @param toolCalls - Total tool calls recorded so far.
 * @param checkpointEveryCalls - Cadence; a falsy value disables the rule.
 * @param warnings - Collector that receives `'checkpointEveryCalls'` when
 *   `toolCalls` is a non-zero multiple of the cadence (mutated).
 */
function checkCheckpointCadence(
  toolCalls: number,
  checkpointEveryCalls: number | undefined,
  warnings: string[],
): void {
  const onCadence =
    Boolean(checkpointEveryCalls) &&
    toolCalls !== 0 &&
    toolCalls % (checkpointEveryCalls as number) === 0;
  if (onCadence) {
    warnings.push('checkpointEveryCalls');
  }
}
|
|
||
/**
 * Compare one counter against its limit and record the outcome.
 *
 * @param key - Rule name to record.
 * @param value - Current value of the counter.
 * @param limit - Limit for the counter; a falsy limit disables the rule.
 * @param exceeded - Receives `key` when `value` is strictly above `limit` (mutated).
 * @param warnings - Receives `key` when `value` is within the limit but has
 *   reached 75% of it, rounded up (mutated).
 */
function checkLimit(
  key: string,
  value: number,
  limit: number | undefined,
  exceeded: string[],
  warnings: string[],
): void {
  if (!limit) return;

  if (value > limit) {
    exceeded.push(key);
    return;
  }

  const warnAt = Math.ceil(limit * 0.75);
  if (value >= warnAt) {
    warnings.push(key);
  }
}
|
|
||
/**
 * Pull the target URL out of a tool call's arguments.
 *
 * `url` is preferred over `href`. Each candidate is validated on its own, so
 * an empty or non-string `url` no longer hides a usable `href`.
 *
 * @param args - Tool call arguments.
 * @returns The first of `args.url`, `args.href` that is a non-empty string,
 *   or `undefined` when neither qualifies.
 */
function extractUrl(args: Record<string, unknown>): string | undefined {
  for (const candidate of [args.url, args.href]) {
    if (typeof candidate === 'string' && candidate.length > 0) return candidate;
  }
  return undefined;
}
|
|
||
|
|
||
/**
 * Render a one-line, human-readable summary of a tool call.
 *
 * @param call - The recorded call; `tool`, `ok` and `durationMs` are used.
 * @param decision - Budget decision for the call.
 * @returns `"<tool> <ok|error> durationMs=<n>"`, followed by
 *   ` budget_exceeded=<keys>` when any rule was exceeded, or otherwise by
 *   ` budget_warning=<keys>` when any rule warned (keys comma-separated).
 */
function summarizeCall(call: RecordedToolCall, decision: TaskBudgetDecision): string {
  const outcome = call.ok ? 'ok' : 'error';
  let line = `${call.tool} ${outcome} durationMs=${call.durationMs}`;

  // Exceeded rules take precedence; warnings are listed only when nothing was exceeded.
  if (decision.exceeded.length > 0) {
    line += ` budget_exceeded=${decision.exceeded.join(',')}`;
  } else if (decision.warnings.length > 0) {
    line += ` budget_warning=${decision.warnings.join(',')}`;
  }

  return line;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| import type { TaskStore } from './store'; | ||
| import type { RecordedToolCall } from './types'; | ||
| import { applyToolCallToTask } from './budget'; | ||
|
|
||
/**
 * Pull a task id out of a tool call's arguments.
 *
 * `taskId` is preferred over `task_id`. Each candidate is validated on its
 * own, so a malformed `taskId` no longer hides a well-formed `task_id`.
 *
 * @param args - Tool call arguments.
 * @returns The first of `args.taskId`, `args.task_id` that is a string of
 *   exactly 16 lowercase hexadecimal characters, or `undefined` when neither
 *   qualifies.
 */
export function extractTaskId(args: Record<string, unknown>): string | undefined {
  for (const candidate of [args.taskId, args.task_id]) {
    if (typeof candidate === 'string' && /^[0-9a-f]{16}$/.test(candidate)) return candidate;
  }
  return undefined;
}
|
|
||
/**
 * Best-effort recording of a tool call against its task envelope.
 *
 * Nothing is recorded when:
 * - `taskId` is missing or the store has no metadata for it;
 * - the task has an owner and the call comes from a different session, or,
 *   for `api-key` / `jwt` principals, from a different tenant;
 * - the task is not a `browser_task`;
 * - the task status is `COMPLETED`, `FAILED` or `CANCELLED`.
 *
 * Otherwise the task metadata is updated via `applyToolCallToTask` and a
 * `tool_call` event is appended. Every store failure, including a failing
 * metadata read, is logged and swallowed, so this function does not reject
 * because of store errors.
 *
 * @param store - Task store to read from and write to.
 * @param taskId - Id of the task the call belongs to, if any.
 * @param call - The tool call that just finished.
 */
export async function recordTaskToolCall(
  store: TaskStore,
  taskId: string | undefined,
  call: RecordedToolCall,
): Promise<void> {
  if (!taskId) return;
  try {
    // The read sits inside the try block so that a failing metadata read is
    // logged like any other store failure instead of rejecting the promise.
    const meta = store.readMetaSync(taskId);
    if (!meta) return;

    if (meta.owner) {
      if (meta.owner.session_id !== call.sessionId) return;
      // Tenant ownership is enforced only for api-key / jwt principals.
      if ((call.principalMode === 'api-key' || call.principalMode === 'jwt') && meta.owner.tenant_id !== call.tenantId) return;
    }
    if (meta.kind !== 'browser_task') return;
    if (meta.status === 'COMPLETED' || meta.status === 'FAILED' || meta.status === 'CANCELLED') return;

    const updated = await store.update(taskId, (cur) => {
      // Re-check the status on the value handed to the updater; presumably it
      // can be newer than the snapshot read above.
      if (cur.status === 'COMPLETED' || cur.status === 'FAILED' || cur.status === 'CANCELLED') return undefined;
      return applyToolCallToTask(cur, call);
    });
    if (!updated) return;

    store.appendEvent(taskId, {
      ts: call.ts,
      kind: 'tool_call',
      data: {
        tool: call.tool,
        ok: call.ok,
        durationMs: call.durationMs,
        sessionId: call.sessionId,
      },
    });
  } catch (err) {
    console.error(`[task-envelope] failed to record tool call for ${taskId}:`, err);
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| import { TaskStore, defaultTaskRootDir } from './store'; | ||
|
|
||
/** Shared store instance; `undefined` until first requested or after tests clear it. */
let storeSingleton: TaskStore | undefined;

/**
 * Return the process-wide task ledger store, creating it on first use.
 *
 * @returns The shared `TaskStore`. When none exists yet, one is constructed
 *   with `defaultTaskRootDir()` as its root directory and kept for later calls.
 */
export function getTaskStore(): TaskStore {
  if (storeSingleton) return storeSingleton;

  const created = new TaskStore({ rootDir: defaultTaskRootDir() });
  storeSingleton = created;
  return created;
}

/**
 * Test seam: replace the process-wide store.
 *
 * @param store - Instance that `getTaskStore` should return from now on, or
 *   `undefined` to make the next `getTaskStore` call build a fresh one.
 */
export function setTaskStoreForTests(store: TaskStore | undefined): void {
  storeSingleton = store;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The observation budget is currently blind to several read-only browser tools (`query_dom`, `inspect`, `page_content`, `wait_for`) because only four names are treated as observations here. In a loop that repeatedly uses one of those omitted tools, `observationStreak` never increases and `maxObservationStreak` will not trigger, so wandering-detection can stay `ok` even when the agent is stuck gathering observations. This also misclassifies those calls as actions, skewing the envelope counters returned by `oc_task_get`.
Useful? React with 👍 / 👎.