Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7cff774
Bound browser tasks so agents stop wandering
shaun0927 May 12, 2026
008706b
Let task get accept camelCase ids
shaun0927 May 13, 2026
3856455
Merge task ledger base into envelope PR
shaun0927 May 13, 2026
627829e
Enforce ownership on envelope mutations
shaun0927 May 13, 2026
02b3a7b
Gate envelope recording by owner
shaun0927 May 13, 2026
a4f05ee
Merge latest task ledger base for envelope PR
shaun0927 May 13, 2026
a9be0f0
Let host envelopes cancel terminally
shaun0927 May 13, 2026
347d250
Preserve exceeded navigation budgets
shaun0927 May 13, 2026
fdde3d0
Harden task envelope control tools
shaun0927 May 13, 2026
8a7f392
fix(harness): skip session init for task ledger tools (#1034)
shaun0927 May 13, 2026
d285e28
Merge develop into feat/1034-task-envelope
shaun0927 May 13, 2026
c1db641
Revalidate task envelope budget PR
shaun0927 May 13, 2026
1a224a6
Trigger task envelope CI on develop
shaun0927 May 13, 2026
a71c663
Describe task finish outcome as required
shaun0927 May 13, 2026
af6dd95
Merge task digest into task envelopes
shaun0927 May 13, 2026
a7a4dac
Expose task envelope update tools in capability surface
shaun0927 May 13, 2026
81bd126
Tighten task envelope accounting boundaries
shaun0927 May 13, 2026
4931234
Restrict envelope call accounting to browser tasks
shaun0927 May 13, 2026
e27b3dc
Require args for scheduled tool tasks
shaun0927 May 13, 2026
569865f
Enforce declared task envelope policies
shaun0927 May 13, 2026
e26da31
Keep task envelopes current with develop
shaun0927 May 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 243 additions & 0 deletions src/core/task-ledger/budget.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
import type {
BudgetStatus,
RecordedToolCall,
TaskBudgetDecision,
TaskCounters,
TaskEnvelopePolicy,
TaskMeta,
TaskPhase,
TaskRecentEvent,
} from './types';

// Fallback limits used by normalizeTaskPolicy when the supplied policy omits
// the matching field or gives a value that is not a positive finite number.
const DEFAULT_MAX_CONSECUTIVE_SAME_TOOL = 5;
const DEFAULT_MAX_OBSERVATION_STREAK = 6;
const DEFAULT_MAX_FAILURE_STREAK = 4;
const DEFAULT_MAX_SAME_URL_NAVIGATIONS = 3;
// Maximum number of entries kept in a task's recent_events list; only the
// newest entries survive each update.
const RECENT_EVENT_LIMIT = 10;

// Tool names that isObservationTool classifies as observations. A call to one
// of these increments observationCalls/observationStreak instead of actionCalls.
const OBSERVATION_TOOLS = new Set([
'read_page',
'find',
'tabs_context',
'tabs_list',
'tabs_get',
'inspect',
'page_screenshot',
'vision_find',
'oc_assert',
]);
Comment on lines +18 to +28
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Expand observation tool whitelist for budget streaks

The observation budget is currently blind to several read-only browser tools (query_dom, inspect, page_content, wait_for) because only four names are treated as observations here. In a loop that repeatedly uses one of those omitted tools, observationStreak never increases and maxObservationStreak will not trigger, so wandering-detection can stay ok even when the agent is stuck gathering observations. This also misclassifies those calls as actions, skewing the envelope counters returned by oc_task_get.

Useful? React with 👍 / 👎.


/**
 * Reports whether a tool call counts as an observation rather than an action.
 *
 * @param tool - Name of the tool that was invoked.
 * @param args - Arguments the tool was invoked with.
 * @returns `true` when `tool` is listed in `OBSERVATION_TOOLS`, or when it is
 *   the `computer` tool called with `action: 'screenshot'`.
 */
export function isObservationTool(tool: string, args: Record<string, unknown>): boolean {
  // `computer` is a multi-purpose tool; only its screenshot action is an observation.
  const isComputerScreenshot = tool === 'computer' && args.action === 'screenshot';
  return OBSERVATION_TOOLS.has(tool) || isComputerScreenshot;
}

/**
 * Coerces an arbitrary value into a task phase.
 *
 * @param value - Candidate phase, typically read from stored or user-supplied data.
 * @returns `value` itself when it is exactly one of `'explore'`, `'act'`,
 *   `'verify'`, `'recover'` or `'done'`; otherwise `'explore'`.
 */
export function normalizeTaskPhase(value: unknown): TaskPhase {
  if (
    value === 'explore' ||
    value === 'act' ||
    value === 'verify' ||
    value === 'recover' ||
    value === 'done'
  ) {
    return value;
  }
  // Anything unrecognised (including undefined) starts in the exploration phase.
  return 'explore';
}

/**
 * Builds a task envelope policy from untyped input.
 *
 * Non-object input is treated as an empty policy. Every numeric field is run
 * through `positiveInt`. `maxToolCalls`, `maxWallMs` and `checkpointEveryCalls`
 * stay `undefined` when absent or invalid, while the streak and same-URL
 * limits fall back to the module's `DEFAULT_*` constants.
 *
 * @param input - Raw policy value (any shape).
 * @returns The normalized policy. `allowedDomains` is kept only when the input
 *   field is an array, with non-string and empty-string entries removed.
 */
export function normalizeTaskPolicy(input: unknown): TaskEnvelopePolicy {
  const raw: Record<string, unknown> =
    typeof input === 'object' && input !== null ? (input as Record<string, unknown>) : {};

  const maxToolCalls = positiveInt(raw.maxToolCalls);
  const maxWallMs = positiveInt(raw.maxWallMs);
  const maxConsecutiveSameTool =
    positiveInt(raw.maxConsecutiveSameTool) ?? DEFAULT_MAX_CONSECUTIVE_SAME_TOOL;
  const maxObservationStreak =
    positiveInt(raw.maxObservationStreak) ?? DEFAULT_MAX_OBSERVATION_STREAK;
  const maxFailureStreak = positiveInt(raw.maxFailureStreak) ?? DEFAULT_MAX_FAILURE_STREAK;
  const maxSameUrlNavigations =
    positiveInt(raw.maxSameUrlNavigations) ?? DEFAULT_MAX_SAME_URL_NAVIGATIONS;

  const domainList = raw.allowedDomains;
  const allowedDomains = Array.isArray(domainList)
    ? domainList.filter((entry): entry is string => typeof entry === 'string' && entry.length > 0)
    : undefined;

  const checkpointEveryCalls = positiveInt(raw.checkpointEveryCalls);

  return {
    maxToolCalls,
    maxWallMs,
    maxConsecutiveSameTool,
    maxObservationStreak,
    maxFailureStreak,
    maxSameUrlNavigations,
    allowedDomains,
    checkpointEveryCalls,
  };
}

/**
 * Converts a value to a positive integer, if possible.
 *
 * @param value - Candidate value of any type.
 * @returns The value rounded down with `Math.floor` when it is a finite number
 *   whose floor is greater than zero; `undefined` for everything else
 *   (non-numbers, `NaN`, infinities, and numbers below 1).
 */
function positiveInt(value: unknown): number | undefined {
  if (typeof value === 'number' && Number.isFinite(value)) {
    const floored = Math.floor(value);
    if (floored > 0) return floored;
  }
  return undefined;
}

/**
 * Creates the counter set for a task that has recorded no tool calls yet.
 *
 * @returns A fresh object on every call: all numeric counters and streaks at
 *   zero and an empty per-URL navigation tally.
 */
export function initialCounters(): TaskCounters {
  const zeroed: TaskCounters = {
    // Lifetime totals.
    toolCalls: 0,
    actionCalls: 0,
    observationCalls: 0,
    failureCalls: 0,
    // Streaks.
    consecutiveSameTool: 0,
    observationStreak: 0,
    failureStreak: 0,
    // Navigation count per URL.
    sameUrlNavigations: {},
  };
  return zeroed;
}

/**
 * Folds one recorded tool call into a task's envelope state.
 *
 * Works on copies: the counters object and its `sameUrlNavigations` map are
 * cloned before being updated, and the result is a new object spread from
 * `meta`.
 *
 * @param meta - Current task metadata. Missing counters start from
 *   `initialCounters()` and the policy is re-normalized on every call.
 * @param call - The tool call to account for.
 * @returns Updated metadata carrying the normalized phase and policy, the new
 *   counters, the budget decision (`budget_status`, `budget_exceeded`,
 *   `recommended_next`), the trimmed `recent_events` list, and the
 *   `last_tool_name` / `last_activity_at` markers taken from `call`.
 */
export function applyToolCallToTask(meta: TaskMeta, call: RecordedToolCall): TaskMeta {
  const policy = normalizeTaskPolicy(meta.policy);

  // Start from a full default set so counters stored without some fields still work.
  const stored = meta.counters ?? initialCounters();
  const counters: TaskCounters = {
    ...initialCounters(),
    ...stored,
    sameUrlNavigations: { ...(stored.sameUrlNavigations ?? {}) },
  };

  const repeatsPreviousTool = meta.last_tool_name === call.tool;
  const isObservation = isObservationTool(call.tool, call.args);
  const failed = !call.ok;

  // Lifetime totals.
  counters.toolCalls += 1;
  if (isObservation) {
    counters.observationCalls += 1;
  } else {
    counters.actionCalls += 1;
  }
  if (failed) {
    counters.failureCalls += 1;
  }

  // Streaks either extend or reset. A tool change restarts the same-tool run at 1.
  counters.consecutiveSameTool = repeatsPreviousTool ? counters.consecutiveSameTool + 1 : 1;
  counters.observationStreak = isObservation ? counters.observationStreak + 1 : 0;
  counters.failureStreak = failed ? counters.failureStreak + 1 : 0;

  // Only `navigate` calls feed the per-URL revisit tally.
  if (call.tool === 'navigate') {
    const navUrl = extractUrl(call.args);
    if (navUrl) {
      counters.sameUrlNavigations[navUrl] = (counters.sameUrlNavigations[navUrl] ?? 0) + 1;
    }
  }

  const decision = evaluateBudget(meta, counters, policy, call);

  const latestEvent: TaskRecentEvent = {
    ts: call.ts,
    tool: call.tool,
    ok: call.ok,
    summary: summarizeCall(call, decision),
  };
  const eventTrail = [...(meta.recent_events ?? [])];
  eventTrail.push(latestEvent);
  const recent_events = eventTrail.slice(-RECENT_EVENT_LIMIT);

  // An empty exceeded list is stored as undefined rather than [].
  const budget_exceeded = decision.exceeded.length > 0 ? decision.exceeded : undefined;

  return {
    ...meta,
    phase: normalizeTaskPhase(meta.phase),
    policy,
    counters,
    budget_status: decision.status,
    budget_exceeded,
    recommended_next: decision.recommended_next,
    recent_events,
    last_tool_name: call.tool,
    last_activity_at: call.ts,
  };
}

/**
 * Evaluates a task's counters against its policy and produces a budget decision.
 *
 * @param meta - Task metadata; only `created_at` is read here, for the wall-clock check.
 * @param counters - Counters that already include the call being evaluated.
 * @param policy - Normalized policy whose limits are enforced.
 * @param call - The tool call being evaluated; its args supply the URL for the domain check.
 * @returns The policy keys that were exceeded, the keys at warning level, the
 *   resulting status (`'exceeded'` takes precedence over `'warning'`, which
 *   takes precedence over `'ok'`), and a recommended next step for non-ok statuses.
 */
function evaluateBudget(
meta: TaskMeta,
counters: TaskCounters,
policy: TaskEnvelopePolicy,
call: RecordedToolCall,
): TaskBudgetDecision {
const exceeded: string[] = [];
const warnings: string[] = [];

// Each check appends its policy key to `exceeded` or `warnings`; none of them throws.
checkLimit('maxToolCalls', counters.toolCalls, policy.maxToolCalls, exceeded, warnings);
checkLimit('maxConsecutiveSameTool', counters.consecutiveSameTool, policy.maxConsecutiveSameTool, exceeded, warnings);
checkLimit('maxObservationStreak', counters.observationStreak, policy.maxObservationStreak, exceeded, warnings);
checkLimit('maxFailureStreak', counters.failureStreak, policy.maxFailureStreak, exceeded, warnings);
checkSameUrlNavigationLimit(counters.sameUrlNavigations, policy.maxSameUrlNavigations, exceeded, warnings);
// The URL comes from the args of whichever tool ran, not only from `navigate` calls.
checkAllowedDomain(extractUrl(call.args), policy.allowedDomains, exceeded);
checkCheckpointCadence(counters.toolCalls, policy.checkpointEveryCalls, warnings);
// Wall-clock budget: time elapsed since meta.created_at.
// NOTE(review): assumes created_at is an epoch-milliseconds number — confirm against TaskMeta.
if (policy.maxWallMs) {
checkLimit('maxWallMs', Date.now() - meta.created_at, policy.maxWallMs, exceeded, warnings);
Comment on lines +143 to +151
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Enforce all declared task policy constraints

normalizeTaskPolicy stores allowedDomains and checkpointEveryCalls, and oc_task_start advertises both as part of the deterministic policy, but evaluateBudget never consults either field. As a result, a host can set those limits and still get budget_status: "ok" while navigating outside the allowed domain set or missing checkpoint cadence, which silently disables part of the policy contract for browser-task envelopes.

Useful? React with 👍 / 👎.

}

// Any exceeded limit outranks warnings; warnings outrank a clean result.
const status: BudgetStatus = exceeded.length > 0 ? 'exceeded' : warnings.length > 0 ? 'warning' : 'ok';
return {
status,
exceeded,
warnings,
recommended_next: status === 'exceeded'
? 'change_strategy_or_verify'
: status === 'warning'
? 'checkpoint_or_verify'
: undefined,
};
}

/**
 * Checks the per-URL navigation tallies against the revisit limit.
 *
 * Pushes `'maxSameUrlNavigations'` at most once: onto `exceeded` when any URL
 * has been navigated to more than `limit` times, otherwise onto `warnings`
 * when any URL has reached 75% of the limit (rounded up).
 *
 * @param sameUrlNavigations - Navigation count keyed by URL.
 * @param limit - Maximum navigations allowed per URL; a falsy value disables the check.
 * @param exceeded - Output list of exceeded policy keys (mutated).
 * @param warnings - Output list of warning-level policy keys (mutated).
 */
function checkSameUrlNavigationLimit(
  sameUrlNavigations: Record<string, number>,
  limit: number | undefined,
  exceeded: string[],
  warnings: string[],
): void {
  if (!limit) return;

  const tallies = Object.values(sameUrlNavigations);

  // An over-limit URL suppresses any warning for the other URLs.
  if (tallies.some((count) => count > limit)) {
    exceeded.push('maxSameUrlNavigations');
    return;
  }

  const warnAt = Math.ceil(limit * 0.75);
  if (tallies.some((count) => count >= warnAt)) {
    warnings.push('maxSameUrlNavigations');
  }
}

/**
 * Flags a URL whose host falls outside the policy's allowed domains.
 *
 * The check is skipped when there is no URL or no non-empty allow-list. A URL
 * that `new URL()` cannot parse is treated as a violation. Each allow-list
 * entry is trimmed, lower-cased and stripped of one leading dot; the host
 * passes when it equals an entry or is a subdomain of one.
 *
 * @param url - URL taken from the tool call, if any.
 * @param allowedDomains - Allowed domain names from the policy, if any.
 * @param exceeded - Output list; receives `'allowedDomains'` on violation (mutated).
 */
function checkAllowedDomain(
  url: string | undefined,
  allowedDomains: string[] | undefined,
  exceeded: string[],
): void {
  if (!url) return;
  if (!allowedDomains || allowedDomains.length === 0) return;

  let host: string;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    // Unparseable URLs cannot be shown to be inside the allow-list.
    exceeded.push('allowedDomains');
    return;
  }

  for (const entry of allowedDomains) {
    const domain = entry.trim().toLowerCase().replace(/^\./, '');
    if (domain.length === 0) continue;
    // The dot prefix stops "notexample.com" from matching "example.com".
    if (host === domain || host.endsWith(`.${domain}`)) return;
  }

  exceeded.push('allowedDomains');
}

/**
 * Raises a checkpoint reminder at every multiple of the configured cadence.
 *
 * @param toolCalls - Total tool calls recorded so far.
 * @param checkpointEveryCalls - Cadence from the policy; a falsy value disables the check.
 * @param warnings - Output list; receives `'checkpointEveryCalls'` when
 *   `toolCalls` is a non-zero multiple of the cadence (mutated).
 */
function checkCheckpointCadence(
  toolCalls: number,
  checkpointEveryCalls: number | undefined,
  warnings: string[],
): void {
  const cadenceHit =
    checkpointEveryCalls && toolCalls !== 0 && toolCalls % checkpointEveryCalls === 0;
  if (cadenceHit) {
    warnings.push('checkpointEveryCalls');
  }
}

/**
 * Compares one measured value with its policy limit.
 *
 * @param key - Policy key to report.
 * @param value - Current measurement.
 * @param limit - Configured maximum; a falsy value disables the check.
 * @param exceeded - Receives `key` when `value` is strictly above `limit` (mutated).
 * @param warnings - Receives `key` when `value` is within the limit but has
 *   reached 75% of it, rounded up (mutated).
 */
function checkLimit(
  key: string,
  value: number,
  limit: number | undefined,
  exceeded: string[],
  warnings: string[],
): void {
  if (limit) {
    // Pick the list this value belongs to, if any.
    const bucket =
      value > limit ? exceeded : value >= Math.ceil(limit * 0.75) ? warnings : undefined;
    bucket?.push(key);
  }
}

/**
 * Extracts the target URL from a tool call's arguments.
 *
 * Looks at `args.url` first, then `args.href`, and returns the first one that
 * is a non-empty string. A present but unusable `url` (empty string or a
 * non-string value) no longer hides a valid `href`, so such calls still reach
 * the same-URL navigation tally and the allowed-domain check.
 *
 * @param args - Arguments the tool was invoked with.
 * @returns The first non-empty string among `url` and `href`, or `undefined`
 *   when neither qualifies.
 */
function extractUrl(args: Record<string, unknown>): string | undefined {
  for (const candidate of [args.url, args.href]) {
    if (typeof candidate === 'string' && candidate.length > 0) return candidate;
  }
  return undefined;
}


/**
 * Renders a one-line summary of a tool call for the recent-events list.
 *
 * @param call - The recorded call; `tool`, `ok` and `durationMs` are used.
 * @param decision - Budget decision made for this call.
 * @returns `"<tool> <ok|error> durationMs=<n>"`, followed by a
 *   `budget_exceeded=` list when any limit was exceeded, or otherwise a
 *   `budget_warning=` list when there are warnings.
 */
function summarizeCall(call: RecordedToolCall, decision: TaskBudgetDecision): string {
  const outcome = call.ok ? 'ok' : 'error';

  // Exceeded limits take priority; warnings are reported only when nothing was exceeded.
  let budgetSuffix = '';
  if (decision.exceeded.length > 0) {
    budgetSuffix = ` budget_exceeded=${decision.exceeded.join(',')}`;
  } else if (decision.warnings.length > 0) {
    budgetSuffix = ` budget_warning=${decision.warnings.join(',')}`;
  }

  return `${call.tool} ${outcome} durationMs=${call.durationMs}${budgetSuffix}`;
}
43 changes: 43 additions & 0 deletions src/core/task-ledger/envelope.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import type { TaskStore } from './store';
import type { RecordedToolCall } from './types';
import { applyToolCallToTask } from './budget';

/**
 * Reads a task id from tool arguments.
 *
 * Accepts the camelCase `taskId` key, falling back to snake_case `task_id`
 * only when `taskId` is null or undefined.
 *
 * @param args - Arguments the tool was invoked with.
 * @returns The id when it is a string of exactly 16 lowercase hex characters;
 *   otherwise `undefined`.
 */
export function extractTaskId(args: Record<string, unknown>): string | undefined {
  const candidate = args.taskId ?? args.task_id;
  if (typeof candidate !== 'string') return undefined;
  return /^[0-9a-f]{16}$/.test(candidate) ? candidate : undefined;
}

/**
 * Records a tool call against a task's envelope, if the call is eligible.
 *
 * Returns without doing anything when: no task id was given; the task has no
 * stored metadata; the task has an owner that does not match the caller; the
 * task is not a `browser_task`; or the task is already COMPLETED, FAILED or
 * CANCELLED. Otherwise the task is updated through `applyToolCallToTask` and a
 * `tool_call` event is appended.
 *
 * Errors thrown by the update/append step are logged with `console.error` and
 * not rethrown. The initial `readMetaSync` call sits outside that `try`.
 *
 * @param store - Task store holding the metadata and event log.
 * @param taskId - Id of the task to record against, or `undefined` for none.
 * @param call - The tool call, including caller identity fields used for the ownership check.
 */
export async function recordTaskToolCall(
store: TaskStore,
taskId: string | undefined,
call: RecordedToolCall,
): Promise<void> {
if (!taskId) return;
const meta = store.readMetaSync(taskId);
if (!meta) return;
Comment on lines +16 to +17
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enforce owner checks before recording envelope tool calls

recordTaskToolCall updates any task whose 16-hex id appears in tool args, but it never verifies that the caller owns that task. Because MCPServer invokes this path for every non-oc_task_* tool call, a different session/tenant that knows a valid task_id can inject tool-call events and mutate another task’s counters/budget state (for example, driving budget_exceeded on someone else’s envelope). Add the same canAccessTask ownership gate used by task mutators before applying applyToolCallToTask/appending events.

Useful? React with 👍 / 👎.

// Ownership gate: tasks without an owner are not restricted. For owned tasks
// the session must match, and the tenant is compared only for api-key / jwt principals.
if (meta.owner) {
if (meta.owner.session_id !== call.sessionId) return;
if ((call.principalMode === 'api-key' || call.principalMode === 'jwt') && meta.owner.tenant_id !== call.tenantId) return;
}
// Envelope accounting applies to browser tasks only.
if (meta.kind !== 'browser_task') return;
if (meta.status === 'COMPLETED' || meta.status === 'FAILED' || meta.status === 'CANCELLED') return;
try {
const updated = await store.update(taskId, (cur) => {
// Re-check terminal status on the value handed to the updater, which may differ from the earlier read.
// Returning undefined presumably tells store.update to skip the write — confirm against TaskStore.
// NOTE(review): only status is re-checked here; kind and owner are not re-validated on `cur`.
if (cur.status === 'COMPLETED' || cur.status === 'FAILED' || cur.status === 'CANCELLED') return undefined;
return applyToolCallToTask(cur, call);
Comment on lines +23 to +27
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restrict envelope recording to browser_task rows

recordTaskToolCall currently updates any non-terminal task that passes ownership checks, without verifying meta.kind. If a caller includes a running non-envelope task id (for example, a crawl task) in taskId/task_id on a browser tool call, this path will still mutate that task’s counters, budget_status, and recent_events, which pollutes async task state and can surface misleading budget guidance in oc_task_get. Add a browser_task kind guard (including inside the update closure) before calling applyToolCallToTask.

Useful? React with 👍 / 👎.

});
// A falsy result means nothing was written, so no event is logged either.
if (!updated) return;
store.appendEvent(taskId, {
ts: call.ts,
kind: 'tool_call',
data: {
tool: call.tool,
ok: call.ok,
durationMs: call.durationMs,
sessionId: call.sessionId,
},
});
} catch (err) {
// Best-effort accounting: a failure here is logged and not propagated to the caller.
console.error(`[task-envelope] failed to record tool call for ${taskId}:`, err);
}
}
11 changes: 11 additions & 0 deletions src/core/task-ledger/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
export type {
TaskEvent,
TaskKind,
TaskPhase,
BudgetStatus,
TaskEnvelopePolicy,
TaskCounters,
TaskRecentEvent,
TaskBudgetDecision,
RecordedToolCall,
TaskListFilter,
TaskMeta,
TaskOwner,
Expand Down Expand Up @@ -36,3 +43,7 @@ export {
waitForTerminal,
TaskWaitTimeoutError,
} from './runner';

export { getTaskStore, setTaskStoreForTests } from './singleton';
export * from './budget';
export * from './envelope';
16 changes: 16 additions & 0 deletions src/core/task-ledger/singleton.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { TaskStore, defaultTaskRootDir } from './store';

// Lazily created store shared by the whole process; undefined until first use.
let storeSingleton: TaskStore | undefined;

/**
 * Resolves the process-wide task ledger store.
 *
 * @returns The shared store. The first call (or the first call after the
 *   singleton was cleared) constructs one rooted at `defaultTaskRootDir()`.
 */
export function getTaskStore(): TaskStore {
  if (storeSingleton) return storeSingleton;
  const created = new TaskStore({ rootDir: defaultTaskRootDir() });
  storeSingleton = created;
  return created;
}

/**
 * Test seam — overrides the process-wide store with a custom instance.
 *
 * @param store - Store that `getTaskStore` should return from now on. Passing
 *   `undefined` clears the singleton, so the next `getTaskStore` call builds a
 *   fresh default store.
 */
export function setTaskStoreForTests(store: TaskStore | undefined): void {
storeSingleton = store;
}
Loading
Loading