diff --git a/src/core/task-ledger/budget.ts b/src/core/task-ledger/budget.ts new file mode 100644 index 000000000..093fca605 --- /dev/null +++ b/src/core/task-ledger/budget.ts @@ -0,0 +1,243 @@ +import type { + BudgetStatus, + RecordedToolCall, + TaskBudgetDecision, + TaskCounters, + TaskEnvelopePolicy, + TaskMeta, + TaskPhase, + TaskRecentEvent, +} from './types'; + +const DEFAULT_MAX_CONSECUTIVE_SAME_TOOL = 5; +const DEFAULT_MAX_OBSERVATION_STREAK = 6; +const DEFAULT_MAX_FAILURE_STREAK = 4; +const DEFAULT_MAX_SAME_URL_NAVIGATIONS = 3; +const RECENT_EVENT_LIMIT = 10; + +const OBSERVATION_TOOLS = new Set([ + 'read_page', + 'find', + 'tabs_context', + 'tabs_list', + 'tabs_get', + 'inspect', + 'page_screenshot', + 'vision_find', + 'oc_assert', +]); + +export function isObservationTool(tool: string, args: Record): boolean { + if (OBSERVATION_TOOLS.has(tool)) return true; + return tool === 'computer' && args.action === 'screenshot'; +} + +export function normalizeTaskPhase(value: unknown): TaskPhase { + switch (value) { + case 'explore': + case 'act': + case 'verify': + case 'recover': + case 'done': + return value; + default: + return 'explore'; + } +} + +export function normalizeTaskPolicy(input: unknown): TaskEnvelopePolicy { + const raw = (input && typeof input === 'object') ? input as Record : {}; + return { + maxToolCalls: positiveInt(raw.maxToolCalls), + maxWallMs: positiveInt(raw.maxWallMs), + maxConsecutiveSameTool: positiveInt(raw.maxConsecutiveSameTool) ?? DEFAULT_MAX_CONSECUTIVE_SAME_TOOL, + maxObservationStreak: positiveInt(raw.maxObservationStreak) ?? DEFAULT_MAX_OBSERVATION_STREAK, + maxFailureStreak: positiveInt(raw.maxFailureStreak) ?? DEFAULT_MAX_FAILURE_STREAK, + maxSameUrlNavigations: positiveInt(raw.maxSameUrlNavigations) ?? DEFAULT_MAX_SAME_URL_NAVIGATIONS, + allowedDomains: Array.isArray(raw.allowedDomains) + ? raw.allowedDomains.filter((d): d is string => typeof d === 'string' && d.length > 0) + : undefined, + checkpointEveryCalls: positiveInt(raw.checkpointEveryCalls), + }; +} + +function positiveInt(value: unknown): number | undefined { + if (typeof value !== 'number' || !Number.isFinite(value)) return undefined; + const n = Math.floor(value); + return n > 0 ? n : undefined; +} + +export function initialCounters(): TaskCounters { + return { + toolCalls: 0, + actionCalls: 0, + observationCalls: 0, + failureCalls: 0, + consecutiveSameTool: 0, + observationStreak: 0, + failureStreak: 0, + sameUrlNavigations: {}, + }; +} + +export function applyToolCallToTask(meta: TaskMeta, call: RecordedToolCall): TaskMeta { + const policy = normalizeTaskPolicy(meta.policy); + const current = meta.counters ?? initialCounters(); + const counters: TaskCounters = { + ...initialCounters(), + ...current, + sameUrlNavigations: { ...(current.sameUrlNavigations ?? {}) }, + }; + const previousTool = meta.last_tool_name; + const observation = isObservationTool(call.tool, call.args); + const isFailure = !call.ok; + + counters.toolCalls += 1; + if (observation) counters.observationCalls += 1; + else counters.actionCalls += 1; + if (isFailure) counters.failureCalls += 1; + + counters.consecutiveSameTool = previousTool === call.tool + ? counters.consecutiveSameTool + 1 + : 1; + counters.observationStreak = observation ? counters.observationStreak + 1 : 0; + counters.failureStreak = isFailure ? counters.failureStreak + 1 : 0; + + const navUrl = call.tool === 'navigate' ? extractUrl(call.args) : undefined; + if (navUrl) { + counters.sameUrlNavigations[navUrl] = (counters.sameUrlNavigations[navUrl] ?? 0) + 1; + } + + const decision = evaluateBudget(meta, counters, policy, call); + const recentEvent: TaskRecentEvent = { + ts: call.ts, + tool: call.tool, + ok: call.ok, + summary: summarizeCall(call, decision), + }; + const recent_events = [...(meta.recent_events ?? []), recentEvent].slice(-RECENT_EVENT_LIMIT); + + return { + ...meta, + phase: normalizeTaskPhase(meta.phase), + policy, + counters, + budget_status: decision.status, + budget_exceeded: decision.exceeded.length > 0 ? decision.exceeded : undefined, + recommended_next: decision.recommended_next, + recent_events, + last_tool_name: call.tool, + last_activity_at: call.ts, + }; +} + +function evaluateBudget( + meta: TaskMeta, + counters: TaskCounters, + policy: TaskEnvelopePolicy, + call: RecordedToolCall, +): TaskBudgetDecision { + const exceeded: string[] = []; + const warnings: string[] = []; + + checkLimit('maxToolCalls', counters.toolCalls, policy.maxToolCalls, exceeded, warnings); + checkLimit('maxConsecutiveSameTool', counters.consecutiveSameTool, policy.maxConsecutiveSameTool, exceeded, warnings); + checkLimit('maxObservationStreak', counters.observationStreak, policy.maxObservationStreak, exceeded, warnings); + checkLimit('maxFailureStreak', counters.failureStreak, policy.maxFailureStreak, exceeded, warnings); + checkSameUrlNavigationLimit(counters.sameUrlNavigations, policy.maxSameUrlNavigations, exceeded, warnings); + checkAllowedDomain(extractUrl(call.args), policy.allowedDomains, exceeded); + checkCheckpointCadence(counters.toolCalls, policy.checkpointEveryCalls, warnings); + if (policy.maxWallMs) { + checkLimit('maxWallMs', Date.now() - meta.created_at, policy.maxWallMs, exceeded, warnings); + } + + const status: BudgetStatus = exceeded.length > 0 ? 'exceeded' : warnings.length > 0 ? 'warning' : 'ok'; + return { + status, + exceeded, + warnings, + recommended_next: status === 'exceeded' + ? 'change_strategy_or_verify' + : status === 'warning' + ? 'checkpoint_or_verify' + : undefined, + }; +} + +function checkSameUrlNavigationLimit( + sameUrlNavigations: Record, + limit: number | undefined, + exceeded: string[], + warnings: string[], +): void { + if (!limit) return; + let atWarning = false; + for (const count of Object.values(sameUrlNavigations)) { + if (count > limit) { + exceeded.push('maxSameUrlNavigations'); + return; + } + if (count >= Math.ceil(limit * 0.75)) atWarning = true; + } + if (atWarning) warnings.push('maxSameUrlNavigations'); +} + +function checkAllowedDomain( + url: string | undefined, + allowedDomains: string[] | undefined, + exceeded: string[], +): void { + if (!url || !allowedDomains || allowedDomains.length === 0) return; + let host: string; + try { + host = new URL(url).hostname.toLowerCase(); + } catch { + exceeded.push('allowedDomains'); + return; + } + const allowed = allowedDomains.some((domain) => { + const normalized = domain.trim().toLowerCase().replace(/^\./, ''); + return normalized.length > 0 && (host === normalized || host.endsWith(`.${normalized}`)); + }); + if (!allowed) exceeded.push('allowedDomains'); +} + +function checkCheckpointCadence( + toolCalls: number, + checkpointEveryCalls: number | undefined, + warnings: string[], +): void { + if (!checkpointEveryCalls || toolCalls === 0) return; + if (toolCalls % checkpointEveryCalls === 0) warnings.push('checkpointEveryCalls'); +} + +function checkLimit( + key: string, + value: number, + limit: number | undefined, + exceeded: string[], + warnings: string[], +): void { + if (!limit) return; + if (value > limit) { + exceeded.push(key); + } else if (value >= Math.ceil(limit * 0.75)) { + warnings.push(key); + } +} + +function extractUrl(args: Record): string | undefined { + const value = args.url ?? args.href; + return typeof value === 'string' && value.length > 0 ? value : undefined; +} + + +function summarizeCall(call: RecordedToolCall, decision: TaskBudgetDecision): string { + const status = call.ok ? 'ok' : 'error'; + const budget = decision.exceeded.length > 0 + ? ` budget_exceeded=${decision.exceeded.join(',')}` + : decision.warnings.length > 0 + ? ` budget_warning=${decision.warnings.join(',')}` + : ''; + return `${call.tool} ${status} durationMs=${call.durationMs}${budget}`; +} diff --git a/src/core/task-ledger/envelope.ts b/src/core/task-ledger/envelope.ts new file mode 100644 index 000000000..43dc11a41 --- /dev/null +++ b/src/core/task-ledger/envelope.ts @@ -0,0 +1,43 @@ +import type { TaskStore } from './store'; +import type { RecordedToolCall } from './types'; +import { applyToolCallToTask } from './budget'; + +export function extractTaskId(args: Record): string | undefined { + const taskId = args.taskId ?? args.task_id; + return typeof taskId === 'string' && /^[0-9a-f]{16}$/.test(taskId) ? taskId : undefined; +} + +export async function recordTaskToolCall( + store: TaskStore, + taskId: string | undefined, + call: RecordedToolCall, +): Promise { + if (!taskId) return; + const meta = store.readMetaSync(taskId); + if (!meta) return; + if (meta.owner) { + if (meta.owner.session_id !== call.sessionId) return; + if ((call.principalMode === 'api-key' || call.principalMode === 'jwt') && meta.owner.tenant_id !== call.tenantId) return; + } + if (meta.kind !== 'browser_task') return; + if (meta.status === 'COMPLETED' || meta.status === 'FAILED' || meta.status === 'CANCELLED') return; + try { + const updated = await store.update(taskId, (cur) => { + if (cur.status === 'COMPLETED' || cur.status === 'FAILED' || cur.status === 'CANCELLED') return undefined; + return applyToolCallToTask(cur, call); + }); + if (!updated) return; + store.appendEvent(taskId, { + ts: call.ts, + kind: 'tool_call', + data: { + tool: call.tool, + ok: call.ok, + durationMs: call.durationMs, + sessionId: call.sessionId, + }, + }); + } catch (err) { + console.error(`[task-envelope] failed to record tool call for ${taskId}:`, err); + } +} diff --git a/src/core/task-ledger/index.ts b/src/core/task-ledger/index.ts index d95ff6592..f6fd0a9b7 100644 --- a/src/core/task-ledger/index.ts +++ b/src/core/task-ledger/index.ts @@ -6,6 +6,13 @@ export type { TaskEvent, TaskKind, + TaskPhase, + BudgetStatus, + TaskEnvelopePolicy, + TaskCounters, + TaskRecentEvent, + TaskBudgetDecision, + RecordedToolCall, TaskListFilter, TaskMeta, TaskOwner, @@ -36,3 +43,7 @@ export { waitForTerminal, TaskWaitTimeoutError, } from './runner'; + +export { getTaskStore, setTaskStoreForTests } from './singleton'; +export * from './budget'; +export * from './envelope'; diff --git a/src/core/task-ledger/singleton.ts b/src/core/task-ledger/singleton.ts new file mode 100644 index 000000000..858cf1675 --- /dev/null +++ b/src/core/task-ledger/singleton.ts @@ -0,0 +1,16 @@ +import { TaskStore, defaultTaskRootDir } from './store'; + +let storeSingleton: TaskStore | undefined; + +/** Resolve the process-wide task ledger store. */ +export function getTaskStore(): TaskStore { + if (!storeSingleton) { + storeSingleton = new TaskStore({ rootDir: defaultTaskRootDir() }); + } + return storeSingleton; +} + +/** Test seam — override the process-wide store with a custom instance. */ +export function setTaskStoreForTests(store: TaskStore | undefined): void { + storeSingleton = store; +} diff --git a/src/core/task-ledger/types.ts b/src/core/task-ledger/types.ts index 7c4005216..f6aa37434 100644 --- a/src/core/task-ledger/types.ts +++ b/src/core/task-ledger/types.ts @@ -35,6 +35,7 @@ export type TaskStatus = * server's tool registry is the source of truth. */ export type TaskKind = + | 'browser_task' | 'crawl' | 'crawl_sitemap' | 'recording' @@ -47,6 +48,57 @@ export interface TaskError { code?: string; } +export type TaskPhase = 'explore' | 'act' | 'verify' | 'recover' | 'done'; +export type BudgetStatus = 'ok' | 'warning' | 'exceeded'; + +export interface TaskEnvelopePolicy { + maxToolCalls?: number; + maxWallMs?: number; + maxConsecutiveSameTool?: number; + maxObservationStreak?: number; + maxFailureStreak?: number; + maxSameUrlNavigations?: number; + allowedDomains?: string[]; + checkpointEveryCalls?: number; +} + +export interface TaskCounters { + toolCalls: number; + actionCalls: number; + observationCalls: number; + failureCalls: number; + consecutiveSameTool: number; + observationStreak: number; + failureStreak: number; + sameUrlNavigations: Record; +} + +export interface TaskRecentEvent { + ts: number; + tool: string; + ok: boolean; + summary: string; +} + +export interface TaskBudgetDecision { + status: BudgetStatus; + exceeded: string[]; + warnings: string[]; + recommended_next?: string; +} + +export interface RecordedToolCall { + ts: number; + tool: string; + sessionId: string; + tenantId?: string; + keyId?: string; + principalMode?: string; + args: Record; + durationMs: number; + ok: boolean; +} + export interface TaskOwner { session_id: string; tenant_id?: string; @@ -74,8 +126,22 @@ export interface TaskMeta { result_path?: string; error?: TaskError; cancel_requested_at?: number; + /** Optional host-declared objective for task-level browser harness envelopes (#1034). */ + objective?: string; + /** Current host-declared phase. OpenChrome records facts; the host decides phase transitions. */ + phase?: TaskPhase; + /** Deterministic task budget / wandering policy. */ + policy?: TaskEnvelopePolicy; + counters?: TaskCounters; + budget_status?: BudgetStatus; + budget_exceeded?: string[]; + recommended_next?: string; + recent_events?: TaskRecentEvent[]; + last_tool_name?: string; + last_activity_at?: number; } + /** * A single event appended to the per-task `events.jsonl` file. Events * are advisory — meta.json is the source of truth for status. The @@ -92,7 +158,9 @@ export interface TaskEvent { | 'completed' | 'failed' | 'cancelled' - | 'cancel_requested'; + | 'cancel_requested' + | 'tool_call' + | 'budget'; data?: Record; } diff --git a/src/mcp-server.ts b/src/mcp-server.ts index e465ae5d1..ab7fc301d 100644 --- a/src/mcp-server.ts +++ b/src/mcp-server.ts @@ -59,6 +59,7 @@ import { OpenChromeConnectionError } from './errors/connection'; import { getTaskJournal } from './journal/task-journal'; import { getDashboardState } from './desktop/dashboard-state'; import { getActionRecorder } from './recording/action-recorder'; +import { extractTaskId, getTaskStore, recordTaskToolCall } from './core/task-ledger'; import { substituteSecrets, redactSecrets, @@ -203,8 +204,35 @@ export function isConnectionError(error: unknown): boolean { } /** Lifecycle tools that must work even when the CDP connection is broken (e.g., after - * sleep/wake). Skip session initialization so recovery handlers can always run. */ -const SKIP_SESSION_INIT_TOOLS = new Set(['oc_stop', 'oc_reap_orphans', 'oc_profile_status', 'oc_session_snapshot', 'oc_session_resume', 'oc_journal', 'oc_run_start', 'oc_run_status', 'oc_run_events', 'oc_run_finish', 'oc_progress_status']); + * sleep/wake). Skip session initialization so recovery handlers can always run. + * + * Task ledger tools (`oc_task_*`) are also listed here because they are pure + * ledger operations (or, for `oc_task_start`, just persist a meta row before + * background work begins). They never touch the browser themselves, so they + * must not trigger Chrome auto-launch on malformed input (#1034). + * + * Run-harness tools (`oc_run_*`) and `oc_progress_status` are pure read / + * bookkeeping calls landed on develop after this PR branched — also skip. */ +const SKIP_SESSION_INIT_TOOLS = new Set([ + 'oc_stop', + 'oc_reap_orphans', + 'oc_profile_status', + 'oc_session_snapshot', + 'oc_session_resume', + 'oc_journal', + 'oc_task_start', + 'oc_task_list', + 'oc_task_get', + 'oc_task_cancel', + 'oc_task_wait', + 'oc_task_update', + 'oc_task_finish', + 'oc_run_start', + 'oc_run_status', + 'oc_run_events', + 'oc_run_finish', + 'oc_progress_status', +]); const RUN_HARNESS_LONG_TASK_TOOLS = new Set([ 'execute_plan', @@ -364,6 +392,30 @@ export interface MCPServerOptions { capabilityFilter?: Set; } + +const TASK_ENVELOPE_BROWSER_TOOLS = new Set([ + 'navigate', + 'read_page', + 'find', + 'interact', + 'act', + 'fill_form', + 'form_input', + 'computer', + 'page_screenshot', + 'tabs_context', + 'tabs_list', + 'tabs_get', + 'inspect', + 'vision_find', + 'oc_assert', +]); + +function taskEnvelopeIdForTool(toolName: string, args: Record): string | undefined { + if (!TASK_ENVELOPE_BROWSER_TOOLS.has(toolName)) return undefined; + return extractTaskId(args); +} + export class MCPServer { private tools: Map = new Map(); private resources: Map = new Map(); @@ -1665,6 +1717,8 @@ export class MCPServer { // CDPClient may not be initialized — proceed with normal flow } + const taskEnvelopeId = taskEnvelopeIdForTool(toolName, toolArgs); + // Start activity tracking const callId = this.activityTracker!.startCall(toolName, sessionId || 'default', telemetryToolArgs, requestId); getDashboardState().recordToolStart(sessionId || 'default', toolName, telemetryToolArgs, callId); @@ -2067,6 +2121,17 @@ export class MCPServer { }; } + await recordTaskToolCall(getTaskStore(), taskEnvelopeId, { + ts: Date.now(), + tool: toolName, + sessionId, + tenantId: principal?.tenantId, + keyId: principal?.keyId, + principalMode: principal?.mode, + args: toolArgs, + durationMs: Date.now() - toolStartTime, + ok: result.isError !== true, + }); if (runHarnessId && !toolName.startsWith('oc_run_')) { try { @@ -2306,6 +2371,18 @@ export class MCPServer { } + await recordTaskToolCall(getTaskStore(), taskEnvelopeId, { + ts: Date.now(), + tool: toolName, + sessionId, + tenantId: principal?.tenantId, + keyId: principal?.keyId, + principalMode: principal?.mode, + args: toolArgs, + durationMs: Date.now() - toolStartTime, + ok: !errorIsError, + }); + // Secrets redaction (#834) — see success path. Error messages can // include the literal value (e.g. "type ... failed for value X"). const finalErrResult = redactSecrets(errResult); diff --git a/src/tools/__tests__/__snapshots__/tools-list.v1.11.snap.json b/src/tools/__tests__/__snapshots__/tools-list.v1.11.snap.json index f6111ed07..c4271c5bb 100644 --- a/src/tools/__tests__/__snapshots__/tools-list.v1.11.snap.json +++ b/src/tools/__tests__/__snapshots__/tools-list.v1.11.snap.json @@ -207,6 +207,9 @@ { "name": "oc_task_cancel" }, + { + "name": "oc_task_finish" + }, { "name": "oc_task_get" }, @@ -237,6 +240,9 @@ { "name": "oc_task_start" }, + { + "name": "oc_task_update" + }, { "name": "oc_task_wait" }, diff --git a/src/tools/index.ts b/src/tools/index.ts index 3e6e2d30a..de75717de 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -129,6 +129,8 @@ import { registerOcTaskListTool } from './oc-task-list'; import { registerOcTaskGetTool } from './oc-task-get'; import { registerOcTaskCancelTool } from './oc-task-cancel'; import { registerOcTaskWaitTool } from './oc-task-wait'; +import { registerOcTaskUpdateTool } from './oc-task-update'; +import { registerOcTaskFinishTool } from './oc-task-finish'; // Doctor report tool (#898) — read cached `openchrome doctor` output import { registerOcDoctorReportTool } from './oc-doctor-report'; // Performance insights two-step API (#846) @@ -298,6 +300,7 @@ export const TOOL_CAPABILITY_MAP: Record = { oc_run_start: 'core', oc_run_status: 'core', oc_task_cancel: 'core', + oc_task_finish: 'core', oc_task_get: 'core', oc_task_list: 'core', oc_task_run_checkpoint: 'core', @@ -308,6 +311,7 @@ export const TOOL_CAPABILITY_MAP: Record = { oc_task_run_start: 'core', oc_task_run_update: 'core', oc_task_start: 'core', + oc_task_update: 'core', oc_task_wait: 'core', }; @@ -507,6 +511,8 @@ export function registerAllTools(server: MCPServer): void { registerOcTaskGetTool(server); registerOcTaskCancelTool(server); registerOcTaskWaitTool(server); + registerOcTaskUpdateTool(server); + registerOcTaskFinishTool(server); // Reap any RUNNING task whose owner pid is no longer alive. Runs // once at server start (issue #855 invariant #2) so a crash on a diff --git a/src/tools/oc-task-cancel.ts b/src/tools/oc-task-cancel.ts index 7d5361d88..85f7fbcc9 100644 --- a/src/tools/oc-task-cancel.ts +++ b/src/tools/oc-task-cancel.ts @@ -62,6 +62,17 @@ const handler: ToolHandler = async ( status: 'CANCELLED' as TaskStatus, cancel_requested_at: now, ended_at: now, + last_activity_at: now, + }; + } + if (cur.kind === 'browser_task') { + return { + ...cur, + status: 'CANCELLED' as TaskStatus, + phase: 'done', + cancel_requested_at: now, + ended_at: now, + last_activity_at: now, }; } // RUNNING — set the cancel flag; the runner finishes the @@ -100,3 +111,5 @@ const handler: ToolHandler = async ( export function registerOcTaskCancelTool(server: MCPServer): void { server.registerTool(definition.name, handler, definition); } + +export const __test__ = { definition, handler }; diff --git a/src/tools/oc-task-finish.ts b/src/tools/oc-task-finish.ts new file mode 100644 index 000000000..9ef038a8f --- /dev/null +++ b/src/tools/oc-task-finish.ts @@ -0,0 +1,86 @@ +/** + * oc_task_finish — close a host-driven task envelope with a terminal status. + */ + +import { MCPServer } from '../mcp-server'; +import { MCPResult, MCPToolDefinition, ToolHandler } from '../types/mcp'; +import { getTaskStore } from '../core/task-ledger'; +import type { TaskStatus } from '../core/task-ledger'; +import { canAccessTask, taskAccessDeniedResult, waitForTaskStartupReap } from './oc-task-start'; + +const definition: MCPToolDefinition = { + name: 'oc_task_finish', + description: 'Finish a host-driven task envelope as completed, failed, or cancelled.', + inputSchema: { + type: 'object', + properties: { + task_id: { type: 'string' }, + taskId: { type: 'string', description: 'Alias for task_id.' }, + outcome: { + type: 'string', + enum: ['completed', 'failed', 'cancelled'], + description: 'REQUIRED Terminal task outcome: completed, failed, or cancelled.', + }, + note: { type: 'string' }, + }, + required: ['outcome'], + }, +}; + +const handler: ToolHandler = async (sessionId, params, context): Promise => { + const taskId = taskIdFrom(params); + if (!taskId) return errorResult('oc_task_finish: task_id is required'); + const status = statusFrom(params.outcome); + if (!status) return errorResult('oc_task_finish: outcome must be completed, failed, or cancelled'); + const note = typeof params.note === 'string' ? params.note : undefined; + await waitForTaskStartupReap(); + const store = getTaskStore(); + const meta = store.readMetaSync(taskId); + if (!meta) return errorResult(`oc_task_finish: unknown task ${taskId}`); + if (!canAccessTask(meta, sessionId, context?.principal)) return taskAccessDeniedResult(taskId); + if (meta.kind !== 'browser_task') return errorResult('oc_task_finish: only host-driven browser_task envelopes can be finished directly'); + const endedAt = Date.now(); + const updated = await store.update(taskId, (cur) => { + if (cur.status === 'COMPLETED' || cur.status === 'FAILED' || cur.status === 'CANCELLED') return undefined; + return { + ...cur, + status, + phase: 'done', + ended_at: endedAt, + last_activity_at: endedAt, + ...(note ? { args_summary: { ...cur.args_summary, final_note: note.slice(0, 500) } } : {}), + }; + }); + if (!updated) return errorResult(`oc_task_finish: task ${taskId} is already terminal`); + store.appendEvent(taskId, { + ts: endedAt, + kind: status === 'COMPLETED' ? 'completed' : status === 'CANCELLED' ? 'cancelled' : 'failed', + data: { note }, + }); + return { + content: [{ type: 'text', text: `task_id=${taskId} status=${status}` }], + meta: updated, + }; +}; + +function taskIdFrom(params: Record): string { + const v = params.task_id ?? params.taskId; + return typeof v === 'string' ? v : ''; +} + +function statusFrom(value: unknown): TaskStatus | undefined { + if (value === 'completed') return 'COMPLETED'; + if (value === 'failed') return 'FAILED'; + if (value === 'cancelled') return 'CANCELLED'; + return undefined; +} + +function errorResult(message: string): MCPResult { + return { isError: true, content: [{ type: 'text', text: message }] }; +} + +export function registerOcTaskFinishTool(server: MCPServer): void { + server.registerTool(definition.name, handler, definition); +} + +export const __test__ = { definition, handler }; diff --git a/src/tools/oc-task-get.ts b/src/tools/oc-task-get.ts index 4f384a9bf..ff75193ea 100644 --- a/src/tools/oc-task-get.ts +++ b/src/tools/oc-task-get.ts @@ -18,7 +18,8 @@ const definition: MCPToolDefinition = { inputSchema: { type: 'object', properties: { - task_id: { type: 'string', description: 'REQUIRED task_id returned by oc_task_start.' }, + task_id: { type: 'string', description: 'task_id returned by oc_task_start.' }, + taskId: { type: 'string', description: 'Alias for task_id.' }, include_result: { type: 'boolean', description: 'When true, also returns the persisted result.json contents.', @@ -32,7 +33,6 @@ const definition: MCPToolDefinition = { description: 'Alias for includeDigest.', }, }, - required: ['task_id'], }, }; @@ -41,7 +41,7 @@ const handler: ToolHandler = async ( params: Record, context?: ToolContext, ): Promise => { - const taskId = String(params.task_id ?? ''); + const taskId = String(params.task_id ?? params.taskId ?? ''); if (!taskId) { return { isError: true, content: [{ type: 'text', text: 'oc_task_get: task_id is required' }] }; } @@ -67,6 +67,13 @@ const handler: ToolHandler = async ( }, ], meta, + counters: meta.counters, + budget_status: meta.budget_status, + budget_exceeded: meta.budget_exceeded, + recommended_next: meta.recommended_next, + recent_events: meta.recent_events ?? [], + phase: meta.phase, + objective: meta.objective, ...(includeResult ? { result } : {}), ...(includeDigest ? { digest } : {}), }; @@ -75,3 +82,5 @@ const handler: ToolHandler = async ( export function registerOcTaskGetTool(server: MCPServer): void { server.registerTool(definition.name, handler, definition); } + +export const __test__ = { definition, handler }; diff --git a/src/tools/oc-task-start.ts b/src/tools/oc-task-start.ts index 280bbb2e3..04c53d56b 100644 --- a/src/tools/oc-task-start.ts +++ b/src/tools/oc-task-start.ts @@ -19,37 +19,24 @@ import { } from '../types/mcp'; import { TOOL_ANNOTATIONS } from '../types/tool-annotations'; import { - TaskStore, computeTaskId, - defaultTaskRootDir, summariseArgs, runTask, + getTaskStore, + setTaskStoreForTests, + normalizeTaskPolicy, + normalizeTaskPhase, + initialCounters, } from '../core/task-ledger'; -import type { TaskMeta, TaskOwner } from '../core/task-ledger'; +import type { TaskMeta, TaskKind, TaskOwner } from '../core/task-ledger'; import type { Principal } from '../auth/api-key-types'; -let storeSingleton: TaskStore | undefined; - -/** - * Resolve the process-wide TaskStore. Tests can clobber this via - * `setTaskStoreForTests` so each test runs against a fresh temp root. - */ -export function getTaskStore(): TaskStore { - if (!storeSingleton) { - storeSingleton = new TaskStore({ rootDir: defaultTaskRootDir() }); - } - return storeSingleton; -} - -/** Test seam — override the process-wide store with a custom instance. */ -export function setTaskStoreForTests(store: TaskStore | undefined): void { - storeSingleton = store; -} +export { getTaskStore, setTaskStoreForTests }; const definition: MCPToolDefinition = { name: 'oc_task_start', description: - 'Launch a long-running tool as a background task. Returns a task_id ' + + 'Create a task-level browser harness envelope, or launch a long-running tool as a background task. Returns a task_id ' + 'that can be polled with oc_task_get / oc_task_list / oc_task_wait, ' + 'or aborted with oc_task_cancel. The result is persisted to disk and ' + 'survives MCP-session loss.', @@ -60,15 +47,28 @@ const definition: MCPToolDefinition = { kind: { type: 'string', description: - 'REQUIRED Name of the underlying MCP tool to run. Canonical values: ' + - 'crawl, crawl_sitemap, recording, oc_evidence_bundle, oc_session_snapshot.', + 'Optional name of the underlying MCP tool to run in the background. ' + + 'Omit kind to create a task envelope for host-driven browser tool calls.', }, args: { type: 'object', - description: 'REQUIRED Arguments forwarded to the underlying tool.', + description: 'Arguments forwarded to the underlying tool when kind is set.', + }, + objective: { + type: 'string', + description: 'Host-declared objective for task-level browser harness tracking.', + }, + phase: { + type: 'string', + enum: ['explore', 'act', 'verify', 'recover', 'done'], + description: 'Initial host-declared task phase. Default: explore.', + }, + policy: { + type: 'object', + description: 'Deterministic budget policy: maxToolCalls, maxObservationStreak, maxConsecutiveSameTool, maxFailureStreak, maxSameUrlNavigations, maxWallMs, allowedDomains, checkpointEveryCalls.', }, }, - required: ['kind', 'args'], + required: [], }, }; @@ -131,6 +131,8 @@ const TASK_LEDGER_TOOLS = new Set([ 'oc_task_list', 'oc_task_wait', 'oc_task_cancel', + 'oc_task_update', + 'oc_task_finish', ]); function makeHandler(opts: StartHandlerOpts): ToolHandler { @@ -139,16 +141,24 @@ function makeHandler(opts: StartHandlerOpts): ToolHandler { params: Record, _ctx?: ToolContext, ): Promise => { - const kind = String(params.kind ?? ''); - const args = (params.args ?? {}) as Record; - if (!kind) { - return errorResult('oc_task_start: kind is required'); + const rawKind = params.kind; + if (rawKind !== undefined && (typeof rawKind !== 'string' || rawKind.trim().length === 0)) { + return errorResult('oc_task_start: kind must be a non-empty string when provided'); } + const kind = typeof rawKind === 'string' ? rawKind.trim() : 'browser_task'; + const rawArgs = params.args; + if (kind !== 'browser_task' && (!rawArgs || typeof rawArgs !== 'object' || Array.isArray(rawArgs))) { + return errorResult('oc_task_start: args must be an object when scheduling a tool task'); + } + const args = (rawArgs ?? {}) as Record; + const objective = typeof params.objective === 'string' ? params.objective : undefined; + const phase = normalizeTaskPhase(params.phase); + const policy = normalizeTaskPolicy(params.policy); if (TASK_LEDGER_TOOLS.has(kind)) { return errorResult(`oc_task_start: refusing to schedule task-ledger tool ${JSON.stringify(kind)}`); } - const inner = opts.resolveTool(kind); - if (!inner) { + const inner = kind === 'browser_task' ? null : opts.resolveTool(kind); + if (kind !== 'browser_task' && !inner) { return errorResult(`oc_task_start: tool ${JSON.stringify(kind)} is not registered`); } @@ -160,19 +170,41 @@ function makeHandler(opts: StartHandlerOpts): ToolHandler { // whole ledger for every new background job. const createdAt = Date.now(); const taskNonce = `${process.pid}:${createdAt}:${taskNonceSeq++}`; - const taskId = computeTaskId(kind, { ...args, __task_nonce: taskNonce }, createdAt); + const idSeed = kind === 'browser_task' + ? { objective: objective ?? '', phase, policy, __task_nonce: taskNonce } + : { ...args, __task_nonce: taskNonce }; + const taskId = computeTaskId(kind as TaskKind, idSeed, createdAt); const meta: TaskMeta = { task_id: taskId, kind, - status: 'PENDING', + status: kind === 'browser_task' ? 'RUNNING' : 'PENDING', pid: process.pid, created_at: createdAt, - args_summary: summariseArgs(args), + started_at: kind === 'browser_task' ? createdAt : undefined, + args_summary: summariseArgs(kind === 'browser_task' ? idSeed : args), owner: taskOwnerFor(sessionId, _ctx?.principal), task_nonce: taskNonce, + objective, + phase, + policy, + counters: initialCounters(), + budget_status: 'ok', + recent_events: [], + last_activity_at: createdAt, }; await store.create(meta); + if (kind === 'browser_task') { + store.appendEvent(taskId, { ts: createdAt, kind: 'started', data: { objective, phase } }); + return { + content: [{ type: 'text', text: `task_id=${taskId} status=RUNNING kind=browser_task phase=${phase}` }], + task_id: taskId, + status: 'RUNNING', + kind, + meta, + }; + } + // Spawn the runner in the background. We deliberately don't await // it — `oc_task_start` returns as soon as the PENDING row is on // disk so the MCP client doesn't have to hold its request open. @@ -187,7 +219,7 @@ function makeHandler(opts: StartHandlerOpts): ToolHandler { // Test seam fallback: direct handler invocation remains available for // focused unit tests, while production registration always supplies // invokeTool above so background tasks use the MCP pipeline. - return await inner(sessionId, merged, { + return await inner!(sessionId, merged, { startTime: Date.now(), deadlineMs: Number.MAX_SAFE_INTEGER, signal, @@ -208,6 +240,7 @@ function makeHandler(opts: StartHandlerOpts): ToolHandler { task_id: taskId, status: 'PENDING', kind, + meta, }; }; } diff --git a/src/tools/oc-task-update.ts b/src/tools/oc-task-update.ts new file mode 100644 index 000000000..cb8268220 --- /dev/null +++ b/src/tools/oc-task-update.ts @@ -0,0 +1,84 @@ +/** + * oc_task_update — update host-declared task envelope fields without executing browser actions. + */ + +import { MCPServer } from '../mcp-server'; +import { MCPResult, MCPToolDefinition, ToolHandler } from '../types/mcp'; +import { getTaskStore, normalizeTaskPhase } from '../core/task-ledger'; +import { canAccessTask, taskAccessDeniedResult } from './oc-task-start'; + +const definition: MCPToolDefinition = { + name: 'oc_task_update', + description: 'Update a task envelope phase or note. Does not execute browser actions.', + inputSchema: { + type: 'object', + properties: { + task_id: { type: 'string' }, + taskId: { type: 'string', description: 'Alias for task_id.' }, + phase: { + type: 'string', + enum: ['explore', 'act', 'verify', 'recover', 'done'], + }, + note: { type: 'string' }, + }, + required: [], + }, +}; + +const handler: ToolHandler = async (sessionId, params, context): Promise => { + const taskId = taskIdFrom(params); + if (!taskId) return errorResult('oc_task_update: task_id is required'); + const phase = params.phase !== undefined ? validTaskPhase(params.phase) : undefined; + if (params.phase !== undefined && !phase) { + return errorResult('oc_task_update: phase must be explore, act, verify, recover, or done'); + } + const note = typeof params.note === 'string' ? params.note : undefined; + const store = getTaskStore(); + const meta = store.readMetaSync(taskId); + if (!meta) return errorResult(`oc_task_update: unknown task ${taskId}`); + if (!canAccessTask(meta, sessionId, context?.principal)) return taskAccessDeniedResult(taskId); + if (meta.kind !== 'browser_task') return errorResult('oc_task_update: only host-driven browser_task envelopes can be updated'); + const updated = await store.update(taskId, (cur) => { + if (cur.status === 'COMPLETED' || cur.status === 'FAILED' || cur.status === 'CANCELLED') return undefined; + return { + ...cur, + ...(phase ? { phase } : {}), + ...(note ? { args_summary: { ...cur.args_summary, last_note: note.slice(0, 500) } } : {}), + last_activity_at: Date.now(), + }; + }); + if (!updated) return errorResult(`oc_task_update: task ${taskId} is terminal and cannot be updated`); + store.appendEvent(taskId, { ts: Date.now(), kind: 'log', data: { phase, note } }); + return { + content: [{ type: 'text', text: `task_id=${taskId} phase=${updated.phase ?? 'explore'} status=${updated.status}` }], + meta: updated, + }; +}; + +function taskIdFrom(params: Record): string { + const v = params.task_id ?? params.taskId; + return typeof v === 'string' ? v : ''; +} + +function validTaskPhase(value: unknown): ReturnType | undefined { + if ( + value === 'explore' || + value === 'act' || + value === 'verify' || + value === 'recover' || + value === 'done' + ) { + return value; + } + return undefined; +} + +function errorResult(message: string): MCPResult { + return { isError: true, content: [{ type: 'text', text: message }] }; +} + +export function registerOcTaskUpdateTool(server: MCPServer): void { + server.registerTool(definition.name, handler, definition); +} + +export const __test__ = { definition, handler }; diff --git a/tests/core/task-ledger/budget.test.ts b/tests/core/task-ledger/budget.test.ts new file mode 100644 index 000000000..8f477fdc6 --- /dev/null +++ b/tests/core/task-ledger/budget.test.ts @@ -0,0 +1,170 @@ +import { TaskStore, computeTaskId } from '../../../src/core/task-ledger'; +import { applyToolCallToTask, initialCounters, normalizeTaskPolicy } from '../../../src/core/task-ledger'; +import { recordTaskToolCall } from '../../../src/core/task-ledger/envelope'; +import type { TaskMeta, RecordedToolCall } from '../../../src/core/task-ledger'; +import * as fs from 'node:fs'; +import * as os from 'node:os'; +import * as path from 'node:path'; + +function makeMeta(overrides: Partial = {}): TaskMeta { + return { + task_id: 'aaaaaaaaaaaaaaaa', + kind: 'browser_task', + status: 'RUNNING', + pid: process.pid, + created_at: Date.now(), + started_at: Date.now(), + args_summary: {}, + objective: 'test objective', + phase: 'explore', + policy: normalizeTaskPolicy({ maxObservationStreak: 3, maxConsecutiveSameTool: 3, maxSameUrlNavigations: 2 }), + counters: initialCounters(), + budget_status: 'ok', + recent_events: [], + ...overrides, + }; +} + +function call(tool: string, args: Record = {}, ok = true): RecordedToolCall { + return { ts: Date.now(), tool, sessionId: 'sess', args, durationMs: 5, ok }; +} + +describe('task envelope budget evaluation', () => { + test('observation streak exceeds configured budget after repeated read_page calls', () => { + let meta = makeMeta(); + meta = applyToolCallToTask(meta, call('read_page')); + meta = applyToolCallToTask(meta, call('read_page')); + meta = applyToolCallToTask(meta, call('read_page')); + expect(meta.budget_status).toBe('warning'); + meta = applyToolCallToTask(meta, call('read_page')); + expect(meta.budget_status).toBe('exceeded'); + expect(meta.budget_exceeded).toContain('maxObservationStreak'); + expect(meta.recommended_next).toBe('change_strategy_or_verify'); + }); + + + test('expanded read-only browser tools count as observations', () => { + let meta = makeMeta(); + meta = applyToolCallToTask(meta, call('inspect')); + meta = applyToolCallToTask(meta, call('tabs_list')); + + expect(meta.counters?.observationCalls).toBe(2); + expect(meta.counters?.observationStreak).toBe(2); + expect(meta.counters?.actionCalls).toBe(0); + }); + + test('action calls reset observation streak while incrementing action count', () => { + let meta = makeMeta(); + meta = applyToolCallToTask(meta, call('read_page')); + meta = applyToolCallToTask(meta, call('interact')); + expect(meta.counters?.observationStreak).toBe(0); + expect(meta.counters?.actionCalls).toBe(1); + }); + + test('same URL navigation budget is tracked per URL', () => { + let meta = makeMeta(); + meta = applyToolCallToTask(meta, call('navigate', { url: 'http://localhost/a' })); + meta = applyToolCallToTask(meta, call('navigate', { url: 'http://localhost/a' })); + expect(meta.budget_status).toBe('warning'); + meta = applyToolCallToTask(meta, call('navigate', { url: 'http://localhost/a' })); + expect(meta.budget_status).toBe('exceeded'); + expect(meta.budget_exceeded).toContain('maxSameUrlNavigations'); + }); + + test('same URL navigation budget stays exceeded after later non-navigation calls', () => { + let meta = makeMeta(); + meta = applyToolCallToTask(meta, call('navigate', { url: 'http://localhost/a' })); + meta = applyToolCallToTask(meta, call('navigate', { url: 'http://localhost/a' })); + meta = applyToolCallToTask(meta, call('navigate', { url: 'http://localhost/a' })); + expect(meta.budget_status).toBe('exceeded'); + + meta = applyToolCallToTask(meta, call('read_page')); + + expect(meta.budget_status).toBe('exceeded'); + expect(meta.budget_exceeded).toContain('maxSameUrlNavigations'); + }); + + test('allowed domain policy rejects out-of-scope browser URLs', () => { + let meta = makeMeta({ policy: normalizeTaskPolicy({ allowedDomains: ['example.com'] }) }); + + meta = applyToolCallToTask(meta, call('navigate', { url: 'https://docs.example.com/start' })); + expect(meta.budget_status).toBe('ok'); + + meta = applyToolCallToTask(meta, call('navigate', { url: 'https://evil.test/phish' })); + expect(meta.budget_status).toBe('exceeded'); + expect(meta.budget_exceeded).toContain('allowedDomains'); + expect(meta.recommended_next).toBe('change_strategy_or_verify'); + }); + + test('checkpoint cadence policy requests a warning at configured call intervals', () => { + let meta = makeMeta({ policy: normalizeTaskPolicy({ checkpointEveryCalls: 2 }) }); + + meta = applyToolCallToTask(meta, call('read_page')); + expect(meta.budget_status).toBe('ok'); + + meta = applyToolCallToTask(meta, call('read_page')); + expect(meta.budget_status).toBe('warning'); + expect(meta.recommended_next).toBe('checkpoint_or_verify'); + expect(meta.recent_events?.at(-1)?.summary).toContain('budget_warning=checkpointEveryCalls'); + }); +}); + +describe('task envelope store integration', () => { + let root: string; + let store: TaskStore; + + beforeEach(() => { + root = fs.mkdtempSync(path.join(os.tmpdir(), 'oc-task-envelope-')); + store = new TaskStore({ rootDir: root }); + }); + + afterEach(() => fs.rmSync(root, { recursive: true, force: true })); + + test('absent task id path is a no-op for normal tool calls', async () => { + await expect(store.list()).resolves.toEqual([]); + }); + + test('task ids for browser_task envelopes are 16-hex and deterministic inputs vary by created_at', () => { + const a = computeTaskId('browser_task', { objective: 'x' }, 1); + const b = computeTaskId('browser_task', { objective: 'x' }, 2); + expect(a).toMatch(/^[0-9a-f]{16}$/); + expect(a).not.toBe(b); + }); + + + test('recordTaskToolCall ignores non-browser_task rows', async () => { + const meta = makeMeta({ kind: 'crawl', status: 'RUNNING' }); + await store.create(meta); + + await recordTaskToolCall(store, meta.task_id, { + ts: Date.now(), + tool: 'read_page', + sessionId: 'sess', + args: {}, + durationMs: 1, + ok: true, + }); + + expect(store.readMetaSync(meta.task_id)?.counters?.toolCalls).toBe(0); + }); + + test('recordTaskToolCall ignores cross-owner task ids', async () => { + const meta = makeMeta({ + owner: { session_id: 'sess-a', tenant_id: 'tenant-a', key_id: 'key-a', mode: 'api-key' }, + }); + await store.create(meta); + + await recordTaskToolCall(store, meta.task_id, { + ts: Date.now(), + tool: 'read_page', + sessionId: 'sess-a', + tenantId: 'tenant-b', + principalMode: 'api-key', + args: {}, + durationMs: 1, + ok: true, + }); + + expect(store.readMetaSync(meta.task_id)?.counters?.toolCalls).toBe(0); + }); +}); diff --git a/tests/core/task-ledger/tools.test.ts b/tests/core/task-ledger/tools.test.ts index c6f731cf2..a0fe6a242 100644 --- a/tests/core/task-ledger/tools.test.ts +++ b/tests/core/task-ledger/tools.test.ts @@ -15,6 +15,10 @@ import { getTaskStore, setTaskStoreForTests, } from '../../../src/tools/oc-task-start'; +import { __test__ as taskGetTest } from '../../../src/tools/oc-task-get'; +import { __test__ as taskUpdateTest } from '../../../src/tools/oc-task-update'; +import { __test__ as taskFinishTest } from '../../../src/tools/oc-task-finish'; +import { __test__ as taskCancelTest } from '../../../src/tools/oc-task-cancel'; import type { MCPResult, ToolHandler } from '../../../src/types/mcp'; function tempRoot(): string { @@ -66,16 +70,100 @@ describe('oc_task_start handler — happy path', () => { expect(result.url).toBe('https://example.com'); }); + + + test('creates a host-driven browser_task envelope when kind is omitted', async () => { + const handler = __test__.makeHandler({ resolveTool: () => null }); + const out = await handler('sess-1', { + objective: 'exercise budgets', + phase: 'explore', + policy: { maxObservationStreak: 3 }, + }); + expect(out.task_id).toMatch(/^[0-9a-f]{16}$/); + expect(out.status).toBe('RUNNING'); + const meta = getTaskStore().readMetaSync(out.task_id as string); + expect(meta?.kind).toBe('browser_task'); + expect(meta?.objective).toBe('exercise budgets'); + expect(meta?.policy?.maxObservationStreak).toBe(3); + expect(meta?.budget_status).toBe('ok'); + }); + + test('oc_task_cancel moves browser_task envelopes directly to CANCELLED', async () => { + const handler = __test__.makeHandler({ resolveTool: () => null }); + const started = await handler('sess-1', { + objective: 'host-managed browser work', + phase: 'act', + }); + const taskId = started.task_id as string; + + const cancelled = await taskCancelTest.handler('sess-1', { task_id: taskId }); + + expect(cancelled.isError).not.toBe(true); + expect(cancelled.meta).toMatchObject({ + task_id: taskId, + kind: 'browser_task', + status: 'CANCELLED', + phase: 'done', + }); + const meta = getTaskStore().readMetaSync(taskId); + expect(meta?.status).toBe('CANCELLED'); + expect(meta?.ended_at).toBeDefined(); + expect(meta?.cancel_requested_at).toBeDefined(); + }); + + + test('rejects malformed kind instead of creating a browser_task envelope', async () => { + const handler = __test__.makeHandler({ resolveTool: () => null }); + + const out = await handler('sess-1', { kind: 42 }); + + expect(out.isError).toBe(true); + expect(out.content?.[0]?.text).toContain('kind must be a non-empty string'); + }); + + test('rejects missing args when scheduling a non-browser tool task', async () => { + const innerTool = jest.fn, Parameters>(async () => ({ + content: [{ type: 'text', text: 'should not run' }], + })); + const handler = __test__.makeHandler({ + resolveTool: (name) => (name === 'fake_inner' ? innerTool : null), + }); + + const out = await handler('sess-1', { kind: 'fake_inner' }); + + expect(out.isError).toBe(true); + expect(out.content?.[0]?.text).toContain('args must be an object'); + expect(innerTool).not.toHaveBeenCalled(); + }); + test('returns isError when tool name is not registered', async () => { const handler = __test__.makeHandler({ resolveTool: () => null }); const out = await handler('sess-1', { kind: 'nope', args: {} }); expect(out.isError).toBe(true); }); - test('returns isError when kind is missing', async () => { + test('omitted kind starts an envelope instead of launching an inner tool', async () => { const handler = __test__.makeHandler({ resolveTool: () => null }); const out = await handler('sess-1', { args: {} }); - expect(out.isError).toBe(true); + expect(out.isError).not.toBe(true); + expect(out.kind).toBe('browser_task'); + }); + + test('oc_task_get accepts taskId alias through schema and handler', async () => { + const handler = __test__.makeHandler({ resolveTool: () => null }); + const started = await handler('sess-1', { + objective: 'poll through alias', + phase: 'explore', + }); + + expect(taskGetTest.definition.inputSchema.required).toBeUndefined(); + + const fetched = await taskGetTest.handler('sess-1', { + taskId: started.task_id, + }); + + expect(fetched.isError).not.toBe(true); + expect(fetched.meta).toMatchObject({ task_id: started.task_id, objective: 'poll through alias' }); }); test('rejects recursive task-ledger tool scheduling', async () => { @@ -93,6 +181,18 @@ describe('oc_task_start handler — happy path', () => { expect(innerTool).not.toHaveBeenCalled(); }); + test('rejects update and finish task-ledger tools from async scheduling', async () => { + const handler = __test__.makeHandler({ resolveTool: () => async () => ({ content: [] }) }); + + const update = await handler('sess-1', { kind: 'oc_task_update', args: {} }); + const finish = await handler('sess-1', { kind: 'oc_task_finish', args: {} }); + + expect(update.isError).toBe(true); + expect(update.content?.[0]?.text).toContain('refusing to schedule'); + expect(finish.isError).toBe(true); + expect(finish.content?.[0]?.text).toContain('refusing to schedule'); + }); + test('identical starts in the same millisecond get distinct task ids', async () => { const innerTool: ToolHandler = async () => ({ content: [{ type: 'text', text: 'ok' }] }); @@ -143,4 +243,51 @@ describe('oc_task_start handler — happy path', () => { await new Promise((r) => setTimeout(r, 10)); } }); + + test('oc_task_update and oc_task_finish enforce task ownership', async () => { + const handler = __test__.makeHandler({ resolveTool: () => null }); + const principal = { mode: 'api-key' as const, tenantId: 'tenant-a', keyId: 'key-a', scopes: ['write' as const] }; + const started = await handler('sess-owned', { objective: 'owned task' }, { + startTime: Date.now(), + deadlineMs: 1000, + principal, + }); + const taskId = started.task_id as string; + const otherPrincipal = { mode: 'api-key' as const, tenantId: 'tenant-b', keyId: 'key-b', scopes: ['write' as const] }; + + const deniedUpdate = await taskUpdateTest.handler('sess-owned', { taskId, phase: 'act' }, { + startTime: Date.now(), + deadlineMs: 1000, + principal: otherPrincipal, + }); + const deniedFinish = await taskFinishTest.handler('sess-owned', { taskId, outcome: 'completed' }, { + startTime: Date.now(), + deadlineMs: 1000, + principal: otherPrincipal, + }); + const allowedUpdate = await taskUpdateTest.handler('sess-owned', { taskId, phase: 'act' }, { + startTime: Date.now(), + deadlineMs: 1000, + principal, + }); + + expect(deniedUpdate.isError).toBe(true); + expect(deniedFinish.isError).toBe(true); + expect(allowedUpdate.isError).not.toBe(true); + expect(allowedUpdate.meta).toMatchObject({ task_id: taskId, phase: 'act' }); + }); + + test('oc_task_update rejects invalid phases instead of coercing them', async () => { + const handler = __test__.makeHandler({ resolveTool: () => null }); + const started = await handler('sess-1', { objective: 'phase validation' }); + + const out = await taskUpdateTest.handler('sess-1', { + taskId: started.task_id, + phase: 'not-a-phase', + }); + + expect(out.isError).toBe(true); + expect(out.content?.[0]?.text).toContain('phase must be'); + expect(getTaskStore().readMetaSync(started.task_id as string)?.phase).toBe('explore'); + }); });