diff --git a/applications/web/sources/components/schedulers/RunHistorySection.tsx b/applications/web/sources/components/schedulers/RunHistorySection.tsx index 6b38fd5..06fac48 100644 --- a/applications/web/sources/components/schedulers/RunHistorySection.tsx +++ b/applications/web/sources/components/schedulers/RunHistorySection.tsx @@ -223,13 +223,13 @@ export function RunHistorySection({ schedulerID }: RunHistorySectionProperties) {run.error} )} - {run.result && ( + {run.result != null && ( <> Result {JSON.stringify(run.result, null, 2)} )} - {!run.error && !run.result && ( + {!run.error && run.result == null && ( No additional details. )} diff --git a/functions/crawler-code-runner-function/sources/index.ts b/functions/crawler-code-runner-function/sources/index.ts index 2e0ca50..fec1d7c 100644 --- a/functions/crawler-code-runner-function/sources/index.ts +++ b/functions/crawler-code-runner-function/sources/index.ts @@ -1,4 +1,4 @@ -import { createServerLogger } from '@audio-underview/logger'; +import { createServerLogger, Logger } from '@audio-underview/logger'; import { type LambdaEvent, type LambdaResponse, @@ -10,12 +10,36 @@ import { import { createContext, Script } from 'node:vm'; import { lookup } from 'node:dns/promises'; -interface RunRequestBody { - type: 'test' | 'run'; +interface WebRunRequestBody { + type: 'web'; + mode: 'test' | 'run'; url: string; code: string; } +interface DataRunRequestBody { + type: 'data'; + mode: 'test' | 'run'; + data: unknown; + code: string; +} + +type RunRequestBody = WebRunRequestBody | DataRunRequestBody; + +class SandboxTimeoutError extends Error { + constructor(message: string) { + super(message); + this.name = 'SandboxTimeoutError'; + } +} + +class SandboxExecutionError extends Error { + constructor(message: string) { + super(message); + this.name = 'SandboxExecutionError'; + } +} + const FETCH_TIMEOUT_MILLISECONDS = 10_000; const CODE_EXECUTION_TIMEOUT_MILLISECONDS = 5_000; const MAX_CODE_LENGTH = 10_000; @@ -67,6 
+91,90 @@ async function validateTargetURL(targetURL: URL, context: ResponseContext): Prom return null; } +async function executeInSandbox(code: string, argument: unknown, logger: Logger): Promise<{ result: unknown }> { + const sandbox = createContext({ + Array, + Boolean, + Date, + Error, + JSON, + Map, + Math, + Number, + Object, + Promise, + RegExp, + Set, + String, + TypeError, + RangeError, + URL, + URLSearchParams, + parseInt, + parseFloat, + isNaN, + isFinite, + encodeURIComponent, + decodeURIComponent, + encodeURI, + decodeURI, + undefined, + NaN, + Infinity, + }); + + // For non-string arguments, re-create inside sandbox context via JSON.parse. + // This ensures Array.isArray, instanceof etc. work correctly inside the sandbox. + let sandboxArgument: unknown = argument; + if (typeof argument !== 'string') { + sandbox.__rawInput__ = JSON.stringify(argument); + new Script('globalThis.__input__ = JSON.parse(globalThis.__rawInput__)').runInContext(sandbox); + sandboxArgument = sandbox.__input__; + delete sandbox.__rawInput__; + delete sandbox.__input__; + } + + try { + const script = new Script(`(${code})`); + const userFunction = script.runInContext(sandbox, { timeout: CODE_EXECUTION_TIMEOUT_MILLISECONDS }); + let timer: ReturnType | undefined; + const asyncTimeout = new Promise((_, reject) => { + timer = setTimeout(() => reject(new Error('Async execution timed out')), CODE_EXECUTION_TIMEOUT_MILLISECONDS); + timer.unref(); + }); + let result: unknown; + try { + result = await Promise.race([userFunction(sandboxArgument), asyncTimeout]); + } finally { + clearTimeout(timer); + } + + // Normalize undefined to null to prevent JSON.stringify field drop + if (result === undefined) { + result = null; + } + + return { result }; + } catch (executionError) { + logger.error('Code execution failed', executionError, { function: 'executeInSandbox' }); + if ( + executionError != null + && typeof executionError === 'object' + && 'code' in executionError + && 
executionError.code === 'ERR_SCRIPT_EXECUTION_TIMEOUT' + ) { + throw new SandboxTimeoutError(`Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`); + } + const message = executionError instanceof Error + ? executionError.message + : 'Unknown execution error'; + if (message === 'Async execution timed out') { + throw new SandboxTimeoutError(`Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`); + } + throw new SandboxExecutionError(message); + } +} + const logger = createServerLogger({ defaultContext: { module: 'crawler-code-runner-function', @@ -81,24 +189,58 @@ const HELP = { { method: 'POST', path: '/run', - description: 'Fetch a URL and run code against the response body', + description: 'Run code against a fetched URL response (web) or provided data (data)', body: { - type: "'test' | 'run'", - url: 'string - The URL to fetch', - code: 'string - JavaScript function source to execute against the fetched response body', + type: "'web' | 'data'", + mode: "'test' | 'run'", + url: "string - The URL to fetch (required for type 'web')", + data: "unknown - The data to process (required for type 'data')", + code: 'string - JavaScript function source to execute against the input', }, }, ], }; -function isRunRequestBody(value: unknown): value is RunRequestBody { - if (value == null || typeof value !== 'object') return false; - const object = value as Record; - return ( - (object.type === 'test' || object.type === 'run') - && typeof object.url === 'string' - && typeof object.code === 'string' - ); +function validateRunRequestBody(raw: unknown): RunRequestBody | string { + if (raw == null || typeof raw !== 'object') { + return 'Request body must be a JSON object'; + } + const object = raw as Record; + + if (object.type !== 'web' && object.type !== 'data') { + return "Field 'type' must be 'web' or 'data'"; + } + + if (object.mode !== 'test' && object.mode !== 'run') { + return "Field 'mode' must be 'test' or 'run'"; + } + + if (typeof 
object.code !== 'string') { + return "Field 'code' is required and must be a string"; + } + + if (object.type === 'web') { + if (typeof object.url !== 'string') { + return "Field 'url' is required and must be a string when type is 'web'"; + } + return { + type: 'web', + mode: object.mode as 'test' | 'run', + url: object.url as string, + code: object.code as string, + }; + } + + // type === 'data' + if (!('data' in object)) { + return "Field 'data' is required when type is 'data'"; + } + return { + type: 'data', + mode: object.mode as 'test' | 'run', + data: object.data, + code: object.code as string, + }; } async function handleRun(body: string | undefined, context: ResponseContext): Promise { @@ -109,116 +251,76 @@ async function handleRun(body: string | undefined, context: ResponseContext): Pr return errorResponse('invalid_request', 'Request body must be valid JSON', 400, context); } - if (!isRunRequestBody(raw)) { - const object = raw != null && typeof raw === 'object' ? raw as Record : {}; - if (!object.type || (object.type !== 'test' && object.type !== 'run')) { - return errorResponse('invalid_request', "Field 'type' must be 'test' or 'run'", 400, context); - } - if (!object.url || typeof object.url !== 'string') { - return errorResponse('invalid_request', "Field 'url' is required and must be a string", 400, context); - } - return errorResponse('invalid_request', "Field 'code' is required and must be a string", 400, context); + const validated = validateRunRequestBody(raw); + if (typeof validated === 'string') { + return errorResponse('invalid_request', validated, 400, context); } - const parsed = raw; + const parsed = validated; if (parsed.code.length > MAX_CODE_LENGTH) { return errorResponse('invalid_request', `Field 'code' exceeds maximum length of ${MAX_CODE_LENGTH} characters`, 400, context); } - let targetURL: URL; - try { - targetURL = new URL(parsed.url); - } catch { - return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, 
context); - } + if (parsed.type === 'web') { + let targetURL: URL; + try { + targetURL = new URL(parsed.url); + } catch { + return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context); + } - const ssrfError = await validateTargetURL(targetURL, context); - if (ssrfError) { - return ssrfError; - } + const ssrfError = await validateTargetURL(targetURL, context); + if (ssrfError) { + return ssrfError; + } - let responseText: string; - try { - const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS); - logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' }); - const fetchResponse = await fetch(targetURL.toString(), { signal }); - responseText = await fetchResponse.text(); - logger.info('Target URL fetched', { - status: fetchResponse.status, - contentLength: responseText.length, - }, { function: 'handleRun' }); - } catch (fetchError) { - if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') { - logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' }); - return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context); + let responseText: string; + try { + const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS); + logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' }); + const fetchResponse = await fetch(targetURL.toString(), { signal }); + responseText = await fetchResponse.text(); + logger.info('Target URL fetched', { + status: fetchResponse.status, + contentLength: responseText.length, + }, { function: 'handleRun' }); + } catch (fetchError) { + if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') { + logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' }); + return errorResponse('fetch_timeout', 
`Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context); + } + logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' }); + return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context); } - logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' }); - return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context); - } - let result: unknown; - try { - const sandbox = createContext({ - Array, - Boolean, - Date, - Error, - JSON, - Map, - Math, - Number, - Object, - Promise, - RegExp, - Set, - String, - TypeError, - RangeError, - parseInt, - parseFloat, - isNaN, - isFinite, - encodeURIComponent, - decodeURIComponent, - encodeURI, - decodeURI, - undefined, - NaN, - Infinity, - }); - const script = new Script(`(${parsed.code})`); - const userFunction = script.runInContext(sandbox, { timeout: CODE_EXECUTION_TIMEOUT_MILLISECONDS }); - let timer: ReturnType | undefined; - const asyncTimeout = new Promise((_, reject) => { - timer = setTimeout(() => reject(new Error('Async execution timed out')), CODE_EXECUTION_TIMEOUT_MILLISECONDS); - timer.unref(); - }); try { - result = await Promise.race([userFunction(responseText), asyncTimeout]); - } finally { - clearTimeout(timer); + const { result } = await executeInSandbox(parsed.code, responseText, logger); + return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context); + } catch (executionError) { + if (executionError instanceof SandboxTimeoutError) { + return errorResponse('execution_timeout', executionError.message, 422, context); + } + if (executionError instanceof SandboxExecutionError) { + return errorResponse('execution_failed', executionError.message, 422, context); + } + return errorResponse('execution_failed', 'Unknown execution error', 422, context); } + } + + // type === 'data' + try { + const { result } = await 
executeInSandbox(parsed.code, parsed.data, logger); + return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context); } catch (executionError) { - logger.error('Code execution failed', executionError, { function: 'handleRun' }); - if ( - executionError != null - && typeof executionError === 'object' - && 'code' in executionError - && executionError.code === 'ERR_SCRIPT_EXECUTION_TIMEOUT' - ) { - return errorResponse('execution_timeout', `Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`, 422, context); + if (executionError instanceof SandboxTimeoutError) { + return errorResponse('execution_timeout', executionError.message, 422, context); } - const message = executionError instanceof Error - ? executionError.message - : 'Unknown execution error'; - if (message === 'Async execution timed out') { - return errorResponse('execution_timeout', `Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`, 422, context); + if (executionError instanceof SandboxExecutionError) { + return errorResponse('execution_failed', executionError.message, 422, context); } - return errorResponse('execution_failed', message, 422, context); + return errorResponse('execution_failed', 'Unknown execution error', 422, context); } - - return jsonResponse({ type: parsed.type, result }, 200, context); } export async function handler(event: LambdaEvent): Promise { diff --git a/functions/crawler-code-runner-function/tests/index.test.ts b/functions/crawler-code-runner-function/tests/index.test.ts index b498c9a..fae66d5 100644 --- a/functions/crawler-code-runner-function/tests/index.test.ts +++ b/functions/crawler-code-runner-function/tests/index.test.ts @@ -109,7 +109,7 @@ describe('crawler-code-runner-function', () => { path: '/run', origin: 'https://example.com', contentType: 'application/json', - body: JSON.stringify({ url: 'https://example.com', code: '(x) => x' }), + body: JSON.stringify({ mode: 'test', url: 'https://example.com', code: 
'(x) => x' }), }); const response = await handler(event); @@ -125,22 +125,55 @@ describe('crawler-code-runner-function', () => { path: '/run', origin: 'https://example.com', contentType: 'application/json', - body: JSON.stringify({ type: 'invalid', url: 'https://example.com', code: '(x) => x' }), + body: JSON.stringify({ type: 'invalid', mode: 'test', url: 'https://example.com', code: '(x) => x' }), }); const response = await handler(event); expect(response.statusCode).toBe(400); const body = JSON.parse(response.body); expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('type'); }); - it('returns 400 for missing url field', async () => { + it('returns 400 for missing mode field', async () => { const event = createEvent({ method: 'POST', path: '/run', origin: 'https://example.com', contentType: 'application/json', - body: JSON.stringify({ type: 'run', code: '(x) => x' }), + body: JSON.stringify({ type: 'web', url: 'https://example.com', code: '(x) => x' }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(400); + const body = JSON.parse(response.body); + expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('mode'); + }); + + it('returns 400 for invalid mode value', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ type: 'web', mode: 'invalid', url: 'https://example.com', code: '(x) => x' }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(400); + const body = JSON.parse(response.body); + expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('mode'); + }); + + it('returns 400 for missing url field when type is web', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: 
JSON.stringify({ type: 'web', mode: 'run', code: '(x) => x' }), }); const response = await handler(event); @@ -150,13 +183,29 @@ describe('crawler-code-runner-function', () => { expect(body.error_description).toContain('url'); }); + it('returns 400 for missing data field when type is data', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ type: 'data', mode: 'run', code: '(x) => x' }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(400); + const body = JSON.parse(response.body); + expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('data'); + }); + it('returns 400 for missing code field', async () => { const event = createEvent({ method: 'POST', path: '/run', origin: 'https://example.com', contentType: 'application/json', - body: JSON.stringify({ type: 'run', url: 'https://example.com' }), + body: JSON.stringify({ type: 'web', mode: 'run', url: 'https://example.com' }), }); const response = await handler(event); @@ -172,7 +221,7 @@ describe('crawler-code-runner-function', () => { path: '/run', origin: 'https://example.com', contentType: 'application/json', - body: JSON.stringify({ type: 'run', url: 'not-a-url', code: '(x) => x' }), + body: JSON.stringify({ type: 'web', mode: 'run', url: 'not-a-url', code: '(x) => x' }), }); const response = await handler(event); @@ -183,7 +232,7 @@ describe('crawler-code-runner-function', () => { }); }); - describe('POST /run fetch failures', () => { + describe('POST /run web type - fetch failures', () => { it('returns 502 when target URL fetch fails', async () => { vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('Network error'))); @@ -193,7 +242,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', 
url: 'https://target.example.com/data', code: '(text) => text', }), @@ -208,7 +258,7 @@ describe('crawler-code-runner-function', () => { }); }); - describe('POST /run execution failures', () => { + describe('POST /run web type - execution failures', () => { beforeEach(() => { vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ status: 200, @@ -227,7 +277,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: '(text) => { throw new Error("intentional error"); }', }), @@ -247,7 +298,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: '((( invalid syntax', }), @@ -260,12 +312,12 @@ describe('crawler-code-runner-function', () => { }); }); - describe('POST /run successful execution', () => { + describe('POST /run web type - successful execution', () => { afterEach(() => { vi.unstubAllGlobals(); }); - it('executes code and returns result with type test', async () => { + it('executes code and returns result with mode test', async () => { vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ status: 200, text: () => Promise.resolve('hello world'), @@ -277,7 +329,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'test', + type: 'web', + mode: 'test', url: 'https://target.example.com/data', code: '(text) => text.toUpperCase()', }), @@ -286,11 +339,12 @@ describe('crawler-code-runner-function', () => { expect(response.statusCode).toBe(200); const body = JSON.parse(response.body); - expect(body.type).toBe('test'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('test'); expect(body.result).toBe('HELLO WORLD'); }); - 
it('executes code and returns result with type run', async () => { + it('executes code and returns result with mode run', async () => { vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ status: 200, text: () => Promise.resolve('hello world'), @@ -302,7 +356,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: '(text) => text.length', }), @@ -311,7 +366,8 @@ describe('crawler-code-runner-function', () => { expect(response.statusCode).toBe(200); const body = JSON.parse(response.body); - expect(body.type).toBe('run'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('run'); expect(body.result).toBe(11); }); @@ -327,7 +383,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/page', code: '(text) => ({ length: text.length, hasTitle: text.includes("") })', }), @@ -336,7 +393,8 @@ describe('crawler-code-runner-function', () => { expect(response.statusCode).toBe(200); const body = JSON.parse(response.body); - expect(body.type).toBe('run'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('run'); expect(body.result.length).toBe(43); expect(body.result.hasTitle).toBe(true); }); @@ -353,7 +411,8 @@ describe('crawler-code-runner-function', () => { origin: 'https://example.com', contentType: 'application/json', body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: 'async (text) => text.split(" ")', }), @@ -362,7 +421,8 @@ describe('crawler-code-runner-function', () => { expect(response.statusCode).toBe(200); const body = JSON.parse(response.body); - expect(body.type).toBe('run'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('run'); 
expect(body.result).toEqual(['async', 'test']); }); @@ -373,7 +433,8 @@ describe('crawler-code-runner-function', () => { })); const rawBody = JSON.stringify({ - type: 'test', + type: 'web', + mode: 'test', url: 'https://target.example.com/data', code: '(text) => text.toUpperCase()', }); @@ -389,9 +450,229 @@ describe('crawler-code-runner-function', () => { expect(response.statusCode).toBe(200); const body = JSON.parse(response.body); - expect(body.type).toBe('test'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('test'); expect(body.result).toBe('HELLO'); }); + + it('normalizes undefined result to null', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ + status: 200, + text: () => Promise.resolve('hello'), + })); + + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'web', + mode: 'run', + url: 'https://target.example.com/data', + code: '(text) => undefined', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.result).toBeNull(); + }); + }); + + describe('POST /run data type - successful execution', () => { + it('executes code against provided data object', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: { items: [1, 2, 3] }, + code: '(data) => data.items.length', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.type).toBe('data'); + expect(body.mode).toBe('run'); + expect(body.result).toBe(3); + }); + + it('executes code against provided data array', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + 
contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'test', + data: [10, 20, 30], + code: '(data) => data.map((x) => x * 2)', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.type).toBe('data'); + expect(body.mode).toBe('test'); + expect(body.result).toEqual([20, 40, 60]); + }); + + it('executes code against provided string data', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: 'hello world', + code: '(data) => data.toUpperCase()', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.type).toBe('data'); + expect(body.mode).toBe('run'); + expect(body.result).toBe('HELLO WORLD'); + }); + + it('executes code against null data', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: null, + code: '(data) => data === null ? 
"was null" : "not null"', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.type).toBe('data'); + expect(body.mode).toBe('run'); + expect(body.result).toBe('was null'); + }); + + it('ensures Array.isArray works for array data in sandbox', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'test', + data: [1, 2, 3], + code: '(data) => Array.isArray(data)', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.result).toBe(true); + }); + + it('normalizes undefined result to null for data type', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: 'anything', + code: '(data) => undefined', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.result).toBeNull(); + }); + + it('does not perform SSRF check for data type', async () => { + // data type should not trigger any fetch or DNS lookup + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: { value: 42 }, + code: '(data) => data.value + 1', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(200); + const body = JSON.parse(response.body); + expect(body.result).toBe(43); + }); + }); + + describe('POST /run data type - execution failures', () => { + it('returns 422 when code execution throws an error', async () => { + const event = createEvent({ + 
method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: { value: 1 }, + code: '(data) => { throw new Error("data processing error"); }', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(422); + const body = JSON.parse(response.body); + expect(body.error).toBe('execution_failed'); + expect(body.error_description).toContain('data processing error'); + }); + + it('returns 422 when code is syntactically invalid', async () => { + const event = createEvent({ + method: 'POST', + path: '/run', + origin: 'https://example.com', + contentType: 'application/json', + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: 'test', + code: '((( invalid syntax', + }), + }); + const response = await handler(event); + + expect(response.statusCode).toBe(422); + const body = JSON.parse(response.body); + expect(body.error).toBe('execution_failed'); + }); }); describe('unknown routes', () => { diff --git a/packages/supabase-connector/migrations/006_add_active_run_unique_constraint.sql b/packages/supabase-connector/migrations/006_add_active_run_unique_constraint.sql new file mode 100644 index 0000000..798190b --- /dev/null +++ b/packages/supabase-connector/migrations/006_add_active_run_unique_constraint.sql @@ -0,0 +1,19 @@ +-- Migration: Prevent concurrent runs for the same scheduler +-- Only one run with status 'pending' or 'running' can exist per scheduler at a time. +-- When a run completes (completed/failed/partially_failed), the index no longer blocks new runs. + +-- Clean up duplicate active runs before creating the unique index. +-- Keeps the most recent active run per scheduler, marks others as failed. 
+UPDATE scheduler_runs +SET status = 'failed', error = 'Cleaned up by migration 006', completed_at = NOW() +WHERE id NOT IN ( + SELECT DISTINCT ON (scheduler_id) id + FROM scheduler_runs + WHERE status IN ('pending', 'running') + ORDER BY scheduler_id, created_at DESC +) +AND status IN ('pending', 'running'); + +CREATE UNIQUE INDEX scheduler_runs_one_active_per_scheduler + ON scheduler_runs (scheduler_id) + WHERE status IN ('pending', 'running'); diff --git a/packages/supabase-connector/migrations/007_create_crawler_permissions.sql b/packages/supabase-connector/migrations/007_create_crawler_permissions.sql new file mode 100644 index 0000000..2a520a4 --- /dev/null +++ b/packages/supabase-connector/migrations/007_create_crawler_permissions.sql @@ -0,0 +1,26 @@ +-- Migration: Crawler permission system +-- Controls who can use a crawler in their scheduler stages. +-- Extensible for marketplace subscriptions. + +CREATE TYPE crawler_permission_level AS ENUM ('owner', 'subscriber'); + +CREATE TABLE crawler_permissions ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + crawler_id UUID NOT NULL REFERENCES crawlers(id) ON DELETE CASCADE, + user_uuid UUID NOT NULL REFERENCES users(uuid) ON DELETE CASCADE, + level crawler_permission_level NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE(crawler_id, user_uuid) +); + +CREATE INDEX crawler_permissions_user_uuid_index ON crawler_permissions(user_uuid); +CREATE INDEX crawler_permissions_crawler_id_index ON crawler_permissions(crawler_id); + +COMMENT ON TABLE crawler_permissions IS 'Controls access to crawlers. 
Owner = creator, subscriber = marketplace user.'; +COMMENT ON COLUMN crawler_permissions.level IS 'Permission level: owner (full control), subscriber (can use in stages)'; + +-- Backfill: grant owner permission to existing crawler creators +INSERT INTO crawler_permissions (crawler_id, user_uuid, level) +SELECT id, user_uuid, 'owner' +FROM crawlers +ON CONFLICT (crawler_id, user_uuid) DO NOTHING; diff --git a/packages/supabase-connector/migrations/008_add_fan_out_strategy.sql b/packages/supabase-connector/migrations/008_add_fan_out_strategy.sql new file mode 100644 index 0000000..4f8abcc --- /dev/null +++ b/packages/supabase-connector/migrations/008_add_fan_out_strategy.sql @@ -0,0 +1,8 @@ +-- Add fan_out_strategy column to scheduler_stages +-- 'compact' (default): remove failed items from results +-- 'preserve': keep failed items as null, preserving positional alignment + +CREATE TYPE fan_out_strategy AS ENUM ('compact', 'preserve'); + +ALTER TABLE scheduler_stages + ADD COLUMN fan_out_strategy fan_out_strategy NOT NULL DEFAULT 'compact'; diff --git a/packages/supabase-connector/sources/crawler-permissions.ts b/packages/supabase-connector/sources/crawler-permissions.ts new file mode 100644 index 0000000..1e5d6ab --- /dev/null +++ b/packages/supabase-connector/sources/crawler-permissions.ts @@ -0,0 +1,97 @@ +import type { SupabaseClient } from '@supabase/supabase-js'; +import { traceDatabaseOperation, SpanStatusCode } from '@audio-underview/axiom-logger/tracers'; +import type { + Database, + CrawlerPermissionRow, + CrawlerPermissionLevel, +} from './types/index.ts'; + +type SupabaseClientType = SupabaseClient<Database>; + +export async function createCrawlerPermission( + client: SupabaseClientType, + parameters: { + crawler_id: string; + user_uuid: string; + level: CrawlerPermissionLevel; + }, +): Promise<CrawlerPermissionRow> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'insert', table: 'crawler_permissions' }, + async (span) 
=> { + span.setAttribute('db.insert.crawler_id', parameters.crawler_id); + span.setAttribute('db.insert.user_uuid', parameters.user_uuid); + span.setAttribute('db.insert.level', parameters.level); + + const { data, error } = await client + .from('crawler_permissions') + .insert(parameters) + .select() + .single(); + + if (error) { + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to create crawler permission: ${error.message}`); + } + + span.setAttribute('db.rows_affected', 1); + return data as CrawlerPermissionRow; + }, + ); +} + +export async function getCrawlerPermission( + client: SupabaseClientType, + crawlerID: string, + userUUID: string, +): Promise<CrawlerPermissionRow | undefined> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'select', table: 'crawler_permissions' }, + async (span) => { + span.setAttribute('db.query.crawler_id', crawlerID); + span.setAttribute('db.query.user_uuid', userUUID); + + const { data, error } = await client + .from('crawler_permissions') + .select() + .eq('crawler_id', crawlerID) + .eq('user_uuid', userUUID) + .maybeSingle(); + + if (error) { + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to get crawler permission: ${error.message}`); + } + + span.setAttribute('db.rows_affected', data === null ? 0 : 1); + return (data as CrawlerPermissionRow | null) ?? 
undefined; + }, + ); +} + +export async function deleteCrawlerPermission( + client: SupabaseClientType, + crawlerID: string, + userUUID: string, +): Promise<void> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'delete', table: 'crawler_permissions' }, + async (span) => { + span.setAttribute('db.query.crawler_id', crawlerID); + span.setAttribute('db.query.user_uuid', userUUID); + + const { error } = await client + .from('crawler_permissions') + .delete() + .eq('crawler_id', crawlerID) + .eq('user_uuid', userUUID); + + if (error) { + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to delete crawler permission: ${error.message}`); + } + + span.setAttribute('db.rows_affected', 1); + }, + ); +} diff --git a/packages/supabase-connector/sources/crawlers.ts b/packages/supabase-connector/sources/crawlers.ts index 2d3b030..6dd7546 100644 --- a/packages/supabase-connector/sources/crawlers.ts +++ b/packages/supabase-connector/sources/crawlers.ts @@ -92,6 +92,44 @@ export async function listCrawlersByUser( ); } +/** + * Gets a single crawler by ID without ownership check. + * Used by the execution engine where the crawler may belong to any user. 
+ * + * @param client - Supabase client + * @param id - Crawler ID + * @returns Crawler row if found, null otherwise + */ +export async function getCrawlerByID( + client: SupabaseClientType, + id: string, +): Promise<CrawlerRow | null> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'select', table: 'crawlers' }, + async (span) => { + span.setAttribute('db.query.id', id); + + const { data, error } = await client + .from('crawlers') + .select('*') + .eq('id', id) + .single(); + + if (error) { + if (error.code === 'PGRST116') { + span.setAttribute('db.rows_affected', 0); + return null; + } + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to get crawler by ID: ${error.message}`); + } + + span.setAttribute('db.rows_affected', 1); + return data as CrawlerRow; + } + ); +} + /** * Gets a single crawler by ID, verifying ownership. * diff --git a/packages/supabase-connector/sources/index.ts b/packages/supabase-connector/sources/index.ts index 592ac59..e7a4477 100644 --- a/packages/supabase-connector/sources/index.ts +++ b/packages/supabase-connector/sources/index.ts @@ -7,6 +7,7 @@ export type { AccountRow, CrawlerRow, SchedulerRow, + FanOutStrategy, SchedulerStageRow, SchedulerRunRow, SchedulerStageRunRow, @@ -25,6 +26,8 @@ export type { SocialLoginInput, SocialLoginResult, LinkAccountResult, + CrawlerPermissionLevel, + CrawlerPermissionRow, SupabaseConnectorConfiguration, Database, } from './types/index.ts'; @@ -52,10 +55,18 @@ export { createCrawler, listCrawlersByUser, getCrawler, + getCrawlerByID, updateCrawler, deleteCrawler, } from './crawlers.ts'; +// Crawler permission operations +export { + createCrawlerPermission, + getCrawlerPermission, + deleteCrawlerPermission, +} from './crawler-permissions.ts'; + // Scheduler operations export type { PaginatedSchedulers } from './schedulers.ts'; export { @@ -84,3 +95,10 @@ export { updateSchedulerRun, listSchedulerRuns, } from 
'./scheduler-runs.ts'; + +// Scheduler stage run operations +export { + createSchedulerStageRun, + updateSchedulerStageRun, + listSchedulerStageRunsByRun, +} from './scheduler-stage-runs.ts'; diff --git a/packages/supabase-connector/sources/scheduler-runs.ts b/packages/supabase-connector/sources/scheduler-runs.ts index 2e98261..399c48e 100644 --- a/packages/supabase-connector/sources/scheduler-runs.ts +++ b/packages/supabase-connector/sources/scheduler-runs.ts @@ -2,6 +2,7 @@ import type { SupabaseClient } from '@supabase/supabase-js'; import { traceDatabaseOperation, SpanStatusCode } from '@audio-underview/axiom-logger/tracers'; import type { Database, + SchedulerRunStatus, SchedulerRunRow, SchedulerRunsInsert, SchedulerRunsUpdate, @@ -79,6 +80,7 @@ export async function updateSchedulerRun( id: string, schedulerID: string, input: SchedulerRunsUpdate, + options?: { onlyIfStatus?: readonly SchedulerRunStatus[] }, ): Promise<SchedulerRunRow | null> { return traceDatabaseOperation( { serviceName: 'supabase-connector', operation: 'update', table: 'scheduler_runs' }, @@ -86,11 +88,17 @@ export async function updateSchedulerRun( span.setAttribute('db.update.id', id); span.setAttribute('db.update.scheduler_id', schedulerID); - const { data, error } = await client + let query = client .from('scheduler_runs') .update(input) .eq('id', id) - .eq('scheduler_id', schedulerID) + .eq('scheduler_id', schedulerID); + + if (options?.onlyIfStatus !== undefined) { + query = query.in('status', options.onlyIfStatus); + } + + const { data, error } = await query .select() .single(); diff --git a/packages/supabase-connector/sources/scheduler-stage-runs.ts b/packages/supabase-connector/sources/scheduler-stage-runs.ts new file mode 100644 index 0000000..85756fc --- /dev/null +++ b/packages/supabase-connector/sources/scheduler-stage-runs.ts @@ -0,0 +1,101 @@ +import type { SupabaseClient } from '@supabase/supabase-js'; +import { traceDatabaseOperation, SpanStatusCode } from 
'@audio-underview/axiom-logger/tracers'; +import type { + Database, + SchedulerStageRunRow, + SchedulerStageRunsInsert, + SchedulerStageRunsUpdate, +} from './types/index.ts'; + +type SupabaseClientType = SupabaseClient<Database>; + +export async function createSchedulerStageRun( + client: SupabaseClientType, + input: SchedulerStageRunsInsert, +): Promise<SchedulerStageRunRow> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'insert', table: 'scheduler_stage_runs' }, + async (span) => { + span.setAttribute('db.insert.run_id', input.run_id); + span.setAttribute('db.insert.stage_id', input.stage_id); + span.setAttribute('db.insert.stage_order', input.stage_order); + + const { data, error } = await client + .from('scheduler_stage_runs') + .insert(input) + .select() + .single(); + + if (error) { + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to create scheduler stage run: ${error.message}`); + } + + span.setAttribute('db.rows_affected', 1); + span.setAttribute('db.created_id', (data as SchedulerStageRunRow).id); + return data as SchedulerStageRunRow; + }, + ); +} + +export async function updateSchedulerStageRun( + client: SupabaseClientType, + id: string, + runID: string, + input: SchedulerStageRunsUpdate, +): Promise<SchedulerStageRunRow | null> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'update', table: 'scheduler_stage_runs' }, + async (span) => { + span.setAttribute('db.update.id', id); + span.setAttribute('db.update.run_id', runID); + + const { data, error } = await client + .from('scheduler_stage_runs') + .update(input) + .eq('id', id) + .eq('run_id', runID) + .select() + .single(); + + if (error) { + if (error.code === 'PGRST116') { + span.setAttribute('db.rows_affected', 0); + return null; + } + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to update scheduler stage run: 
${error.message}`); + } + + span.setAttribute('db.rows_affected', 1); + return data as SchedulerStageRunRow; + }, + ); +} + +export async function listSchedulerStageRunsByRun( + client: SupabaseClientType, + runID: string, +): Promise<SchedulerStageRunRow[]> { + return traceDatabaseOperation( + { serviceName: 'supabase-connector', operation: 'select', table: 'scheduler_stage_runs' }, + async (span) => { + span.setAttribute('db.query.run_id', runID); + + const { data, error } = await client + .from('scheduler_stage_runs') + .select('*') + .eq('run_id', runID) + .order('stage_order', { ascending: true }); + + if (error) { + span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); + throw new Error(`Failed to list scheduler stage runs: ${error.message}`); + } + + const stageRuns = (data ?? []) as SchedulerStageRunRow[]; + span.setAttribute('db.rows_affected', stageRuns.length); + return stageRuns; + }, + ); +} diff --git a/packages/supabase-connector/sources/types/database.ts b/packages/supabase-connector/sources/types/database.ts index 6eaaeaa..0aba278 100644 --- a/packages/supabase-connector/sources/types/database.ts +++ b/packages/supabase-connector/sources/types/database.ts @@ -97,7 +97,12 @@ export interface SchedulerRow { * (e.g. { url: { type: "string", default: "https://..." } }). * output_schema is derived from the crawler's output_schema. * fan_out_field names the array field in previous output to fan-out over. 
+ * fan_out_strategy controls how failed items are handled: + * 'compact' (default) — remove failed items from results + * 'preserve' — keep failed items as null, preserving positional alignment */ +export type FanOutStrategy = 'compact' | 'preserve'; + export interface SchedulerStageRow { [key: string]: unknown; id: string; @@ -107,6 +112,7 @@ export interface SchedulerStageRow { input_schema: Record<string, unknown>; output_schema: Record<string, unknown>; fan_out_field: string | null; + fan_out_strategy: FanOutStrategy; created_at: string; } @@ -126,7 +132,7 @@ export interface SchedulerRunRow { status: SchedulerRunStatus; started_at: string | null; completed_at: string | null; - result: Record<string, unknown> | null; + result: unknown; error: string | null; created_at: string; } @@ -144,8 +150,8 @@ export interface SchedulerStageRunRow { status: SchedulerRunStatus; started_at: string | null; completed_at: string | null; - input: Record<string, unknown> | null; - output: Record<string, unknown> | null; + input: unknown; + output: unknown; error: string | null; items_total: number | null; items_succeeded: number | null; @@ -153,6 +159,24 @@ export interface SchedulerStageRunRow { created_at: string; } +/** + * Crawler permission level type + */ +export type CrawlerPermissionLevel = 'owner' | 'subscriber'; + +/** + * Crawler permission table row type + * Controls who can use a crawler in their scheduler stages. 
+ */ +export interface CrawlerPermissionRow { + [key: string]: unknown; + id: string; + crawler_id: string; + user_uuid: string; + level: CrawlerPermissionLevel; + created_at: string; +} + /** * Supabase connector configuration */ @@ -249,6 +273,7 @@ export interface Database { input_schema: Record<string, unknown>; output_schema?: Record<string, unknown>; fan_out_field?: string | null; + fan_out_strategy?: FanOutStrategy; }; Update: Partial<Omit<SchedulerStageRow, 'id' | 'scheduler_id' | 'created_at'>>; Relationships: [ @@ -277,7 +302,7 @@ export interface Database { status?: SchedulerRunStatus; started_at?: string | null; completed_at?: string | null; - result?: Record<string, unknown> | null; + result?: unknown; error?: string | null; }; Update: Partial<Omit<SchedulerRunRow, 'id' | 'scheduler_id' | 'created_at'>>; @@ -302,8 +327,8 @@ export interface Database { status?: SchedulerRunStatus; started_at?: string | null; completed_at?: string | null; - input?: Record<string, unknown> | null; - output?: Record<string, unknown> | null; + input?: unknown; + output?: unknown; error?: string | null; items_total?: number | null; items_succeeded?: number | null; @@ -327,6 +352,33 @@ export interface Database { }, ]; }; + crawler_permissions: { + Row: CrawlerPermissionRow; + Insert: { + [key: string]: unknown; + id?: string; + crawler_id: string; + user_uuid: string; + level: CrawlerPermissionLevel; + }; + Update: Partial<Omit<CrawlerPermissionRow, 'id' | 'created_at'>>; + Relationships: [ + { + foreignKeyName: 'crawler_permissions_crawler_id_fkey'; + columns: ['crawler_id']; + isOneToOne: false; + referencedRelation: 'crawlers'; + referencedColumns: ['id']; + }, + { + foreignKeyName: 'crawler_permissions_user_uuid_fkey'; + columns: ['user_uuid']; + isOneToOne: false; + referencedRelation: 'users'; + referencedColumns: ['uuid']; + }, + ]; + }; }; Views: Record<string, never>; Functions: { @@ -342,6 +394,7 @@ export interface Database { provider_type: ProviderType; 
crawler_type: CrawlerType; scheduler_run_status: SchedulerRunStatus; + crawler_permission_level: CrawlerPermissionLevel; }; CompositeTypes: Record<string, never>; }; diff --git a/packages/supabase-connector/sources/types/index.ts b/packages/supabase-connector/sources/types/index.ts index 8f28dcd..c6dc7a2 100644 --- a/packages/supabase-connector/sources/types/index.ts +++ b/packages/supabase-connector/sources/types/index.ts @@ -6,6 +6,7 @@ export type { AccountRow, CrawlerRow, SchedulerRow, + FanOutStrategy, SchedulerStageRow, SchedulerRunRow, SchedulerStageRunRow, @@ -24,6 +25,8 @@ export type { SocialLoginInput, SocialLoginResult, LinkAccountResult, + CrawlerPermissionLevel, + CrawlerPermissionRow, SupabaseConnectorConfiguration, Database, } from './database.ts'; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a950cda..043d55a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -496,6 +496,9 @@ importers: '@audio-underview/worker-tools': specifier: workspace:* version: link:../tools + safe-regex2: + specifier: ^5.1.0 + version: 5.1.0 devDependencies: '@cloudflare/vitest-pool-workers': specifier: catalog:worker @@ -4164,6 +4167,10 @@ packages: engines: {node: '>= 0.4'} hasBin: true + ret@0.5.0: + resolution: {integrity: sha512-I1XxrZSQ+oErkRR4jYbAyEEu2I0avBvvMM5JN+6EBprOGRCs63ENqZ3vjavq8fBw2+62G5LF5XelKwuJpcvcxw==} + engines: {node: '>=10'} + rettime@0.10.1: resolution: {integrity: sha512-uyDrIlUEH37cinabq0AX4QbgV4HbFZ/gqoiunWQ1UqBtRvTTytwhNYjE++pO/MjPTZL5KQCf2bEoJ/BJNVQ5Kw==} @@ -4176,6 +4183,10 @@ packages: resolution: {integrity: sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==} engines: {node: '>=18'} + safe-regex2@5.1.0: + resolution: {integrity: sha512-pNHAuBW7TrcleFHsxBr5QMi/Iyp0ENjUKz7GCcX1UO7cMh+NmVK6HxQckNL1tJp1XAJVjG6B8OKIPqodqj9rtw==} + hasBin: true + sass@1.97.3: resolution: {integrity: sha512-fDz1zJpd5GycprAbu4Q2PV/RprsRtKC/0z82z0JLgdytmcq0+ujJbJ/09bPGDxCLkKY3Np5cRAOcWiVkLXJURg==} engines: {node: 
'>=14.0.0'} @@ -8181,6 +8192,8 @@ snapshots: path-parse: 1.0.7 supports-preserve-symlinks-flag: 1.0.0 + ret@0.5.0: {} + rettime@0.10.1: {} rollup@4.53.5: @@ -8213,6 +8226,10 @@ snapshots: run-applescript@7.1.0: {} + safe-regex2@5.1.0: + dependencies: + ret: 0.5.0 + sass@1.97.3: dependencies: chokidar: 4.0.3 diff --git a/workers/crawler-code-runner-worker/sources/create-code-runner.ts b/workers/crawler-code-runner-worker/sources/create-code-runner.ts index 3c5c00c..768a966 100644 --- a/workers/crawler-code-runner-worker/sources/create-code-runner.ts +++ b/workers/crawler-code-runner-worker/sources/create-code-runner.ts @@ -1,5 +1,5 @@ export interface CodeRunner { - execute(data: string): Promise<unknown>; + execute(data: unknown): Promise<unknown>; } export const MAX_CODE_LENGTH = 10_000; diff --git a/workers/crawler-code-runner-worker/sources/index.ts b/workers/crawler-code-runner-worker/sources/index.ts index ca5f783..80d768c 100644 --- a/workers/crawler-code-runner-worker/sources/index.ts +++ b/workers/crawler-code-runner-worker/sources/index.ts @@ -12,13 +12,24 @@ interface Environment { LOADER: WorkerLoader; } -interface RunRequestBody { - type: 'test' | 'run'; +interface WebRunRequestBody { + type: 'web'; + mode: 'test' | 'run'; url: string; code: string; } +interface DataRunRequestBody { + type: 'data'; + mode: 'test' | 'run'; + data: unknown; + code: string; +} + +type RunRequestBody = WebRunRequestBody | DataRunRequestBody; + const FETCH_TIMEOUT_MILLISECONDS = 10_000; +const MAX_RESPONSE_BYTES = 10 * 1024 * 1024; // 10MB const logger = createWorkerLogger({ defaultContext: { @@ -34,80 +45,150 @@ const HELP = { { method: 'POST', path: '/run', - description: 'Fetch a URL and run code against the response body', + description: 'Run code against a fetched URL response (web) or provided data (data)', body: { - type: "'test' | 'run'", - url: 'string - The URL to fetch', - code: 'string - JavaScript function source to execute against the fetched response body', + 
type: "'web' | 'data'", + mode: "'test' | 'run'", + url: "string - The URL to fetch (required for type 'web')", + data: "unknown - The data to process (required for type 'data')", + code: 'string - JavaScript function source to execute against the input', }, }, ], }; +function validateRunRequestBody(raw: unknown): RunRequestBody | string { + if (raw == null || typeof raw !== 'object') { + return 'Request body must be a JSON object'; + } + const object = raw as Record<string, unknown>; + + if (object.type !== 'web' && object.type !== 'data') { + return "Field 'type' must be 'web' or 'data'"; + } + + if (object.mode !== 'test' && object.mode !== 'run') { + return "Field 'mode' must be 'test' or 'run'"; + } + + if (typeof object.code !== 'string') { + return "Field 'code' is required and must be a string"; + } + + if (object.type === 'web') { + if (typeof object.url !== 'string') { + return "Field 'url' is required and must be a string when type is 'web'"; + } + return { + type: 'web', + mode: object.mode as 'test' | 'run', + url: object.url as string, + code: object.code as string, + }; + } + + // type === 'data' + if (!('data' in object)) { + return "Field 'data' is required when type is 'data'"; + } + return { + type: 'data', + mode: object.mode as 'test' | 'run', + data: object.data, + code: object.code as string, + }; +} + async function handleRun( request: Request, environment: Environment, context: ResponseContext, ): Promise<Response> { - let body: RunRequestBody; + let raw: unknown; try { - body = await request.json() as RunRequestBody; + raw = await request.json(); } catch { return errorResponse('invalid_request', 'Request body must be valid JSON', 400, context); } - if (!body.type || (body.type !== 'test' && body.type !== 'run')) { - return errorResponse('invalid_request', "Field 'type' must be 'test' or 'run'", 400, context); - } - - if (!body.url || typeof body.url !== 'string') { - return errorResponse('invalid_request', "Field 'url' is required and must 
be a string", 400, context); + const validated = validateRunRequestBody(raw); + if (typeof validated === 'string') { + return errorResponse('invalid_request', validated, 400, context); } - if (!body.code || typeof body.code !== 'string') { - return errorResponse('invalid_request', "Field 'code' is required and must be a string", 400, context); - } + const parsed = validated; - if (body.code.length > MAX_CODE_LENGTH) { + if (parsed.code.length > MAX_CODE_LENGTH) { return errorResponse('invalid_request', `Field 'code' exceeds maximum length of ${MAX_CODE_LENGTH} characters`, 400, context); } - let targetURL: URL; - try { - targetURL = new URL(body.url); - } catch { - return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context); - } + if (parsed.type === 'web') { + let targetURL: URL; + try { + targetURL = new URL(parsed.url); + } catch { + return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context); + } - let responseText: string; - try { - const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS); - logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' }); - const fetchResponse = await fetch(targetURL.toString(), { signal }); - responseText = await fetchResponse.text(); - logger.info('Target URL fetched', { - status: fetchResponse.status, - contentLength: responseText.length, - }, { function: 'handleRun' }); - } catch (fetchError) { - if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') { - logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' }); - return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context); + let responseText: string; + try { + const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS); + logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' }); + const 
fetchResponse = await fetch(targetURL.toString(), { signal }); + + const contentLength = fetchResponse.headers.get('Content-Length'); + if (contentLength !== null && Number(contentLength) > MAX_RESPONSE_BYTES) { + logger.warn('Response too large', { url: targetURL.toString(), contentLength }, { function: 'handleRun' }); + return errorResponse('response_too_large', `Response exceeds maximum size of ${MAX_RESPONSE_BYTES} bytes`, 413, context); + } + + responseText = await fetchResponse.text(); + logger.info('Target URL fetched', { + status: fetchResponse.status, + contentLength: responseText.length, + }, { function: 'handleRun' }); + } catch (fetchError) { + if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') { + logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' }); + return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context); + } + logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' }); + return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context); + } + + let result: unknown; + try { + const runner = createCodeRunner(environment.LOADER, parsed.code); + result = await runner.execute(responseText); + } catch (executionError) { + logger.error('Code execution failed', executionError, { function: 'handleRun' }); + return errorResponse('execution_failed', 'Code execution failed', 422, context); } - logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' }); - return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context); + + // Normalize undefined to null to prevent JSON.stringify field drop + if (result === undefined) { + result = null; + } + + return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context); } + // type 
=== 'data' let result: unknown; try { - const runner = createCodeRunner(environment.LOADER, body.code); - result = await runner.execute(responseText); + const runner = createCodeRunner(environment.LOADER, parsed.code); + result = await runner.execute(parsed.data); } catch (executionError) { logger.error('Code execution failed', executionError, { function: 'handleRun' }); return errorResponse('execution_failed', 'Code execution failed', 422, context); } - return jsonResponse({ type: body.type, result }, 200, context); + // Normalize undefined to null to prevent JSON.stringify field drop + if (result === undefined) { + result = null; + } + + return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context); } export default { diff --git a/workers/crawler-code-runner-worker/tests/index.test.ts b/workers/crawler-code-runner-worker/tests/index.test.ts index b343f4a..6a04715 100644 --- a/workers/crawler-code-runner-worker/tests/index.test.ts +++ b/workers/crawler-code-runner-worker/tests/index.test.ts @@ -98,7 +98,7 @@ describe('crawler-code-runner-worker', () => { const request = new Request(`${WORKER_URL}/run`, { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, - body: JSON.stringify({ url: 'https://example.com', code: '(x) => x' }), + body: JSON.stringify({ mode: 'test', url: 'https://example.com', code: '(x) => x' }), }); const response = await worker.fetch(request, env); @@ -112,20 +112,49 @@ describe('crawler-code-runner-worker', () => { const request = new Request(`${WORKER_URL}/run`, { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, - body: JSON.stringify({ type: 'invalid', url: 'https://example.com', code: '(x) => x' }), + body: JSON.stringify({ type: 'invalid', mode: 'test', url: 'https://example.com', code: '(x) => x' }), }); const response = await worker.fetch(request, env); expect(response.status).toBe(400); const body = await response.json(); 
expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('type'); }); - it('returns 400 for missing url field', async () => { + it('returns 400 for missing mode field', async () => { const request = new Request(`${WORKER_URL}/run`, { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, - body: JSON.stringify({ type: 'run', code: '(x) => x' }), + body: JSON.stringify({ type: 'web', url: 'https://example.com', code: '(x) => x' }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(400); + const body = await response.json(); + expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('mode'); + }); + + it('returns 400 for invalid mode value', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ type: 'web', mode: 'invalid', url: 'https://example.com', code: '(x) => x' }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(400); + const body = await response.json(); + expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('mode'); + }); + + it('returns 400 for missing url field when type is web', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ type: 'web', mode: 'run', code: '(x) => x' }), }); const response = await worker.fetch(request, env); @@ -135,11 +164,25 @@ describe('crawler-code-runner-worker', () => { expect(body.error_description).toContain('url'); }); + it('returns 400 for missing data field when type is data', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 
'application/json' }, + body: JSON.stringify({ type: 'data', mode: 'run', code: '(x) => x' }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(400); + const body = await response.json(); + expect(body.error).toBe('invalid_request'); + expect(body.error_description).toContain('data'); + }); + it('returns 400 for missing code field', async () => { const request = new Request(`${WORKER_URL}/run`, { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, - body: JSON.stringify({ type: 'run', url: 'https://example.com' }), + body: JSON.stringify({ type: 'web', mode: 'run', url: 'https://example.com' }), }); const response = await worker.fetch(request, env); @@ -154,7 +197,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: 'x'.repeat(MAX_CODE_LENGTH + 1), }), @@ -171,7 +215,7 @@ describe('crawler-code-runner-worker', () => { const request = new Request(`${WORKER_URL}/run`, { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, - body: JSON.stringify({ type: 'run', url: 'not-a-url', code: '(x) => x' }), + body: JSON.stringify({ type: 'web', mode: 'run', url: 'not-a-url', code: '(x) => x' }), }); const response = await worker.fetch(request, env); @@ -182,7 +226,7 @@ describe('crawler-code-runner-worker', () => { }); }); - describe('POST /run fetch failures', () => { + describe('POST /run web type - fetch failures', () => { it('returns 502 when target URL fetch fails', async () => { fetchMock .get('https://target.example.com') @@ -193,7 +237,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + 
mode: 'run', url: 'https://target.example.com/data', code: '(text) => text', }), @@ -206,7 +251,7 @@ describe('crawler-code-runner-worker', () => { }); }); - describe('POST /run execution failures', () => { + describe('POST /run web type - execution failures', () => { it('returns 422 when code execution throws an error', async () => { fetchMock .get('https://target.example.com') @@ -217,7 +262,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: '(text) => { throw new Error("intentional error"); }', }), @@ -240,7 +286,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: '((( invalid syntax', }), @@ -253,8 +300,8 @@ describe('crawler-code-runner-worker', () => { }); }); - describe('POST /run successful execution', () => { - it('executes code and returns result with type test', async () => { + describe('POST /run web type - successful execution', () => { + it('executes code and returns result with mode test', async () => { fetchMock .get('https://target.example.com') .intercept({ path: '/data', method: 'GET' }) @@ -264,7 +311,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'test', + type: 'web', + mode: 'test', url: 'https://target.example.com/data', code: '(text) => text.toUpperCase()', }), @@ -273,11 +321,12 @@ describe('crawler-code-runner-worker', () => { expect(response.status).toBe(200); const body = await response.json(); - expect(body.type).toBe('test'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('test'); 
expect(body.result).toBe('HELLO WORLD'); }); - it('executes code and returns result with type run', async () => { + it('executes code and returns result with mode run', async () => { fetchMock .get('https://target.example.com') .intercept({ path: '/data', method: 'GET' }) @@ -287,7 +336,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: '(text) => text.length', }), @@ -296,7 +346,8 @@ describe('crawler-code-runner-worker', () => { expect(response.status).toBe(200); const body = await response.json(); - expect(body.type).toBe('run'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('run'); expect(body.result).toBe(11); }); @@ -310,7 +361,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/page', code: '(text) => ({ length: text.length, hasTitle: text.includes("<title>") })', }), @@ -319,7 +371,8 @@ describe('crawler-code-runner-worker', () => { expect(response.status).toBe(200); const body = await response.json(); - expect(body.type).toBe('run'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('run'); expect(body.result.length).toBe(43); expect(body.result.hasTitle).toBe(true); }); @@ -334,7 +387,8 @@ describe('crawler-code-runner-worker', () => { method: 'POST', headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, body: JSON.stringify({ - type: 'run', + type: 'web', + mode: 'run', url: 'https://target.example.com/data', code: 'async (text) => text.split(" ")', }), @@ -343,11 +397,132 @@ describe('crawler-code-runner-worker', () => { expect(response.status).toBe(200); const body = await response.json(); - 
expect(body.type).toBe('run'); + expect(body.type).toBe('web'); + expect(body.mode).toBe('run'); expect(body.result).toEqual(['async', 'test']); }); }); + describe('POST /run data type - successful execution', () => { + it('executes code against provided data object', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: { items: [1, 2, 3] }, + code: '(data) => data.items.length', + }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(200); + const body = await response.json(); + expect(body.type).toBe('data'); + expect(body.mode).toBe('run'); + expect(body.result).toBe(3); + }); + + it('executes code against provided data array', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ + type: 'data', + mode: 'test', + data: [10, 20, 30], + code: '(data) => data.map((x) => x * 2)', + }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(200); + const body = await response.json(); + expect(body.type).toBe('data'); + expect(body.mode).toBe('test'); + expect(body.result).toEqual([20, 40, 60]); + }); + + it('executes code against provided string data', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: 'hello world', + code: '(data) => data.toUpperCase()', + }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(200); + const body = await response.json(); + expect(body.type).toBe('data'); + expect(body.mode).toBe('run'); + expect(body.result).toBe('HELLO 
WORLD'); + }); + + it('executes code against null data', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: null, + code: '(data) => data === null ? "was null" : "not null"', + }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(200); + const body = await response.json(); + expect(body.type).toBe('data'); + expect(body.mode).toBe('run'); + expect(body.result).toBe('was null'); + }); + }); + + describe('POST /run data type - execution failures', () => { + it('returns 422 when code execution throws an error', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: { value: 1 }, + code: '(data) => { throw new Error("data processing error"); }', + }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(422); + const body = await response.json(); + expect(body.error).toBe('execution_failed'); + }); + + it('returns 422 when code is syntactically invalid', async () => { + const request = new Request(`${WORKER_URL}/run`, { + method: 'POST', + headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' }, + body: JSON.stringify({ + type: 'data', + mode: 'run', + data: 'test', + code: '((( invalid syntax', + }), + }); + const response = await worker.fetch(request, env); + + expect(response.status).toBe(422); + const body = await response.json(); + expect(body.error).toBe('execution_failed'); + }); + }); + describe('unknown routes', () => { it('returns 404 for unknown path', async () => { const request = new Request(`${WORKER_URL}/unknown`, { diff --git a/workers/crawler-manager-worker/package.json 
b/workers/crawler-manager-worker/package.json index 7ebde66..294f65f 100644 --- a/workers/crawler-manager-worker/package.json +++ b/workers/crawler-manager-worker/package.json @@ -13,8 +13,9 @@ "packageManager": "pnpm@10.26.1", "dependencies": { "@audio-underview/logger": "workspace:*", + "@audio-underview/supabase-connector": "workspace:*", "@audio-underview/worker-tools": "workspace:*", - "@audio-underview/supabase-connector": "workspace:*" + "safe-regex2": "^5.1.0" }, "devDependencies": { "@cloudflare/vitest-pool-workers": "catalog:worker", diff --git a/workers/crawler-manager-worker/sources/code-runner-client.ts b/workers/crawler-manager-worker/sources/code-runner-client.ts new file mode 100644 index 0000000..f8ab8ee --- /dev/null +++ b/workers/crawler-manager-worker/sources/code-runner-client.ts @@ -0,0 +1,178 @@ +export interface CodeRunnerResult { + type: 'web' | 'data'; + mode: 'test' | 'run'; + result: unknown; +} + +export function validateCodeRunnerResult(value: unknown): CodeRunnerResult { + if (value == null || typeof value !== 'object') { + throw new CodeRunnerExecutionError('invalid_response', 'Expected object from code-runner', 0); + } + const record = value as Record<string, unknown>; + if (record.type !== 'web' && record.type !== 'data') { + throw new CodeRunnerExecutionError('invalid_response', `Expected type 'web' or 'data', got '${String(record.type)}'`, 0); + } + if (record.mode !== 'test' && record.mode !== 'run') { + throw new CodeRunnerExecutionError('invalid_response', `Expected mode 'test' or 'run', got '${String(record.mode)}'`, 0); + } + if (!('result' in record)) { + throw new CodeRunnerExecutionError('invalid_response', 'Missing result field', 0); + } + return { type: record.type, mode: record.mode, result: record.result }; +} + +export interface CodeRunnerClient { + run( + type: 'web' | 'data', + url: string | undefined, + data: unknown | undefined, + code: string, + ): Promise<CodeRunnerResult>; +} + +export class 
CodeRunnerExecutionError extends Error { + readonly errorCode: string; + readonly errorDescription: string; + readonly statusCode: number; + + constructor(errorCode: string, errorDescription: string, statusCode: number) { + super(`CodeRunner error ${statusCode}: [${errorCode}] ${errorDescription}`); + this.name = 'CodeRunnerExecutionError'; + this.errorCode = errorCode; + this.errorDescription = errorDescription; + this.statusCode = statusCode; + } +} + +const MAX_RETRY_ATTEMPTS = 2; +const INITIAL_BACKOFF_MILLISECONDS = 1_000; +const REQUEST_TIMEOUT_MILLISECONDS = 30_000; + +interface CodeRunnerRequestBody { + type: 'web' | 'data'; + mode: 'run'; + url?: string; + data?: unknown; + code: string; +} + +interface CodeRunnerErrorResponse { + error_code?: string; + error_description?: string; +} + +function isRetryableStatusCode(statusCode: number): boolean { + return statusCode >= 500 && statusCode < 600; +} + +function buildRequestBody( + type: 'web' | 'data', + url: string | undefined, + data: unknown | undefined, + code: string, +): CodeRunnerRequestBody { + if (type === 'web') { + return { type: 'web', mode: 'run', url, code }; + } + return { type: 'data', mode: 'run', data, code }; +} + +async function delay(milliseconds: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, milliseconds)); +} + +export class HTTPCodeRunnerClient implements CodeRunnerClient { + private readonly baseURL: string; + + constructor(baseURL: string) { + this.baseURL = baseURL.replace(/\/+$/, ''); + } + + async run( + type: 'web' | 'data', + url: string | undefined, + data: unknown | undefined, + code: string, + ): Promise<CodeRunnerResult> { + const requestBody = buildRequestBody(type, url, data, code); + const endpoint = `${this.baseURL}/run`; + + let lastError: unknown; + + for (let attempt = 0; attempt <= MAX_RETRY_ATTEMPTS; attempt++) { + if (attempt > 0) { + const backoffMilliseconds = INITIAL_BACKOFF_MILLISECONDS * Math.pow(2, attempt - 1); + await 
delay(backoffMilliseconds); + } + + let response: Response; + try { + response = await fetch(endpoint, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(requestBody), + signal: AbortSignal.timeout(REQUEST_TIMEOUT_MILLISECONDS), + }); + } catch (error: unknown) { + lastError = error; + if (attempt < MAX_RETRY_ATTEMPTS) { + continue; + } + throw new CodeRunnerExecutionError( + 'network_error', + error instanceof Error ? error.message : 'Unknown network error', + 0, + ); + } + + if (response.ok) { + try { + const result = validateCodeRunnerResult(await response.json()); + return result; + } catch (error: unknown) { + throw new CodeRunnerExecutionError( + 'invalid_response', + error instanceof Error ? error.message : 'Failed to parse code runner response', + response.status, + ); + } + } + + if (isRetryableStatusCode(response.status)) { + lastError = new CodeRunnerExecutionError( + 'server_error', + `Server returned ${response.status}`, + response.status, + ); + if (attempt < MAX_RETRY_ATTEMPTS) { + continue; + } + throw lastError; + } + + // 4xx errors fail immediately (user code error) — no retry + let errorCode = 'execution_error'; + let errorDescription = `Code runner returned HTTP ${response.status}`; + + try { + const errorBody = (await response.json()) as CodeRunnerErrorResponse; + errorCode = errorBody.error_code ?? errorCode; + errorDescription = errorBody.error_description ?? errorDescription; + } catch { + // Response body is not valid JSON; use defaults + } + + throw new CodeRunnerExecutionError(errorCode, errorDescription, response.status); + } + + // Exhausted all retry attempts — throw the last captured error + if (lastError instanceof CodeRunnerExecutionError) { + throw lastError; + } + throw new CodeRunnerExecutionError( + 'network_error', + lastError instanceof Error ? 
lastError.message : 'Unknown error after retries', + 0, + ); + } +} diff --git a/workers/crawler-manager-worker/sources/crawler-executor.ts b/workers/crawler-manager-worker/sources/crawler-executor.ts new file mode 100644 index 0000000..30a1c27 --- /dev/null +++ b/workers/crawler-manager-worker/sources/crawler-executor.ts @@ -0,0 +1,79 @@ +import type { Logger } from '@audio-underview/logger'; +import type { CrawlerRow } from '@audio-underview/supabase-connector'; +import type { CrawlerExecuteResult } from '@audio-underview/worker-tools'; +import type { CodeRunnerClient } from './code-runner-client.ts'; +import { isSafeURLPattern } from './safe-url-pattern.ts'; + +export type { CrawlerExecuteResult }; + +function resolveURL( + input: unknown, + crawler: CrawlerRow, +): string | null { + // 1. input.url if present + if (input !== null && input !== undefined && typeof input === 'object' && 'url' in input) { + const url = (input as Record<string, unknown>).url; + if (typeof url === 'string' && url.length > 0) { + return url; + } + } + + // 2. input_schema url default + const inputSchema = crawler.input_schema; + if (inputSchema.url !== null && inputSchema.url !== undefined && typeof inputSchema.url === 'object' && 'default' in inputSchema.url) { + const defaultURL = (inputSchema.url as Record<string, unknown>).default; + if (typeof defaultURL === 'string' && defaultURL.length > 0) { + return defaultURL; + } + } + + return null; +} + +export async function executeCrawler( + codeRunnerClient: CodeRunnerClient, + crawler: CrawlerRow, + input: unknown, + logger: Logger, +): Promise<CrawlerExecuteResult> { + if (crawler.type === 'web') { + const url = resolveURL(input, crawler); + if (!url) { + throw new Error( + `Crawler ${crawler.id}: no URL available. 
Provide url in input or set a default in input_schema.`, + ); + } + + if (crawler.url_pattern) { + if (!isSafeURLPattern(crawler.url_pattern)) { + logger.warn('Skipping url_pattern validation: potential ReDoS pattern detected', { + urlPattern: crawler.url_pattern, + crawlerID: crawler.id, + }, { function: 'executeCrawler' }); + } else { + try { + const pattern = new RegExp(crawler.url_pattern); + if (!pattern.test(url)) { + logger.warn('URL does not match crawler url_pattern', { + url, + urlPattern: crawler.url_pattern, + crawlerID: crawler.id, + }, { function: 'executeCrawler' }); + } + } catch (error: unknown) { + logger.warn('Invalid url_pattern regex', { + urlPattern: crawler.url_pattern, + crawlerID: crawler.id, + error: error instanceof Error ? error.message : String(error), + }, { function: 'executeCrawler' }); + } + } + } + + const response = await codeRunnerClient.run('web', url, undefined, crawler.code); + return { type: 'web', result: response.result }; + } + + const response = await codeRunnerClient.run('data', undefined, input, crawler.code); + return { type: 'data', result: response.result }; +} diff --git a/workers/crawler-manager-worker/sources/index.ts b/workers/crawler-manager-worker/sources/index.ts index 5fc03fd..f0b4c80 100644 --- a/workers/crawler-manager-worker/sources/index.ts +++ b/workers/crawler-manager-worker/sources/index.ts @@ -1,3 +1,5 @@ +import { WorkerEntrypoint } from 'cloudflare:workers'; +import { isSafeURLPattern } from './safe-url-pattern.ts'; import { createWorkerLogger } from '@audio-underview/logger'; import { type ResponseContext, @@ -11,16 +13,24 @@ import { createCrawler, listCrawlersByUser, getCrawler, + getCrawlerByID, updateCrawler, deleteCrawler, + createCrawlerPermission, } from '@audio-underview/supabase-connector'; import { handleTokenExchange } from './token-exchange.ts'; +import { HTTPCodeRunnerClient } from './code-runner-client.ts'; +import { executeCrawler } from './crawler-executor.ts'; +import type { 
CrawlerExecuteResult } from './crawler-executor.ts'; + +export type { CrawlerExecuteResult }; interface Environment { ALLOWED_ORIGINS: string; SUPABASE_URL: string; SUPABASE_SECRET_KEY: string; JWT_SECRET: string; + CODE_RUNNER_FUNCTION_URL: string; } interface CreateCrawlerRequestBody { @@ -51,10 +61,6 @@ const HELP = { ], }; -function hasNestedQuantifiers(pattern: string): boolean { - // Detect patterns like (a+)+, (.*)*, (a{2,})+, etc. - return /(\([^)]*[+*][^)]*\))[+*]|\(\?:[^)]*[+*][^)]*\)[+*]/.test(pattern); -} async function validateCrawlerBody( request: Request, @@ -115,7 +121,7 @@ async function validateCrawlerBody( return errorResponse('invalid_request', `Field 'url_pattern' must not exceed ${MAX_URL_PATTERN_LENGTH} characters`, 400, context); } - if (hasNestedQuantifiers(body.url_pattern)) { + if (!isSafeURLPattern(body.url_pattern)) { return errorResponse('invalid_request', "Field 'url_pattern' contains potentially unsafe regex pattern", 400, context); } @@ -166,6 +172,12 @@ async function handleCreateCrawler( output_schema: body.output_schema, }); + await createCrawlerPermission(supabaseClient, { + crawler_id: crawler.id, + user_uuid: userUUID, + level: 'owner', + }); + return jsonResponse(crawler, 201, context); } @@ -293,8 +305,9 @@ function parseCrawlerID(pathname: string): string | null { return id; } -export default { - async fetch(request: Request, environment: Environment): Promise<Response> { +export default class CrawlerManagerWorker extends WorkerEntrypoint<Environment> { + async fetch(request: Request): Promise<Response> { + const environment = this.env; const url = new URL(request.url); const origin = request.headers.get('Origin') ?? ''; @@ -404,5 +417,27 @@ export default { logger.error('Unhandled worker error', error, { function: 'fetch' }); return errorResponse('server_error', 'An unexpected error occurred', 500, context); } - }, -}; + } + + // Service Binding RPC — called by scheduler-manager-worker only. 
+ // No user ownership check: binding declaration itself is the access control. + async executeCrawler(crawlerID: string, input: unknown): Promise<CrawlerExecuteResult> { + const rpcLogger = logger.createChild({ + function: 'executeCrawler', + metadata: { crawlerID }, + }); + + const supabaseClient = createSupabaseClient({ + supabaseURL: this.env.SUPABASE_URL, + supabaseSecretKey: this.env.SUPABASE_SECRET_KEY, + }); + + const crawler = await getCrawlerByID(supabaseClient, crawlerID); + if (!crawler) { + throw new Error(`Crawler ${crawlerID} not found`); + } + + const codeRunnerClient = new HTTPCodeRunnerClient(this.env.CODE_RUNNER_FUNCTION_URL); + return executeCrawler(codeRunnerClient, crawler, input, rpcLogger); + } +} diff --git a/workers/crawler-manager-worker/sources/safe-url-pattern.ts b/workers/crawler-manager-worker/sources/safe-url-pattern.ts new file mode 100644 index 0000000..c26ca3f --- /dev/null +++ b/workers/crawler-manager-worker/sources/safe-url-pattern.ts @@ -0,0 +1,5 @@ +import isSafeRegex from 'safe-regex2'; + +export function isSafeURLPattern(pattern: string): boolean { + return isSafeRegex(pattern); +} diff --git a/workers/crawler-manager-worker/tests/crawler-executor.test.ts b/workers/crawler-manager-worker/tests/crawler-executor.test.ts new file mode 100644 index 0000000..55e2074 --- /dev/null +++ b/workers/crawler-manager-worker/tests/crawler-executor.test.ts @@ -0,0 +1,205 @@ +import { describe, it, expect, vi } from 'vitest'; +import { executeCrawler } from '../sources/crawler-executor.ts'; +import { validateCodeRunnerResult } from '../sources/code-runner-client.ts'; +import type { CodeRunnerClient } from '../sources/code-runner-client.ts'; +import type { CrawlerRow } from '@audio-underview/supabase-connector'; + +function createMockCodeRunnerClient( + result: unknown = { extracted: 'data' }, +): CodeRunnerClient & { run: ReturnType<typeof vi.fn> } { + return { + run: vi.fn().mockResolvedValue({ type: 'web', mode: 'run', result }), + }; +} 
+ +function createMockLogger() { + return { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + createChild: vi.fn().mockReturnThis(), + } as any; +} + +function createMockCrawler(overrides: Partial<CrawlerRow> = {}): CrawlerRow { + return { + id: '00000000-0000-0000-0000-000000000001', + user_uuid: '00000000-0000-0000-0000-000000000002', + name: 'Test Crawler', + type: 'web', + url_pattern: '.*\\.example\\.com', + code: '(text) => ({ title: "test" })', + input_schema: { body: 'string' }, + output_schema: {}, + created_at: '2026-01-01T00:00:00Z', + updated_at: '2026-01-01T00:00:00Z', + ...overrides, + }; +} + +describe('executeCrawler', () => { + describe('web crawler', () => { + it('resolves URL from input.url and calls codeRunnerClient.run', async () => { + const client = createMockCodeRunnerClient({ title: 'Hello' }); + const logger = createMockLogger(); + const crawler = createMockCrawler({ code: '(text) => ({ title: text })' }); + const input = { url: 'https://www.example.com/page' }; + + const result = await executeCrawler(client, crawler, input, logger); + + expect(result).toEqual({ type: 'web', result: { title: 'Hello' } }); + expect(client.run).toHaveBeenCalledOnce(); + expect(client.run).toHaveBeenCalledWith( + 'web', + 'https://www.example.com/page', + undefined, + crawler.code, + ); + }); + + it('resolves URL from input_schema.url.default when input has no url', async () => { + const client = createMockCodeRunnerClient(); + const logger = createMockLogger(); + const crawler = createMockCrawler({ + input_schema: { + url: { default: 'https://fallback.example.com/default' }, + }, + }); + const input = {}; + + await executeCrawler(client, crawler, input, logger); + + expect(client.run).toHaveBeenCalledWith( + 'web', + 'https://fallback.example.com/default', + undefined, + crawler.code, + ); + }); + + it('throws when no URL is available', async () => { + const client = createMockCodeRunnerClient(); + const logger = createMockLogger(); + 
const crawler = createMockCrawler({ input_schema: {} }); + const input = {}; + + await expect(executeCrawler(client, crawler, input, logger)).rejects.toThrow( + /no URL available/, + ); + expect(client.run).not.toHaveBeenCalled(); + }); + + it('warns when URL does not match url_pattern but still executes', async () => { + const client = createMockCodeRunnerClient(); + const logger = createMockLogger(); + const crawler = createMockCrawler({ url_pattern: '^https://only\\.allowed\\.com' }); + const input = { url: 'https://different.com/page' }; + + const result = await executeCrawler(client, crawler, input, logger); + + expect(logger.warn).toHaveBeenCalledOnce(); + expect(logger.warn).toHaveBeenCalledWith( + 'URL does not match crawler url_pattern', + expect.objectContaining({ + url: 'https://different.com/page', + urlPattern: '^https://only\\.allowed\\.com', + crawlerID: crawler.id, + }), + { function: 'executeCrawler' }, + ); + expect(result.type).toBe('web'); + expect(client.run).toHaveBeenCalledOnce(); + }); + + it('skips url_pattern validation on unsafe regex and logs warning', async () => { + const client = createMockCodeRunnerClient(); + const logger = createMockLogger(); + const crawler = createMockCrawler({ url_pattern: '[invalid(' }); + const input = { url: 'https://www.example.com/page' }; + + const result = await executeCrawler(client, crawler, input, logger); + + expect(logger.warn).toHaveBeenCalledWith( + 'Skipping url_pattern validation: potential ReDoS pattern detected', + expect.objectContaining({ + urlPattern: '[invalid(', + crawlerID: crawler.id, + }), + expect.objectContaining({ function: 'executeCrawler' }), + ); + expect(result.type).toBe('web'); + expect(client.run).toHaveBeenCalledOnce(); + }); + }); + + describe('data crawler', () => { + it('calls codeRunnerClient.run with data type and input', async () => { + const client = createMockCodeRunnerClient({ processed: true }); + client.run.mockResolvedValue({ type: 'data', mode: 'run', result: { 
processed: true } }); + const logger = createMockLogger(); + const crawler = createMockCrawler({ type: 'data', url_pattern: null }); + const input = { items: [1, 2, 3] }; + + const result = await executeCrawler(client, crawler, input, logger); + + expect(result).toEqual({ type: 'data', result: { processed: true } }); + expect(client.run).toHaveBeenCalledOnce(); + expect(client.run).toHaveBeenCalledWith( + 'data', + undefined, + input, + crawler.code, + ); + }); + }); + + describe('error propagation', () => { + it('propagates errors from codeRunnerClient.run', async () => { + const client = createMockCodeRunnerClient(); + client.run.mockRejectedValue(new Error('Code execution failed')); + const logger = createMockLogger(); + const crawler = createMockCrawler(); + const input = { url: 'https://www.example.com/page' }; + + await expect(executeCrawler(client, crawler, input, logger)).rejects.toThrow( + 'Code execution failed', + ); + }); + }); +}); + +describe('validateCodeRunnerResult', () => { + it('accepts valid web result', () => { + const result = validateCodeRunnerResult({ type: 'web', mode: 'run', result: { data: 'ok' } }); + expect(result.type).toBe('web'); + expect(result.mode).toBe('run'); + expect(result.result).toEqual({ data: 'ok' }); + }); + + it('accepts valid result with null', () => { + const result = validateCodeRunnerResult({ type: 'data', mode: 'test', result: null }); + expect(result.type).toBe('data'); + expect(result.result).toBeNull(); + }); + + it('throws on null input', () => { + expect(() => validateCodeRunnerResult(null)).toThrow('Expected object from code-runner'); + }); + + it('throws on non-object input', () => { + expect(() => validateCodeRunnerResult('string')).toThrow('Expected object from code-runner'); + }); + + it('throws on invalid type', () => { + expect(() => validateCodeRunnerResult({ type: 'unknown', mode: 'run', result: {} })).toThrow("Expected type 'web' or 'data'"); + }); + + it('throws on invalid mode', () => { + expect(() 
=> validateCodeRunnerResult({ type: 'web', mode: 'unknown', result: {} })).toThrow("Expected mode 'test' or 'run'"); + }); + + it('throws on missing result field', () => { + expect(() => validateCodeRunnerResult({ type: 'web', mode: 'run' })).toThrow('Missing result field'); + }); +}); diff --git a/workers/crawler-manager-worker/tests/index.test.ts b/workers/crawler-manager-worker/tests/index.test.ts index ac6560e..cc36141 100644 --- a/workers/crawler-manager-worker/tests/index.test.ts +++ b/workers/crawler-manager-worker/tests/index.test.ts @@ -1,7 +1,6 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; -import { env, fetchMock } from 'cloudflare:test'; +import { env, fetchMock, SELF } from 'cloudflare:test'; import { signJWT } from '@audio-underview/worker-tools'; -import worker from '../sources/index.ts'; const WORKER_URL = 'https://worker.example.com'; const MOCK_USER_UUID = '00000000-0000-0000-0000-000000000001'; @@ -99,7 +98,7 @@ describe('crawler-manager-worker', () => { method: 'OPTIONS', headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(204); expect(response.headers.get('Access-Control-Allow-Origin')).toBe('https://example.com'); @@ -111,7 +110,7 @@ describe('crawler-manager-worker', () => { method: 'OPTIONS', headers: { Origin: 'https://unknown.example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(204); expect(response.headers.get('Access-Control-Allow-Origin')).toBeNull(); @@ -124,7 +123,7 @@ describe('crawler-manager-worker', () => { method: 'HEAD', headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); expect(response.headers.get('Content-Type')).toBe('application/json'); @@ -137,7 +136,7 
@@ describe('crawler-manager-worker', () => { const request = new Request(WORKER_URL, { headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -150,7 +149,7 @@ describe('crawler-manager-worker', () => { const request = new Request(`${WORKER_URL}/help`, { headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -161,7 +160,7 @@ describe('crawler-manager-worker', () => { const request = new Request(WORKER_URL, { headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); const body = await response.json(); const tokenEndpoint = body.endpoints.find((endpoint: { path: string }) => endpoint.path === '/authentication/token'); @@ -175,7 +174,7 @@ describe('crawler-manager-worker', () => { const request = new Request(`${WORKER_URL}/crawlers`, { headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(401); const body = await response.json(); @@ -189,7 +188,7 @@ describe('crawler-manager-worker', () => { Authorization: 'Bearer invalid-token', }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(401); const body = await response.json(); @@ -209,7 +208,7 @@ describe('crawler-manager-worker', () => { Authorization: `Bearer ${expiredToken}`, }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(401); const body = await response.json(); @@ -229,7 +228,7 @@ describe('crawler-manager-worker', () 
=> { Authorization: `Bearer ${wrongSecretToken}`, }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(401); }); @@ -255,7 +254,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ provider: 'google', access_token: 'valid-google-token' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -283,7 +282,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ provider: 'github', access_token: 'valid-github-token' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -300,7 +299,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ access_token: 'some-token' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -316,7 +315,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ provider: 'twitter', access_token: 'some-token' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); }); @@ -330,7 +329,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ provider: 'google' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); }); @@ -343,7 +342,7 @@ describe('crawler-manager-worker', () => { }, body: 'not json', }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); }); @@ -362,7 +361,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ provider: 
'google', access_token: 'bad-token' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(401); }); @@ -391,7 +390,7 @@ describe('crawler-manager-worker', () => { }, body: JSON.stringify({ provider: 'google', access_token: 'valid-but-unregistered' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(401); }); @@ -403,7 +402,7 @@ describe('crawler-manager-worker', () => { method: 'POST', body: 'not json', }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -417,7 +416,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ url_pattern: '.*', code: '(x) => x' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -430,7 +429,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: ' ', url_pattern: '.*', code: '(x) => x' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -443,7 +442,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'x'.repeat(256), url_pattern: '.*', code: '(x) => x' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -456,7 +455,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'test', url_pattern: '(((', 
code: '(x) => x' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -469,7 +468,7 @@ describe('crawler-manager-worker', () => { body: JSON.stringify({ name: 'test', url_pattern: '(a+)+', code: '(x) => x' }), headers: { 'Content-Type': 'application/json' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json() as Record<string, unknown>; expect(body.error_description).toContain('unsafe regex pattern'); @@ -481,7 +480,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'test', type: 'invalid', code: '(x) => x', url_pattern: '.*' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json() as Record<string, unknown>; expect(body.error_description).toContain('type'); @@ -493,7 +492,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'test', type: 'web', code: '(x) => x' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json() as Record<string, unknown>; expect(body.error_description).toContain('url_pattern'); @@ -505,7 +504,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'test', type: 'data', code: '(x) => x' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json() as Record<string, unknown>; expect(body.error_description).toContain('input_schema'); @@ 
-517,7 +516,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'test', url_pattern: '.*', code: '(x) => x', output_schema: 'invalid' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json() as Record<string, unknown>; expect(body.error_description).toContain('output_schema'); @@ -534,7 +533,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(crawlerData), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(201); const body = await response.json(); @@ -555,7 +554,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(crawlerData), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(201); const body = await response.json(); @@ -569,7 +568,7 @@ describe('crawler-manager-worker', () => { mockSupabaseCrawlerList(); const request = await authenticatedRequest('/crawlers'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -585,7 +584,7 @@ describe('crawler-manager-worker', () => { mockSupabaseCrawlerList(); const request = await authenticatedRequest('/crawlers?offset=0&limit=10'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -596,7 +595,7 @@ describe('crawler-manager-worker', () => { it('returns 400 for negative offset', async () => { const request = await authenticatedRequest('/crawlers?offset=-1'); - const response = await worker.fetch(request, 
env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -606,7 +605,7 @@ describe('crawler-manager-worker', () => { it('returns 400 for zero limit', async () => { const request = await authenticatedRequest('/crawlers?limit=0'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -616,7 +615,7 @@ describe('crawler-manager-worker', () => { it('returns 400 for limit exceeding maximum', async () => { const request = await authenticatedRequest('/crawlers?limit=101'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -626,7 +625,7 @@ describe('crawler-manager-worker', () => { it('returns 400 for non-numeric offset', async () => { const request = await authenticatedRequest('/crawlers?offset=abc'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -636,7 +635,7 @@ describe('crawler-manager-worker', () => { it('returns 400 for non-integer limit', async () => { const request = await authenticatedRequest('/crawlers?limit=1.5'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -650,7 +649,7 @@ describe('crawler-manager-worker', () => { mockSupabaseCrawlerGet(); const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -661,7 +660,7 @@ describe('crawler-manager-worker', () => { mockSupabaseCrawlerNotFound(); const request = await 
authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(404); }); @@ -673,7 +672,7 @@ describe('crawler-manager-worker', () => { method: 'PUT', body: 'not json', }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); const body = await response.json(); @@ -686,7 +685,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ name: 'test' }), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(400); }); @@ -707,7 +706,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(updatedData), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -730,7 +729,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(validCrawlerBody()), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(404); }); @@ -746,7 +745,7 @@ describe('crawler-manager-worker', () => { const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`, { method: 'DELETE', }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(200); const body = await response.json(); @@ -762,7 +761,7 @@ describe('crawler-manager-worker', () => { const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`, { method: 'DELETE', }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); 
expect(response.status).toBe(404); const body = await response.json(); @@ -773,7 +772,7 @@ describe('crawler-manager-worker', () => { describe('invalid UUID format', () => { it('returns 404 for invalid crawler ID in GET', async () => { const request = await authenticatedRequest('/crawlers/abc'); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(404); }); @@ -784,7 +783,7 @@ describe('crawler-manager-worker', () => { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(validCrawlerBody()), }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(404); }); @@ -793,7 +792,7 @@ describe('crawler-manager-worker', () => { const request = await authenticatedRequest('/crawlers/abc', { method: 'DELETE', }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(404); }); @@ -804,7 +803,7 @@ describe('crawler-manager-worker', () => { const request = new Request(`${WORKER_URL}/unknown`, { headers: { Origin: 'https://example.com' }, }); - const response = await worker.fetch(request, env); + const response = await SELF.fetch(request); expect(response.status).toBe(404); const body = await response.json(); diff --git a/workers/crawler-manager-worker/vitest.config.ts b/workers/crawler-manager-worker/vitest.config.ts index 08b2372..5e4a918 100644 --- a/workers/crawler-manager-worker/vitest.config.ts +++ b/workers/crawler-manager-worker/vitest.config.ts @@ -12,6 +12,7 @@ export default defineWorkersConfig({ SUPABASE_URL: 'https://supabase.example.com', SUPABASE_SECRET_KEY: 'test-secret-key', JWT_SECRET: 'test-jwt-secret-key-for-testing-only', + CODE_RUNNER_FUNCTION_URL: 'https://code-runner.example.com', }, }, }, diff --git a/workers/crawler-manager-worker/wrangler.toml b/workers/crawler-manager-worker/wrangler.toml index f0e0d52..662bad9 
100644 --- a/workers/crawler-manager-worker/wrangler.toml +++ b/workers/crawler-manager-worker/wrangler.toml @@ -8,6 +8,8 @@ ALLOWED_ORIGINS = "http://localhost:5173,https://audio-underview.pages.dev" # Secrets (set via wrangler secret put): # SUPABASE_URL # SUPABASE_SECRET_KEY +# JWT_SECRET +# CODE_RUNNER_FUNCTION_URL [observability.logs] enabled = true diff --git a/workers/scheduler-manager-worker/sources/crawler-execution-client.ts b/workers/scheduler-manager-worker/sources/crawler-execution-client.ts new file mode 100644 index 0000000..e8c402a --- /dev/null +++ b/workers/scheduler-manager-worker/sources/crawler-execution-client.ts @@ -0,0 +1,27 @@ +import { + type CrawlerExecuteResult, + validateCrawlerExecuteResult, +} from '@audio-underview/worker-tools'; + +export type { CrawlerExecuteResult }; + +interface CrawlerManagerRPC { + executeCrawler(crawlerID: string, input: unknown): Promise<unknown>; +} + +export interface CrawlerExecutionClient { + execute(crawlerID: string, input: unknown): Promise<CrawlerExecuteResult>; +} + +export class ServiceBindingCrawlerExecutionClient implements CrawlerExecutionClient { + private readonly binding: Service; + + constructor(binding: Service) { + this.binding = binding; + } + + async execute(crawlerID: string, input: unknown): Promise<CrawlerExecuteResult> { + const raw = await (this.binding as unknown as CrawlerManagerRPC).executeCrawler(crawlerID, input); + return validateCrawlerExecuteResult(raw); + } +} diff --git a/workers/scheduler-manager-worker/sources/handlers/scheduler-execution.ts b/workers/scheduler-manager-worker/sources/handlers/scheduler-execution.ts new file mode 100644 index 0000000..432bed3 --- /dev/null +++ b/workers/scheduler-manager-worker/sources/handlers/scheduler-execution.ts @@ -0,0 +1,122 @@ +import { + type ResponseContext, + jsonResponse, +} from '@audio-underview/worker-tools'; +import { + type SchedulerRunStatus, + createSupabaseClient, + createSchedulerRun, + getSchedulerRun, + 
updateSchedulerRun, +} from '@audio-underview/supabase-connector'; +import { createWorkerLogger } from '@audio-underview/logger'; +import type { Environment } from '../index.ts'; +import { ServiceBindingCrawlerExecutionClient } from '../crawler-execution-client.ts'; +import { executeScheduler } from '../scheduler-executor.ts'; +import { verifySchedulerOwnership } from './tools.ts'; + +export function resolveHTTPStatus(status: string, error: string | null | undefined): number { + if (status === 'completed' || status === 'partially_failed') return 200; + if (status !== 'failed' || error === null || error === undefined) return 200; + + if (error.includes('timed out')) return 408; + if (error.includes('Invalid input_schema') || error.includes('fan_out_field')) return 422; + if (error.includes('CodeRunner error') || error.includes('Invalid CrawlerExecuteResult')) return 502; + if (error.includes('Supabase') || error.includes('database')) return 503; + + return 500; +} + +const logger = createWorkerLogger({ + defaultContext: { + module: 'scheduler-execution-handler', + }, +}); + +export async function handleExecuteScheduler( + environment: Environment, + context: ResponseContext, + schedulerID: string, + userUUID: string, +): Promise<Response> { + const supabaseClient = createSupabaseClient({ + supabaseURL: environment.SUPABASE_URL, + supabaseSecretKey: environment.SUPABASE_SECRET_KEY, + }); + + const ownershipError = await verifySchedulerOwnership(supabaseClient, schedulerID, userUUID, context); + if (ownershipError) return ownershipError; + + // Atomic concurrent run guard via DB unique partial index + // (scheduler_runs_one_active_per_scheduler: only one pending/running run per scheduler) + let run; + try { + run = await createSchedulerRun(supabaseClient, { + scheduler_id: schedulerID, + status: 'pending', + }); + } catch (error: unknown) { + const message = error instanceof Error ? 
error.message : String(error); + if (message.includes('scheduler_runs_one_active_per_scheduler')) { + return jsonResponse({ + error: 'conflict', + error_description: 'A run is already in progress', + }, 409, context); + } + throw error; + } + + const crawlerExecutionClient = new ServiceBindingCrawlerExecutionClient(environment.CRAWLER_MANAGER); + + // Pipeline timeout: 5 minutes. Prevents run stuck in 'running' on client disconnect or hang. + // AbortController signals executeScheduler to stop updating run status after timeout. + const PIPELINE_TIMEOUT_MILLISECONDS = 300_000; + const abortController = new AbortController(); + + try { + await Promise.race([ + executeScheduler( + { supabaseClient, crawlerExecutionClient, logger }, + schedulerID, + userUUID, + run.id, + abortController.signal, + ), + new Promise<never>((_, reject) => + setTimeout(() => reject(new Error('Pipeline execution timed out after 5 minutes')), PIPELINE_TIMEOUT_MILLISECONDS), + ), + ]); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + + if (message.includes('timed out')) { + abortController.abort(); + await updateSchedulerRun(supabaseClient, run.id, schedulerID, { + status: 'failed', + completed_at: new Date().toISOString(), + error: message, + }, { onlyIfStatus: ['pending', 'running'] satisfies SchedulerRunStatus[] }).catch((updateError: unknown) => { + logger.error('Failed to update run status after timeout', updateError, { + function: 'handleExecuteScheduler', + metadata: { schedulerID, runID: run.id }, + }); + }); + } + } + + // Fetch final run state + const completedRun = await getSchedulerRun(supabaseClient, run.id, schedulerID); + const finalRun = completedRun ?? 
run; + + const responseBody = { + run_id: finalRun.id, + status: finalRun.status, + result: finalRun.result, + error: finalRun.error, + started_at: finalRun.started_at, + completed_at: finalRun.completed_at, + }; + + const httpStatus = resolveHTTPStatus(finalRun.status, finalRun.error); + return jsonResponse(responseBody, httpStatus, context); +} diff --git a/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts b/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts index a5a8629..cefb133 100644 --- a/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts +++ b/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts @@ -5,6 +5,7 @@ import { } from '@audio-underview/worker-tools'; import { createSupabaseClient, + getCrawlerPermission, createSchedulerStage, listSchedulerStages, getSchedulerStage, @@ -15,12 +16,15 @@ import { import type { Environment } from '../index.ts'; import { verifySchedulerOwnership, UUID_PATTERN } from './tools.ts'; +type FanOutStrategy = 'compact' | 'preserve'; + interface CreateStageRequestBody { crawler_id: string; stage_order: number; input_schema: Record<string, unknown>; output_schema?: Record<string, unknown>; fan_out_field?: string; + fan_out_strategy?: FanOutStrategy; } interface UpdateStageRequestBody { @@ -28,6 +32,7 @@ interface UpdateStageRequestBody { input_schema?: Record<string, unknown>; output_schema?: Record<string, unknown>; fan_out_field?: string | null; + fan_out_strategy?: FanOutStrategy; } function isPlainObject(value: unknown): value is Record<string, unknown> { @@ -64,6 +69,11 @@ export async function handleCreateStage( return errorResponse('invalid_request', "Field 'crawler_id' is required and must be a valid UUID", 400, context); } + const crawlerPermission = await getCrawlerPermission(supabaseClient, body.crawler_id, userUUID); + if (crawlerPermission === undefined) { + return errorResponse('forbidden', 'You do not have permission to use this crawler', 
403, context); + } + if (typeof body.stage_order !== 'number' || !Number.isInteger(body.stage_order) || body.stage_order < 0) { return errorResponse('invalid_request', "Field 'stage_order' is required and must be a non-negative integer", 400, context); } @@ -82,6 +92,12 @@ export async function handleCreateStage( } } + if (body.fan_out_strategy !== undefined) { + if (body.fan_out_strategy !== 'compact' && body.fan_out_strategy !== 'preserve') { + return errorResponse('invalid_request', "Field 'fan_out_strategy' must be 'compact' or 'preserve'", 400, context); + } + } + try { const stage = await createSchedulerStage(supabaseClient, { scheduler_id: schedulerID, @@ -90,6 +106,7 @@ export async function handleCreateStage( input_schema: body.input_schema, output_schema: body.output_schema, fan_out_field: body.fan_out_field, + fan_out_strategy: body.fan_out_strategy, }); return jsonResponse(stage, 201, context); @@ -177,6 +194,11 @@ export async function handleUpdateStage( if (typeof body.crawler_id !== 'string' || !UUID_PATTERN.test(body.crawler_id)) { return errorResponse('invalid_request', "Field 'crawler_id' must be a valid UUID", 400, context); } + + const crawlerPermission = await getCrawlerPermission(supabaseClient, body.crawler_id, userUUID); + if (crawlerPermission === undefined) { + return errorResponse('forbidden', 'You do not have permission to use this crawler', 403, context); + } } if (body.input_schema !== undefined && !isPlainObject(body.input_schema)) { @@ -193,7 +215,13 @@ export async function handleUpdateStage( } } - if (body.crawler_id === undefined && body.input_schema === undefined && body.output_schema === undefined && body.fan_out_field === undefined) { + if (body.fan_out_strategy !== undefined) { + if (body.fan_out_strategy !== 'compact' && body.fan_out_strategy !== 'preserve') { + return errorResponse('invalid_request', "Field 'fan_out_strategy' must be 'compact' or 'preserve'", 400, context); + } + } + + if (body.crawler_id === undefined && 
body.input_schema === undefined && body.output_schema === undefined && body.fan_out_field === undefined && body.fan_out_strategy === undefined) { return errorResponse('invalid_request', 'At least one field must be provided for update', 400, context); } @@ -202,6 +230,7 @@ export async function handleUpdateStage( if (body.input_schema !== undefined) updatePayload.input_schema = body.input_schema; if (body.output_schema !== undefined) updatePayload.output_schema = body.output_schema; if (body.fan_out_field !== undefined) updatePayload.fan_out_field = body.fan_out_field; + if (body.fan_out_strategy !== undefined) updatePayload.fan_out_strategy = body.fan_out_strategy; try { const stage = await updateSchedulerStage(supabaseClient, stageID, schedulerID, updatePayload); diff --git a/workers/scheduler-manager-worker/sources/index.ts b/workers/scheduler-manager-worker/sources/index.ts index bff7015..095eff3 100644 --- a/workers/scheduler-manager-worker/sources/index.ts +++ b/workers/scheduler-manager-worker/sources/index.ts @@ -27,6 +27,7 @@ import { handleListRuns, handleGetRun, } from './handlers/scheduler-runs.ts'; +import { handleExecuteScheduler } from './handlers/scheduler-execution.ts'; import { UUID_PATTERN } from './handlers/tools.ts'; export interface Environment { @@ -34,6 +35,7 @@ export interface Environment { SUPABASE_URL: string; SUPABASE_SECRET_KEY: string; JWT_SECRET: string; + CRAWLER_MANAGER: Service; } const logger = createWorkerLogger({ @@ -60,12 +62,14 @@ const HELP = { { method: 'PUT', path: '/schedulers/:id/stages/reorder', description: 'Reorder stages' }, { method: 'GET', path: '/schedulers/:id/runs', description: 'List runs for a scheduler' }, { method: 'GET', path: '/schedulers/:id/runs/:runID', description: 'Get a run by ID' }, + { method: 'POST', path: '/schedulers/:id/execute', description: 'Execute a scheduler pipeline' }, ], }; interface ParsedRoute { type: 'schedulers_collection' | 'scheduler_single' + | 'scheduler_execute' | 
'stages_collection' | 'stage_single' | 'stages_reorder' @@ -91,6 +95,14 @@ function parseRoute(pathname: string): ParsedRoute { return { type: 'scheduler_single', schedulerID: id }; } + // /schedulers/:id/execute + const executeMatch = pathname.match(/^\/schedulers\/([0-9a-f-]+)\/execute$/i); + if (executeMatch) { + const id = executeMatch[1]; + if (!UUID_PATTERN.test(id)) return { type: null }; + return { type: 'scheduler_execute', schedulerID: id }; + } + // /schedulers/:id/stages/reorder const reorderMatch = pathname.match(/^\/schedulers\/([0-9a-f-]+)\/stages\/reorder$/i); if (reorderMatch) { @@ -237,6 +249,15 @@ export default { return response; } + case 'scheduler_execute': { + if (request.method === 'POST') { + return await handleExecuteScheduler(environment, context, route.schedulerID!, userUUID); + } + const response = errorResponse('method_not_allowed', 'Method not allowed', 405, context); + response.headers.set('Allow', 'POST'); + return response; + } + case 'stages_reorder': { if (request.method === 'PUT') { return await handleReorderStages(request, environment, context, route.schedulerID!, userUUID); diff --git a/workers/scheduler-manager-worker/sources/scheduler-executor.ts b/workers/scheduler-manager-worker/sources/scheduler-executor.ts new file mode 100644 index 0000000..a6ed4cd --- /dev/null +++ b/workers/scheduler-manager-worker/sources/scheduler-executor.ts @@ -0,0 +1,193 @@ +import type { SupabaseClient } from '@audio-underview/supabase-connector'; +import type { Logger } from '@audio-underview/logger'; +import { + listSchedulerStages, + updateSchedulerRun, + updateScheduler, + createSchedulerStageRun, + updateSchedulerStageRun, +} from '@audio-underview/supabase-connector'; +import type { CrawlerExecutionClient } from './crawler-execution-client.ts'; +import { + executeStage, + executeFanOut, + resolveDefaultInput, +} from './stage-runner.ts'; + +export interface ExecutorDependencies { + supabaseClient: SupabaseClient; + crawlerExecutionClient: 
CrawlerExecutionClient; + logger: Logger; +} + +export async function executeScheduler( + dependencies: ExecutorDependencies, + schedulerID: string, + userUUID: string, + runID: string, + signal?: AbortSignal, +): Promise<void> { + const { supabaseClient, logger } = dependencies; + + const stageRunnerDependencies = { + supabaseClient: dependencies.supabaseClient, + crawlerExecutionClient: dependencies.crawlerExecutionClient, + logger: dependencies.logger, + }; + + try { + // Mark run as running + await updateSchedulerRun(supabaseClient, runID, schedulerID, { + status: 'running', + started_at: new Date().toISOString(), + }); + + const stages = await listSchedulerStages(supabaseClient, schedulerID); + + if (stages.length === 0) { + await updateSchedulerRun(supabaseClient, runID, schedulerID, { + status: 'completed', + completed_at: new Date().toISOString(), + result: null, + }); + return; + } + + let currentInput: unknown = resolveDefaultInput(stages[0].input_schema); + let lastOutput: unknown = null; + let hasPartialFailure = false; + + for (const stage of stages) { + if (signal?.aborted) break; + + // Fan-out check + if (stage.fan_out_field) { + if (currentInput !== null && currentInput !== undefined && typeof currentInput !== 'object') { + throw new Error( + `Stage ${stage.stage_order}: fan_out_field "${stage.fan_out_field}" requires object input, got ${typeof currentInput}`, + ); + } + const inputObject = currentInput as Record<string, unknown> | null; + const fanOutItems = inputObject?.[stage.fan_out_field]; + + if (fanOutItems === undefined || fanOutItems === null) { + throw new Error( + `Stage ${stage.stage_order}: fan_out_field "${stage.fan_out_field}" not found in input`, + ); + } + + if (!Array.isArray(fanOutItems)) { + throw new Error( + `Stage ${stage.stage_order}: fan_out_field "${stage.fan_out_field}" is not an array`, + ); + } + + // Create stage_run record for the fan-out stage + const stageRun = await createSchedulerStageRun(supabaseClient, { + 
run_id: runID, + stage_id: stage.id, + stage_order: stage.stage_order, + status: 'running', + started_at: new Date().toISOString(), + input: currentInput, + }); + + if (fanOutItems.length === 0) { + await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, { + status: 'completed', + completed_at: new Date().toISOString(), + output: [], + items_total: 0, + items_succeeded: 0, + items_failed: 0, + }); + currentInput = []; + lastOutput = []; + continue; + } + + const fanOutResult = await executeFanOut( + stageRunnerDependencies, + stage, + fanOutItems, + 1, + signal, + ); + + await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, { + status: fanOutResult.status, + completed_at: new Date().toISOString(), + output: fanOutResult.results, + items_total: fanOutResult.itemsTotal, + items_succeeded: fanOutResult.itemsSucceeded, + items_failed: fanOutResult.itemsFailed, + }); + + if (fanOutResult.status === 'failed') { + throw new Error( + `Stage ${stage.stage_order}: all fan-out items failed`, + ); + } + + if (fanOutResult.status === 'partially_failed') { + hasPartialFailure = true; + } + + currentInput = fanOutResult.results; + lastOutput = fanOutResult.results; + } else { + // Normal stage execution + const stageResult = await executeStage( + stageRunnerDependencies, + runID, + stage, + currentInput, + signal, + ); + + currentInput = stageResult.output; + lastOutput = stageResult.output; + } + } + + // Pipeline completed successfully — skip if handler already timed out + if (signal?.aborted) return; + + await updateSchedulerRun(supabaseClient, runID, schedulerID, { + status: hasPartialFailure ? 'partially_failed' : 'completed', + completed_at: new Date().toISOString(), + result: lastOutput, + }); + } catch (error: unknown) { + if (signal?.aborted) return; + + const errorMessage = error instanceof Error ? 
error.message : String(error); + + logger.error('Scheduler execution failed', error, { + function: 'executeScheduler', + metadata: { schedulerID, runID }, + }); + + await updateSchedulerRun(supabaseClient, runID, schedulerID, { + status: 'failed', + completed_at: new Date().toISOString(), + error: errorMessage, + }).catch((updateError: unknown) => { + logger.error('Failed to update run status after error', updateError, { + function: 'executeScheduler', + metadata: { schedulerID, runID }, + }); + }); + } finally { + if (signal?.aborted) return; + // Always update last_run_at + await updateScheduler(supabaseClient, schedulerID, userUUID, { + last_run_at: new Date().toISOString(), + }).catch((updateError: unknown) => { + logger.error('Failed to update scheduler last_run_at', updateError, { + function: 'executeScheduler', + metadata: { schedulerID }, + }); + }); + } +} diff --git a/workers/scheduler-manager-worker/sources/stage-runner.ts b/workers/scheduler-manager-worker/sources/stage-runner.ts new file mode 100644 index 0000000..6f7b318 --- /dev/null +++ b/workers/scheduler-manager-worker/sources/stage-runner.ts @@ -0,0 +1,160 @@ +import type { SupabaseClient } from '@audio-underview/supabase-connector'; +import type { Logger } from '@audio-underview/logger'; +import type { + SchedulerStageRow, + SchedulerStageRunRow, +} from '@audio-underview/supabase-connector'; +import { + createSchedulerStageRun, + updateSchedulerStageRun, +} from '@audio-underview/supabase-connector'; +import type { CrawlerExecutionClient } from './crawler-execution-client.ts'; + +export interface StageRunnerDependencies { + supabaseClient: SupabaseClient; + crawlerExecutionClient: CrawlerExecutionClient; + logger: Logger; +} + +export interface StageResult { + output: unknown; + stageRun: SchedulerStageRunRow; +} + +export interface FanOutResult { + results: unknown[]; + itemsTotal: number; + itemsSucceeded: number; + itemsFailed: number; + status: 'completed' | 'partially_failed' | 'failed'; 
+} + +export function resolveDefaultInput(inputSchema: unknown): Record<string, unknown> { + if (inputSchema === null || inputSchema === undefined || typeof inputSchema !== 'object' || Array.isArray(inputSchema)) { + throw new Error(`Invalid input_schema: expected object, got ${typeof inputSchema}`); + } + const defaults: Record<string, unknown> = {}; + for (const [key, value] of Object.entries(inputSchema)) { + if (value !== null && value !== undefined && typeof value === 'object' && 'default' in value) { + defaults[key] = (value as Record<string, unknown>).default; + } + } + return defaults; +} + +export async function executeStage( + dependencies: StageRunnerDependencies, + runID: string, + stage: SchedulerStageRow, + input: unknown, + signal?: AbortSignal, +): Promise<StageResult> { + const { supabaseClient, crawlerExecutionClient, logger } = dependencies; + + if (signal?.aborted) { + throw new Error('Stage execution aborted: pipeline timed out'); + } + + const stageRun = await createSchedulerStageRun(supabaseClient, { + run_id: runID, + stage_id: stage.id, + stage_order: stage.stage_order, + status: 'running', + started_at: new Date().toISOString(), + input, + }); + + try { + const response = await crawlerExecutionClient.execute(stage.crawler_id, input); + + const updatedStageRun = await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, { + status: 'completed', + completed_at: new Date().toISOString(), + output: response.result, + }); + + return { output: response.result, stageRun: updatedStageRun ?? stageRun }; + } catch (error: unknown) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + + logger.error('Stage execution failed', error, { + function: 'executeStage', + metadata: { stageID: stage.id, stageOrder: stage.stage_order }, + }); + + await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, { + status: 'failed', + completed_at: new Date().toISOString(), + error: errorMessage, + }).catch((updateError: unknown) => { + logger.error('Failed to update stage run status after error', updateError, { + function: 'executeStage', + metadata: { stageRunID: stageRun.id, runID }, + }); + }); + + throw error; + } +} + +const FAN_OUT_FAILED = Symbol('fan-out-failed'); + +export async function executeFanOut( + dependencies: StageRunnerDependencies, + stage: SchedulerStageRow, + items: unknown[], + concurrency: number = 1, + signal?: AbortSignal, +): Promise<FanOutResult> { + const { crawlerExecutionClient, logger } = dependencies; + + const results: (unknown | typeof FAN_OUT_FAILED)[] = new Array(items.length).fill(FAN_OUT_FAILED); + let itemsSucceeded = 0; + let itemsFailed = 0; + + let nextIndex = 0; + + async function worker(): Promise<void> { + while (nextIndex < items.length) { + if (signal?.aborted) break; + const index = nextIndex++; + const item = items[index]; + try { + const response = await crawlerExecutionClient.execute(stage.crawler_id, item); + results[index] = response.result; + itemsSucceeded++; + } catch (error: unknown) { + logger.warn('Fan-out item failed', error, { + function: 'executeFanOut', + metadata: { stageID: stage.id, itemIndex: index }, + }); + itemsFailed++; + } + } + } + + const workers = Array.from({ length: Math.min(concurrency, items.length) }, () => worker()); + await Promise.all(workers); + + let status: 'completed' | 'partially_failed' | 'failed'; + if (itemsFailed === 0) { + status = 'completed'; + } else if (itemsSucceeded > 0) { + status = 'partially_failed'; + } else { + status = 'failed'; + } + + const strategy = stage.fan_out_strategy ?? 
'compact'; + const finalResults = strategy === 'preserve' + ? results.map((result) => result === FAN_OUT_FAILED ? null : result) + : results.filter((result) => result !== FAN_OUT_FAILED); + + return { + results: finalResults, + itemsTotal: items.length, + itemsSucceeded, + itemsFailed, + status, + }; +} diff --git a/workers/scheduler-manager-worker/tests/index.test.ts b/workers/scheduler-manager-worker/tests/index.test.ts index 580570b..6d60f5a 100644 --- a/workers/scheduler-manager-worker/tests/index.test.ts +++ b/workers/scheduler-manager-worker/tests/index.test.ts @@ -47,6 +47,7 @@ function mockStageResponse(overrides: Record<string, unknown> = {}) { input_schema: { url: { type: 'string', default: 'https://example.com' } }, output_schema: {}, fan_out_field: null, + fan_out_strategy: 'compact', created_at: '2026-01-01T00:00:00Z', ...overrides, }; @@ -89,6 +90,19 @@ function mockSupabaseSchedulerGet(data: unknown = mockSchedulerResponse()) { .reply(200, JSON.stringify(data)); } +function mockCrawlerPermission() { + fetchMock + .get('https://supabase.example.com') + .intercept({ path: /^\/rest\/v1\/crawler_permissions/, method: 'GET' }) + .reply(200, JSON.stringify({ + id: '00000000-0000-0000-0000-000000000099', + crawler_id: MOCK_CRAWLER_ID, + user_uuid: MOCK_USER_UUID, + level: 'owner', + created_at: '2026-01-01T00:00:00Z', + })); +} + function mockSupabaseSchedulerNotFound() { fetchMock .get('https://supabase.example.com') @@ -402,6 +416,8 @@ describe('scheduler-manager-worker', () => { it('creates a stage and returns 201', async () => { // Mock scheduler ownership check mockSupabaseSchedulerGet(); + // Mock crawler permission check + mockCrawlerPermission(); // Mock stage creation fetchMock .get('https://supabase.example.com') diff --git a/workers/scheduler-manager-worker/tests/scheduler-execution.test.ts b/workers/scheduler-manager-worker/tests/scheduler-execution.test.ts new file mode 100644 index 0000000..b44baa9 --- /dev/null +++ 
// Argument types are derived from the function under test so the table stays type-checked.
type StatusArgument = Parameters<typeof resolveHTTPStatus>[0];
type ErrorArgument = Parameters<typeof resolveHTTPStatus>[1];

interface StatusCase {
  title: string;
  status: StatusArgument;
  error: ErrorArgument;
  expected: number;
}

// One row per scenario: run status + stored error message → HTTP status code.
const STATUS_CASES: StatusCase[] = [
  { title: 'returns 200 for completed', status: 'completed', error: null, expected: 200 },
  { title: 'returns 200 for partially_failed', status: 'partially_failed', error: null, expected: 200 },
  { title: 'returns 200 for failed with no error message', status: 'failed', error: null, expected: 200 },
  { title: 'returns 500 for failed with unknown error', status: 'failed', error: 'Something went wrong', expected: 500 },
  {
    title: 'returns 408 for pipeline timeout',
    status: 'failed',
    error: 'Pipeline execution timed out after 5 minutes',
    expected: 408,
  },
  {
    title: 'returns 422 for invalid input_schema',
    status: 'failed',
    error: 'Invalid input_schema: expected object, got string',
    expected: 422,
  },
  {
    title: 'returns 422 for fan_out_field error',
    status: 'failed',
    error: 'Stage 1: fan_out_field "items" not found in input',
    expected: 422,
  },
  {
    title: 'returns 502 for code-runner error',
    status: 'failed',
    error: 'CodeRunner error 500: [server_error] Server returned 500',
    expected: 502,
  },
  {
    title: 'returns 502 for invalid RPC response',
    status: 'failed',
    error: 'Invalid CrawlerExecuteResult: expected object',
    expected: 502,
  },
  { title: 'returns 503 for database error', status: 'failed', error: 'Supabase request failed', expected: 503 },
  {
    title: 'returns 503 for database connection error',
    status: 'failed',
    error: 'database connection refused',
    expected: 503,
  },
  { title: 'returns 200 for pending status', status: 'pending', error: null, expected: 200 },
  { title: 'returns 200 for running status', status: 'running', error: null, expected: 200 },
];

describe('resolveHTTPStatus', () => {
  for (const { title, status, error, expected } of STATUS_CASES) {
    it(title, () => {
      expect(resolveHTTPStatus(status, error)).toBe(expected);
    });
  }
});

// Payloads that must be rejected, paired with the message fragment the error has to contain.
const REJECTED_PAYLOADS: { title: string; payload: unknown; message: string }[] = [
  { title: 'throws on null input', payload: null, message: 'expected object' },
  { title: 'throws on non-object input', payload: 'string', message: 'expected object' },
  { title: 'throws on invalid type', payload: { type: 'unknown', result: {} }, message: "expected type 'web' or 'data'" },
  { title: 'throws on missing result field', payload: { type: 'web' }, message: 'missing result field' },
];

describe('validateCrawlerExecuteResult', () => {
  it('accepts valid web result', () => {
    const validated = validateCrawlerExecuteResult({ type: 'web', result: { data: 'hello' } });
    expect(validated.type).toBe('web');
    expect(validated.result).toEqual({ data: 'hello' });
  });

  it('accepts valid data result with null', () => {
    const validated = validateCrawlerExecuteResult({ type: 'data', result: null });
    expect(validated.type).toBe('data');
    expect(validated.result).toBeNull();
  });

  for (const { title, payload, message } of REJECTED_PAYLOADS) {
    it(title, () => {
      expect(() => validateCrawlerExecuteResult(payload)).toThrow(message);
    });
  }
});
// Fixed identifiers shared by every fixture and assertion in this suite.
const SCHEDULER_ID = '00000000-0000-0000-0000-000000000010';
const USER_UUID = '00000000-0000-0000-0000-000000000001';
const RUN_ID = '00000000-0000-0000-0000-000000000040';
const STAGE_ID = '00000000-0000-0000-0000-000000000020';
const STAGE_ID_2 = '00000000-0000-0000-0000-000000000021';
const CRAWLER_ID = '00000000-0000-0000-0000-000000000030';
const CRAWLER_ID_2 = '00000000-0000-0000-0000-000000000031';
const STAGE_RUN_ID = '00000000-0000-0000-0000-000000000050';
const STAGE_RUN_ID_2 = '00000000-0000-0000-0000-000000000051';

/** Builds a stage row; `overrides` win over the defaults. */
function mockStage(overrides: Partial<SchedulerStageRow> = {}): SchedulerStageRow {
  const defaults = {
    id: STAGE_ID,
    scheduler_id: SCHEDULER_ID,
    crawler_id: CRAWLER_ID,
    stage_order: 0,
    input_schema: { url: { type: 'string', default: 'https://example.com' } },
    output_schema: {},
    fan_out_field: null,
    fan_out_strategy: 'compact',
    created_at: '2026-01-01T00:00:00Z',
  };
  return { ...defaults, ...overrides } as SchedulerStageRow;
}

/** Builds a scheduler_runs row in the `running` state unless overridden. */
function mockRunRow(overrides: Record<string, unknown> = {}) {
  const defaults = {
    id: RUN_ID,
    scheduler_id: SCHEDULER_ID,
    status: 'running',
    started_at: '2026-01-01T00:00:00Z',
    completed_at: null,
    result: null,
    error: null,
    created_at: '2026-01-01T00:00:00Z',
  };
  return { ...defaults, ...overrides };
}

/** Builds a schedulers row for the test user unless overridden. */
function mockSchedulerRow(overrides: Record<string, unknown> = {}) {
  const defaults = {
    id: SCHEDULER_ID,
    user_uuid: USER_UUID,
    name: 'Test Scheduler',
    cron_expression: null,
    is_enabled: true,
    last_run_at: null,
    created_at: '2026-01-01T00:00:00Z',
    updated_at: '2026-01-01T00:00:00Z',
  };
  return { ...defaults, ...overrides };
}

/** Builds a scheduler_stage_runs row in the `running` state unless overridden. */
function mockStageRunRow(overrides: Record<string, unknown> = {}) {
  const defaults = {
    id: STAGE_RUN_ID,
    run_id: RUN_ID,
    stage_id: STAGE_ID,
    stage_order: 0,
    status: 'running',
    started_at: '2026-01-01T00:00:00Z',
    completed_at: null,
    input: null,
    output: null,
    error: null,
    items_total: null,
    items_succeeded: null,
    items_failed: null,
    created_at: '2026-01-01T00:00:00Z',
  };
  return { ...defaults, ...overrides };
}

// --- Supabase mock helpers ---

/** Registers one interceptor on the mocked Supabase origin that replies with `payload` as JSON. */
function replyFromSupabase(path: RegExp, method: string, statusCode: number, payload: unknown): void {
  fetchMock
    .get('https://supabase.example.com')
    .intercept({ path, method })
    .reply(statusCode, JSON.stringify(payload));
}

function mockListSchedulerStages(stages: SchedulerStageRow[]) {
  replyFromSupabase(/^\/rest\/v1\/scheduler_stages/, 'GET', 200, stages);
}

function mockUpdateSchedulerRun(overrides: Record<string, unknown> = {}) {
  replyFromSupabase(/^\/rest\/v1\/scheduler_runs/, 'PATCH', 200, mockRunRow(overrides));
}

function mockUpdateScheduler(overrides: Record<string, unknown> = {}) {
  replyFromSupabase(/^\/rest\/v1\/schedulers/, 'PATCH', 200, mockSchedulerRow(overrides));
}

function mockCreateStageRun(overrides: Record<string, unknown> = {}) {
  replyFromSupabase(/^\/rest\/v1\/scheduler_stage_runs/, 'POST', 201, mockStageRunRow(overrides));
}

function mockUpdateStageRun(overrides: Record<string, unknown> = {}) {
  replyFromSupabase(/^\/rest\/v1\/scheduler_stage_runs/, 'PATCH', 200, mockStageRunRow(overrides));
}

function mockUpdateSchedulerRunError() {
  replyFromSupabase(/^\/rest\/v1\/scheduler_runs/, 'PATCH', 500, { message: 'Internal Server Error' });
}

// --- Crawler execution client mock ---

/**
 * Client whose `execute` resolves with the queued results in call order and
 * falls back to `{ extracted: 'data' }` once the queue is exhausted (or the
 * queued entry is null/undefined).
 */
function createMockCrawlerExecutionClient(
  results: unknown[] = [],
): CrawlerExecutionClient & { execute: ReturnType<typeof vi.fn> } {
  let served = 0;
  const execute = vi.fn().mockImplementation(async () => {
    const queued = results[served];
    served += 1;
    return { type: 'data' as const, result: queued ?? { extracted: 'data' } };
  });
  return { execute };
}

/** Logger whose methods are spies; `createChild` hands back the same mock. */
function createMockLogger(): Logger {
  const spies = {
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
    debug: vi.fn(),
    createChild: vi.fn().mockReturnThis(),
  };
  return spies as unknown as Logger;
}

/** Wires a Supabase client built from the test env with the mocked crawler client and logger. */
function createDependencies(crawlerResults: unknown[] = []): {
  dependencies: ExecutorDependencies;
  crawlerExecutionClient: ReturnType<typeof createMockCrawlerExecutionClient>;
  logger: ReturnType<typeof createMockLogger>;
} {
  const crawlerExecutionClient = createMockCrawlerExecutionClient(crawlerResults);
  const logger = createMockLogger();
  const supabaseClient = createSupabaseClient({
    supabaseURL: env.SUPABASE_URL,
    supabaseSecretKey: env.SUPABASE_SECRET_KEY,
  });
  const dependencies = { supabaseClient, crawlerExecutionClient, logger };
  return { dependencies, crawlerExecutionClient, logger };
}

// Every test talks to the mocked network only.
beforeEach(() => {
  fetchMock.activate();
  fetchMock.disableNetConnect();
});

afterEach(() => {
  fetchMock.deactivate();
});
updateSchedulerRun → status: 'running' + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages → one stage + mockListSchedulerStages([mockStage()]); + // 3. executeStage internals: + // a. createSchedulerStageRun + mockCreateStageRun(); + // b. crawlerExecutionClient.execute → via mock + // c. updateSchedulerStageRun + mockUpdateStageRun({ status: 'completed', output: { title: 'Test Page' } }); + // 4. updateSchedulerRun → status: 'completed' + mockUpdateSchedulerRun({ status: 'completed', result: { title: 'Test Page' } }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(1); + expect(crawlerExecutionClient.execute).toHaveBeenCalledWith(CRAWLER_ID, { + url: 'https://example.com', + }); + }); + + it('chains multi-stage output: stage N output becomes stage N+1 input', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies([ + { urls: ['https://a.com', 'https://b.com'] }, + { results: [1, 2] }, + ]); + + const stage1 = mockStage({ + id: STAGE_ID, + crawler_id: CRAWLER_ID, + stage_order: 0, + }); + const stage2 = mockStage({ + id: STAGE_ID_2, + crawler_id: CRAWLER_ID_2, + stage_order: 1, + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1: createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. stage2: createStageRun + updateStageRun + mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 }); + mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'completed' }); + // 5. 
updateSchedulerRun → completed + mockUpdateSchedulerRun({ status: 'completed' }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(2); + // Stage 1 gets default input from input_schema + expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(1, CRAWLER_ID, { + url: 'https://example.com', + }); + // Stage 2 gets output of stage 1 as input + expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(2, CRAWLER_ID_2, { + urls: ['https://a.com', 'https://b.com'], + }); + }); + + it('handles fan-out stage: validates fan_out_field exists and is array', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies([ + { links: ['https://a.com', 'https://b.com'] }, + ]); + + const stage1 = mockStage({ stage_order: 0 }); + const stage2 = mockStage({ + id: STAGE_ID_2, + crawler_id: CRAWLER_ID_2, + stage_order: 1, + fan_out_field: 'links', + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1 (normal): createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. stage2 (fan-out): createStageRun for the fan-out stage + mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 }); + // 5. executeFanOut calls crawlerExecutionClient.execute for each item + // (2 items from links array → 2 crawler calls, producing results for items 2 and 3) + // 6. updateStageRun for the fan-out stage + mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'completed' }); + // 7. 
updateSchedulerRun → completed + mockUpdateSchedulerRun({ status: 'completed' }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + // Stage 1 + 2 fan-out items = 3 total crawler calls + expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(3); + expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(2, CRAWLER_ID_2, 'https://a.com'); + expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(3, CRAWLER_ID_2, 'https://b.com'); + }); + + it('completes fan-out with empty array: output is []', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies([ + { links: [] }, + ]); + + const stage1 = mockStage({ stage_order: 0 }); + const stage2 = mockStage({ + id: STAGE_ID_2, + stage_order: 1, + fan_out_field: 'links', + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1 (normal): createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. stage2 (fan-out): createStageRun + mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 }); + // 5. empty array → updateStageRun with output: [] + mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'completed', output: [] }); + // 6. 
updateSchedulerRun → completed (lastOutput = []) + mockUpdateSchedulerRun({ status: 'completed', result: [] }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + // Only stage 1 executes via crawler; fan-out stage has empty array + expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(1); + }); + + it('fails run when all fan-out items fail', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies(); + + crawlerExecutionClient.execute + .mockReset() + .mockImplementationOnce(async () => ({ type: 'data' as const, result: { data: 'stage1' } })) + .mockImplementationOnce(async () => { throw new Error('fan-out item 1 failed'); }) + .mockImplementationOnce(async () => { throw new Error('fan-out item 2 failed'); }); + + const stage1 = mockStage({ stage_order: 0 }); + const stage2 = mockStage({ + id: STAGE_ID_2, + stage_order: 1, + fan_out_field: 'items', + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1: createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. stage2 fan-out: createStageRun + mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 }); + // 5. executeFanOut → all fail → status: 'failed' + // 6. updateStageRun with status: 'failed' + mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'failed' }); + // 7. 
throw → catch block: updateSchedulerRun → status: 'failed' + mockUpdateSchedulerRun({ status: 'failed', error: 'Stage 1: all fan-out items failed' }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(logger.error).toHaveBeenCalledWith( + 'Scheduler execution failed', + expect.any(Error), + expect.objectContaining({ function: 'executeScheduler' }), + ); + }); + + it('sets run status to partially_failed when some fan-out items fail', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + + crawlerExecutionClient.execute + .mockReset() + .mockImplementationOnce(async () => ({ type: 'data' as const, result: { items: ['a', 'b'] } })) + .mockImplementationOnce(async () => ({ type: 'data' as const, result: 'ok-a' })) + .mockImplementationOnce(async () => { throw new Error('item b failed'); }); + + const stage1 = mockStage({ stage_order: 0 }); + const stage2 = mockStage({ + id: STAGE_ID_2, + stage_order: 1, + fan_out_field: 'items', + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1: createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. stage2 fan-out: createStageRun + mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 }); + // 5. executeFanOut → partially_failed + // 6. updateStageRun with status: 'partially_failed' + mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'partially_failed' }); + // 7. 
updateSchedulerRun → partially_failed (hasPartialFailure = true) + mockUpdateSchedulerRun({ status: 'partially_failed' }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(3); + }); + + it('marks run as failed when a stage throws an error', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies(); + + crawlerExecutionClient.execute + .mockReset() + .mockRejectedValueOnce(new Error('Crawler connection timeout')); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([mockStage()]); + // 3. executeStage: createStageRun + mockCreateStageRun(); + // 4. executeStage: crawler throws → updateStageRun with error + mockUpdateStageRun({ status: 'failed', error: 'Crawler connection timeout' }); + // 5. catch: updateSchedulerRun → failed + mockUpdateSchedulerRun({ status: 'failed', error: 'Crawler connection timeout' }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(logger.error).toHaveBeenCalledWith( + 'Scheduler execution failed', + expect.any(Error), + expect.objectContaining({ + function: 'executeScheduler', + metadata: { schedulerID: SCHEDULER_ID, runID: RUN_ID }, + }), + ); + }); + + it('always updates scheduler.last_run_at in finally block', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + + crawlerExecutionClient.execute + .mockReset() + .mockRejectedValueOnce(new Error('Some error')); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([mockStage()]); + // 3. executeStage: createStageRun + mockCreateStageRun(); + // 4. 
executeStage: crawler throws → updateStageRun with error + mockUpdateStageRun({ status: 'failed' }); + // 5. catch: updateSchedulerRun → failed + mockUpdateSchedulerRun({ status: 'failed' }); + // finally: updateScheduler → capture request body to verify last_run_at + let capturedBody: Record<string, unknown> | undefined; + fetchMock + .get('https://supabase.example.com') + .intercept({ path: /^\/rest\/v1\/schedulers/, method: 'PATCH' }) + .reply(200, (request: { body: string }) => { + capturedBody = JSON.parse(request.body as string) as Record<string, unknown>; + return { data: JSON.stringify(mockSchedulerRow()) }; + }); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(capturedBody).toBeDefined(); + expect(capturedBody!.last_run_at).toBeDefined(); + expect(typeof capturedBody!.last_run_at).toBe('string'); + }); + + it('logs error but does not throw when run status update fails in catch block', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies(); + + crawlerExecutionClient.execute + .mockReset() + .mockRejectedValueOnce(new Error('Crawler failed')); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([mockStage()]); + // 3. executeStage: createStageRun + mockCreateStageRun(); + // 4. executeStage: crawler throws → updateStageRun with error + mockUpdateStageRun({ status: 'failed' }); + // 5. 
catch: updateSchedulerRun → HTTP 500 error + mockUpdateSchedulerRunError(); + // finally: updateScheduler + mockUpdateScheduler(); + + // Should NOT throw even though the catch-block update failed + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + // Should log both the original error and the update failure + expect(logger.error).toHaveBeenCalledWith( + 'Scheduler execution failed', + expect.any(Error), + expect.objectContaining({ function: 'executeScheduler' }), + ); + expect(logger.error).toHaveBeenCalledWith( + 'Failed to update run status after error', + expect.any(Error), + expect.objectContaining({ function: 'executeScheduler' }), + ); + }); + + it('throws error when fan_out_field references a non-existent field', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies([ + { data: 'no-links-field' }, + ]); + + const stage1 = mockStage({ stage_order: 0 }); + const stage2 = mockStage({ + id: STAGE_ID_2, + stage_order: 1, + fan_out_field: 'links', + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1: createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. fan_out_field 'links' not found in input → throws + // 5. 
catch: updateSchedulerRun → failed + mockUpdateSchedulerRun({ status: 'failed' }); + // finally: updateScheduler + mockUpdateScheduler(); + + await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID); + + expect(logger.error).toHaveBeenCalledWith( + 'Scheduler execution failed', + expect.objectContaining({ + message: expect.stringContaining('fan_out_field "links" not found in input'), + }), + expect.objectContaining({ function: 'executeScheduler' }), + ); + }); + + it('throws error when fan_out_field references a non-array value', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies([ + { links: 'not-an-array' }, + ]); + + const stage1 = mockStage({ stage_order: 0 }); + const stage2 = mockStage({ + id: STAGE_ID_2, + stage_order: 1, + fan_out_field: 'links', + input_schema: {}, + }); + + // 1. updateSchedulerRun → running + mockUpdateSchedulerRun({ status: 'running' }); + // 2. listSchedulerStages + mockListSchedulerStages([stage1, stage2]); + // 3. stage1: createStageRun + updateStageRun + mockCreateStageRun(); + mockUpdateStageRun({ status: 'completed' }); + // 4. fan_out_field 'links' is not array → throws + // 5. 
// Fixed identifiers shared by the stage-runner fixtures.
const MOCK_SCHEDULER_ID = '00000000-0000-0000-0000-000000000010';
const MOCK_STAGE_ID = '00000000-0000-0000-0000-000000000020';
const MOCK_CRAWLER_ID = '00000000-0000-0000-0000-000000000030';
const MOCK_RUN_ID = '00000000-0000-0000-0000-000000000040';
const MOCK_STAGE_RUN_ID = '00000000-0000-0000-0000-000000000050';

/** Builds a stage row; `overrides` win over the defaults. */
function mockStage(overrides: Partial<SchedulerStageRow> = {}): SchedulerStageRow {
  const defaults = {
    id: MOCK_STAGE_ID,
    scheduler_id: MOCK_SCHEDULER_ID,
    crawler_id: MOCK_CRAWLER_ID,
    stage_order: 0,
    input_schema: { url: { type: 'string', default: 'https://example.com' } },
    output_schema: {},
    fan_out_field: null,
    fan_out_strategy: 'compact',
    created_at: '2026-01-01T00:00:00Z',
  };
  return { ...defaults, ...overrides } as SchedulerStageRow;
}

/** Builds a scheduler_stage_runs row in the `running` state unless overridden. */
function mockStageRunRow(overrides: Record<string, unknown> = {}) {
  const defaults = {
    id: MOCK_STAGE_RUN_ID,
    run_id: MOCK_RUN_ID,
    stage_id: MOCK_STAGE_ID,
    stage_order: 0,
    status: 'running',
    started_at: '2026-01-01T00:00:00Z',
    completed_at: null,
    input: null,
    output: null,
    error: null,
    items_total: null,
    items_succeeded: null,
    items_failed: null,
    created_at: '2026-01-01T00:00:00Z',
  };
  return { ...defaults, ...overrides };
}

/** Registers one interceptor on the stage-runs endpoint that replies with a stage-run row. */
function stubStageRunEndpoint(method: string, statusCode: number, overrides: Record<string, unknown>): void {
  fetchMock
    .get('https://supabase.example.com')
    .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method })
    .reply(statusCode, JSON.stringify(mockStageRunRow(overrides)));
}

function mockSupabaseStageRunCreate(overrides: Record<string, unknown> = {}) {
  stubStageRunEndpoint('POST', 201, overrides);
}

function mockSupabaseStageRunUpdate(overrides: Record<string, unknown> = {}) {
  stubStageRunEndpoint('PATCH', 200, overrides);
}

/** Client with a bare spy for `execute`; each test configures its own behavior. */
function createMockCrawlerExecutionClient(): CrawlerExecutionClient & {
  execute: ReturnType<typeof vi.fn>;
} {
  const execute = vi.fn();
  return { execute };
}

/** Logger whose methods are spies; `createChild` hands back the same mock. */
function createMockLogger(): Logger {
  const spies = {
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
    debug: vi.fn(),
    createChild: vi.fn().mockReturnThis(),
  };
  return spies as unknown as Logger;
}

/** Wires a Supabase client built from the test env with the given (or a fresh) mocked crawler client. */
function createDependencies(
  crawlerExecutionClient?: CrawlerExecutionClient,
): { dependencies: StageRunnerDependencies; crawlerExecutionClient: ReturnType<typeof createMockCrawlerExecutionClient>; logger: ReturnType<typeof createMockLogger> } {
  const client = (crawlerExecutionClient as ReturnType<typeof createMockCrawlerExecutionClient>) ?? createMockCrawlerExecutionClient();
  const logger = createMockLogger();
  const supabaseClient = createSupabaseClient({
    supabaseURL: env.SUPABASE_URL,
    supabaseSecretKey: env.SUPABASE_SECRET_KEY,
  });
  const dependencies = { supabaseClient, crawlerExecutionClient: client, logger };
  return { dependencies, crawlerExecutionClient: client, logger };
}

// Every test talks to the mocked network only.
beforeEach(() => {
  fetchMock.activate();
  fetchMock.disableNetConnect();
});

afterEach(() => {
  fetchMock.deactivate();
});
{ + const schema = { + enabled: { type: 'boolean', default: false }, + }; + const result = resolveDefaultInput(schema); + expect(result).toEqual({ enabled: false }); + }); +}); + +describe('executeStage', () => { + it('calls crawlerExecutionClient.execute and returns output with stageRun', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + const input = { url: 'https://example.com' }; + const crawlerResult = { items: [1, 2, 3] }; + + crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: crawlerResult }); + + mockSupabaseStageRunCreate(); + mockSupabaseStageRunUpdate({ status: 'completed', output: crawlerResult }); + + const result = await executeStage(dependencies, MOCK_RUN_ID, stage, input); + + expect(crawlerExecutionClient.execute).toHaveBeenCalledWith(MOCK_CRAWLER_ID, input); + expect(result.output).toEqual(crawlerResult); + expect(result.stageRun.status).toBe('completed'); + }); + + it('creates stage_run with status running then updates to completed', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + const crawlerResult = { data: 'test' }; + + crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: crawlerResult }); + + mockSupabaseStageRunCreate({ status: 'running' }); + mockSupabaseStageRunUpdate({ status: 'completed', output: crawlerResult }); + + const result = await executeStage(dependencies, MOCK_RUN_ID, stage, {}); + + expect(result.stageRun.id).toBe(MOCK_STAGE_RUN_ID); + expect(result.output).toEqual(crawlerResult); + }); + + it('updates stage_run to failed and re-throws on error', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies(); + const stage = mockStage(); + const executionError = new Error('Crawler execution failed'); + + crawlerExecutionClient.execute.mockRejectedValue(executionError); + + mockSupabaseStageRunCreate({ status: 'running' 
}); + mockSupabaseStageRunUpdate({ status: 'failed', error: 'Crawler execution failed' }); + + await expect(executeStage(dependencies, MOCK_RUN_ID, stage, {})).rejects.toThrow( + 'Crawler execution failed', + ); + + expect(logger.error).toHaveBeenCalledWith( + 'Stage execution failed', + executionError, + expect.objectContaining({ + function: 'executeStage', + metadata: { stageID: MOCK_STAGE_ID, stageOrder: 0 }, + }), + ); + }); + + it('passes stage.crawler_id to crawlerExecutionClient.execute', async () => { + const customCrawlerID = '00000000-0000-0000-0000-999999999999'; + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage({ crawler_id: customCrawlerID }); + + crawlerExecutionClient.execute.mockResolvedValue({ type: 'web', result: {} }); + + mockSupabaseStageRunCreate(); + mockSupabaseStageRunUpdate({ status: 'completed' }); + + await executeStage(dependencies, MOCK_RUN_ID, stage, { key: 'value' }); + + expect(crawlerExecutionClient.execute).toHaveBeenCalledWith(customCrawlerID, { key: 'value' }); + }); + + it('falls back to original stageRun if update returns null', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + + crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: 'output' }); + + mockSupabaseStageRunCreate({ status: 'running' }); + + // Simulate PGRST116 (no rows updated) → supabase returns 406 + fetchMock + .get('https://supabase.example.com') + .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method: 'PATCH' }) + .reply(406, JSON.stringify({ + code: 'PGRST116', + details: 'The result contains 0 rows', + hint: null, + message: 'JSON object requested, multiple (or no) rows returned', + })); + + const result = await executeStage(dependencies, MOCK_RUN_ID, stage, {}); + + // When updateSchedulerStageRun returns null, fallback to original stageRun + expect(result.stageRun.id).toBe(MOCK_STAGE_RUN_ID); + 
expect(result.stageRun.status).toBe('running'); + }); +}); + +describe('executeFanOut', () => { + it('executes each item sequentially and returns all results', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + const items = [{ url: 'a' }, { url: 'b' }, { url: 'c' }]; + + crawlerExecutionClient.execute + .mockResolvedValueOnce({ type: 'data', result: 'result-a' }) + .mockResolvedValueOnce({ type: 'data', result: 'result-b' }) + .mockResolvedValueOnce({ type: 'data', result: 'result-c' }); + + const result = await executeFanOut(dependencies, stage, items); + + expect(result.results).toEqual(['result-a', 'result-b', 'result-c']); + expect(result.itemsTotal).toBe(3); + expect(result.itemsSucceeded).toBe(3); + expect(result.itemsFailed).toBe(0); + expect(result.status).toBe('completed'); + }); + + it('returns completed status when all items succeed', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + + crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: 'ok' }); + + const result = await executeFanOut(dependencies, stage, [{ a: 1 }, { a: 2 }]); + + expect(result.status).toBe('completed'); + expect(result.itemsFailed).toBe(0); + expect(result.itemsSucceeded).toBe(2); + }); + + it('returns partially_failed when some items fail', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + + crawlerExecutionClient.execute + .mockResolvedValueOnce({ type: 'data', result: 'ok' }) + .mockRejectedValueOnce(new Error('item failed')) + .mockResolvedValueOnce({ type: 'data', result: 'ok' }); + + const result = await executeFanOut(dependencies, stage, ['a', 'b', 'c']); + + expect(result.status).toBe('partially_failed'); + expect(result.itemsSucceeded).toBe(2); + expect(result.itemsFailed).toBe(1); + expect(result.itemsTotal).toBe(3); + expect(result.results).toEqual(['ok', 
'ok']); + }); + + it('returns failed when all items fail', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies(); + const stage = mockStage(); + + crawlerExecutionClient.execute + .mockRejectedValueOnce(new Error('fail-1')) + .mockRejectedValueOnce(new Error('fail-2')); + + const result = await executeFanOut(dependencies, stage, ['a', 'b']); + + expect(result.status).toBe('failed'); + expect(result.itemsSucceeded).toBe(0); + expect(result.itemsFailed).toBe(2); + expect(result.itemsTotal).toBe(2); + expect(result.results).toEqual([]); + }); + + it('preserves null results from successful crawlers', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + + crawlerExecutionClient.execute + .mockResolvedValueOnce({ type: 'web', result: 'first' }) + .mockResolvedValueOnce({ type: 'web', result: null }) + .mockResolvedValueOnce({ type: 'web', result: 'third' }); + + const result = await executeFanOut(dependencies, stage, ['a', 'b', 'c']); + + expect(result.status).toBe('completed'); + expect(result.itemsSucceeded).toBe(3); + expect(result.results).toEqual(['first', null, 'third']); + }); + + it('handles empty items array', async () => { + const { dependencies } = createDependencies(); + const stage = mockStage(); + + const result = await executeFanOut(dependencies, stage, []); + + expect(result.results).toEqual([]); + expect(result.itemsTotal).toBe(0); + expect(result.itemsSucceeded).toBe(0); + expect(result.itemsFailed).toBe(0); + // With 0 failed items, status is 'completed' + expect(result.status).toBe('completed'); + }); + + it('logs warning for each failed item', async () => { + const { dependencies, crawlerExecutionClient, logger } = createDependencies(); + const stage = mockStage(); + const error1 = new Error('fail-1'); + const error2 = new Error('fail-2'); + + crawlerExecutionClient.execute + .mockRejectedValueOnce(error1) + .mockRejectedValueOnce(error2); + + 
await executeFanOut(dependencies, stage, ['a', 'b']); + + expect(logger.warn).toHaveBeenCalledTimes(2); + expect(logger.warn).toHaveBeenCalledWith( + 'Fan-out item failed', + error1, + expect.objectContaining({ + function: 'executeFanOut', + metadata: { stageID: MOCK_STAGE_ID, itemIndex: 0 }, + }), + ); + expect(logger.warn).toHaveBeenCalledWith( + 'Fan-out item failed', + error2, + expect.objectContaining({ + function: 'executeFanOut', + metadata: { stageID: MOCK_STAGE_ID, itemIndex: 1 }, + }), + ); + }); + + it('calls execute with correct crawler_id for each item', async () => { + const { dependencies, crawlerExecutionClient } = createDependencies(); + const stage = mockStage(); + + crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: null }); + + const items = [{ url: 'x' }, { url: 'y' }]; + await executeFanOut(dependencies, stage, items); + + expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(2); + expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(1, MOCK_CRAWLER_ID, { url: 'x' }); + expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(2, MOCK_CRAWLER_ID, { url: 'y' }); + }); +}); diff --git a/workers/scheduler-manager-worker/vitest.config.ts b/workers/scheduler-manager-worker/vitest.config.ts index 08b2372..65a0dbe 100644 --- a/workers/scheduler-manager-worker/vitest.config.ts +++ b/workers/scheduler-manager-worker/vitest.config.ts @@ -13,6 +13,13 @@ export default defineWorkersConfig({ SUPABASE_SECRET_KEY: 'test-secret-key', JWT_SECRET: 'test-jwt-secret-key-for-testing-only', }, + workers: [ + { + name: 'audio-underview-crawler-manager-worker', + modules: true, + script: 'export default { async fetch() { return Response.json({ success: true, result: null }); } }', + }, + ], }, }, }, diff --git a/workers/scheduler-manager-worker/wrangler.toml b/workers/scheduler-manager-worker/wrangler.toml index 65742d2..cbb218c 100644 --- a/workers/scheduler-manager-worker/wrangler.toml +++ 
b/workers/scheduler-manager-worker/wrangler.toml @@ -10,6 +10,10 @@ ALLOWED_ORIGINS = "http://localhost:5173,https://audio-underview.pages.dev" # SUPABASE_SECRET_KEY # JWT_SECRET +[[services]] +binding = "CRAWLER_MANAGER" +service = "audio-underview-crawler-manager-worker" + [observability.logs] enabled = true head_sampling_rate = 1 diff --git a/workers/tools/sources/index.ts b/workers/tools/sources/index.ts index 55c7a0b..0fb9e55 100644 --- a/workers/tools/sources/index.ts +++ b/workers/tools/sources/index.ts @@ -5,6 +5,11 @@ export type { BaseEnvironment, OAuthWorkerHandlers, OAuthWorkerOptions, + CrawlerExecuteResult, +} from './types.ts'; + +export { + validateCrawlerExecuteResult, } from './types.ts'; export { diff --git a/workers/tools/sources/types.ts b/workers/tools/sources/types.ts index 9bf12a2..982aeff 100644 --- a/workers/tools/sources/types.ts +++ b/workers/tools/sources/types.ts @@ -39,3 +39,22 @@ export interface OAuthWorkerOptions<Environment extends BaseEnvironment> { logger: Logger; handlers: OAuthWorkerHandlers<Environment>; } + +export interface CrawlerExecuteResult { + type: 'web' | 'data'; + result: unknown; +} + +export function validateCrawlerExecuteResult(value: unknown): CrawlerExecuteResult { + if (value === null || value === undefined || typeof value !== 'object') { + throw new Error('Invalid CrawlerExecuteResult: expected object'); + } + const record = value as Record<string, unknown>; + if (record.type !== 'web' && record.type !== 'data') { + throw new Error(`Invalid CrawlerExecuteResult: expected type 'web' or 'data', got '${String(record.type)}'`); + } + if (!('result' in record)) { + throw new Error('Invalid CrawlerExecuteResult: missing result field'); + } + return { type: record.type, result: record.result }; +}