diff --git a/applications/web/sources/components/schedulers/RunHistorySection.tsx b/applications/web/sources/components/schedulers/RunHistorySection.tsx
index 6b38fd5..06fac48 100644
--- a/applications/web/sources/components/schedulers/RunHistorySection.tsx
+++ b/applications/web/sources/components/schedulers/RunHistorySection.tsx
@@ -223,13 +223,13 @@ export function RunHistorySection({ schedulerID }: RunHistorySectionProperties)
{run.error}
>
)}
- {run.result && (
+ {run.result != null && (
<>
Result
{JSON.stringify(run.result, null, 2)}
>
)}
- {!run.error && !run.result && (
+ {!run.error && run.result == null && (
No additional details.
)}
diff --git a/functions/crawler-code-runner-function/sources/index.ts b/functions/crawler-code-runner-function/sources/index.ts
index 2e0ca50..fec1d7c 100644
--- a/functions/crawler-code-runner-function/sources/index.ts
+++ b/functions/crawler-code-runner-function/sources/index.ts
@@ -1,4 +1,4 @@
-import { createServerLogger } from '@audio-underview/logger';
+import { createServerLogger, Logger } from '@audio-underview/logger';
import {
type LambdaEvent,
type LambdaResponse,
@@ -10,12 +10,36 @@ import {
import { createContext, Script } from 'node:vm';
import { lookup } from 'node:dns/promises';
-interface RunRequestBody {
- type: 'test' | 'run';
+/**
+ * Request body for `type: 'web'`: the function fetches `url` and runs `code`
+ * against the fetched response text. `mode` is echoed back in the response.
+ */
+interface WebRunRequestBody {
+ type: 'web';
+ mode: 'test' | 'run';
url: string;
code: string;
}
+/**
+ * Request body for `type: 'data'`: `code` is run directly against the
+ * caller-supplied `data` value; no URL is fetched. `mode` is echoed back in
+ * the response.
+ */
+interface DataRunRequestBody {
+ type: 'data';
+ mode: 'test' | 'run';
+ data: unknown;
+ code: string;
+}
+
+/** Any accepted POST /run body, discriminated by the `type` field. */
+type RunRequestBody = WebRunRequestBody | DataRunRequestBody;
+
+/**
+ * Thrown by executeInSandbox when user code exceeds the execution time limit.
+ * handleRun maps it to the `execution_timeout` error response.
+ */
+class SandboxTimeoutError extends Error {
+ constructor(message: string) {
+ super(message);
+ // Set explicitly so logs show the subclass name rather than "Error".
+ this.name = 'SandboxTimeoutError';
+ }
+}
+
+/**
+ * Thrown by executeInSandbox for any non-timeout failure of user code.
+ * handleRun maps it to the `execution_failed` error response.
+ */
+class SandboxExecutionError extends Error {
+ constructor(message: string) {
+ super(message);
+ // Set explicitly so logs show the subclass name rather than "Error".
+ this.name = 'SandboxExecutionError';
+ }
+}
+
const FETCH_TIMEOUT_MILLISECONDS = 10_000;
const CODE_EXECUTION_TIMEOUT_MILLISECONDS = 5_000;
const MAX_CODE_LENGTH = 10_000;
@@ -67,6 +91,90 @@ async function validateTargetURL(targetURL: URL, context: ResponseContext): Prom
return null;
}
+/**
+ * Compiles `code` (the source text of a JavaScript function expression) in a
+ * new `node:vm` context that exposes only the allow-listed globals below,
+ * calls the resulting function with `argument`, and returns what it produces.
+ *
+ * Evaluation of the function expression, the synchronous part of the call,
+ * and the awaited result are each bounded by
+ * CODE_EXECUTION_TIMEOUT_MILLISECONDS.
+ *
+ * @param code - Function expression source; it is wrapped in parentheses and evaluated.
+ * @param argument - Value handed to the user function. Strings are passed
+ *   as-is; any other value is round-tripped through JSON first.
+ * @param logger - Receives an error entry for every failed execution.
+ * @returns The (awaited) return value of the user function, with `undefined`
+ *   normalized to `null`.
+ * @throws SandboxTimeoutError when one of the time limits is exceeded.
+ * @throws SandboxExecutionError for every other failure (syntax error,
+ *   non-function code, exception thrown by user code, unserializable input).
+ */
+async function executeInSandbox(code: string, argument: unknown, logger: Logger): Promise<{ result: unknown }> {
+  const sandbox = createContext({
+    Array,
+    Boolean,
+    Date,
+    Error,
+    JSON,
+    Map,
+    Math,
+    Number,
+    Object,
+    Promise,
+    RegExp,
+    Set,
+    String,
+    TypeError,
+    RangeError,
+    URL,
+    URLSearchParams,
+    parseInt,
+    parseFloat,
+    isNaN,
+    isFinite,
+    encodeURIComponent,
+    decodeURIComponent,
+    encodeURI,
+    decodeURI,
+    undefined,
+    NaN,
+    Infinity,
+  });
+
+  // Unique sentinel so the async timeout is recognized by identity; user code
+  // that throws an Error with the same message is not mistaken for a timeout.
+  const asyncTimeoutError = new Error('Async execution timed out');
+
+  try {
+    // For non-string arguments, re-create the value via JSON.parse evaluated
+    // inside the sandbox context. Done inside the try block so serialization
+    // failures surface as SandboxExecutionError like every other failure.
+    // NOTE(review): JSON and Array in the context are the host intrinsics
+    // passed to createContext above; confirm this round-trip is still needed.
+    let sandboxArgument: unknown = argument;
+    if (typeof argument !== 'string') {
+      sandbox.__rawInput__ = JSON.stringify(argument);
+      new Script('globalThis.__input__ = JSON.parse(globalThis.__rawInput__)').runInContext(sandbox);
+      sandboxArgument = sandbox.__input__;
+      delete sandbox.__rawInput__;
+      delete sandbox.__input__;
+    }
+
+    const userFunction: unknown = new Script(`(${code})`).runInContext(sandbox, {
+      timeout: CODE_EXECUTION_TIMEOUT_MILLISECONDS,
+    });
+    if (typeof userFunction !== 'function') {
+      throw new TypeError('Code must evaluate to a function');
+    }
+
+    // Invoke the function from a script running inside the context so the vm
+    // timeout also covers the synchronous part of the call. A direct host-side
+    // call would let `while (true) {}` block the event loop indefinitely, and
+    // the setTimeout-based race below could never fire.
+    sandbox.__userFunction__ = userFunction;
+    sandbox.__argument__ = sandboxArgument;
+    let pending: unknown;
+    try {
+      pending = new Script('__userFunction__(__argument__)').runInContext(sandbox, {
+        timeout: CODE_EXECUTION_TIMEOUT_MILLISECONDS,
+      });
+    } finally {
+      delete sandbox.__userFunction__;
+      delete sandbox.__argument__;
+    }
+
+    // NOTE(review): code that busy-loops after its first `await` runs outside
+    // the vm timeout; consider createContext's `microtaskMode` option.
+    let timer: ReturnType<typeof setTimeout> | undefined;
+    const asyncTimeout = new Promise<never>((_, reject) => {
+      timer = setTimeout(() => reject(asyncTimeoutError), CODE_EXECUTION_TIMEOUT_MILLISECONDS);
+      // Do not keep the process alive just for this timer.
+      timer.unref();
+    });
+    let result: unknown;
+    try {
+      result = await Promise.race([pending, asyncTimeout]);
+    } finally {
+      clearTimeout(timer);
+    }
+
+    // Normalize undefined to null to prevent JSON.stringify field drop
+    if (result === undefined) {
+      result = null;
+    }
+
+    return { result };
+  } catch (executionError) {
+    logger.error('Code execution failed', executionError, { function: 'executeInSandbox' });
+    const timeoutMessage = `Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`;
+    if (executionError === asyncTimeoutError) {
+      throw new SandboxTimeoutError(timeoutMessage);
+    }
+    if (executionError != null && typeof executionError === 'object') {
+      if ('code' in executionError && executionError.code === 'ERR_SCRIPT_EXECUTION_TIMEOUT') {
+        throw new SandboxTimeoutError(timeoutMessage);
+      }
+      // Duck-type the message instead of using `instanceof Error`, so error
+      // objects that do not inherit from the host Error keep their message.
+      if ('message' in executionError && typeof executionError.message === 'string') {
+        throw new SandboxExecutionError(executionError.message);
+      }
+    }
+    throw new SandboxExecutionError('Unknown execution error');
+  }
+}
+
const logger = createServerLogger({
defaultContext: {
module: 'crawler-code-runner-function',
@@ -81,24 +189,58 @@ const HELP = {
{
method: 'POST',
path: '/run',
- description: 'Fetch a URL and run code against the response body',
+ description: 'Run code against a fetched URL response (web) or provided data (data)',
body: {
- type: "'test' | 'run'",
- url: 'string - The URL to fetch',
- code: 'string - JavaScript function source to execute against the fetched response body',
+ type: "'web' | 'data'",
+ mode: "'test' | 'run'",
+ url: "string - The URL to fetch (required for type 'web')",
+ data: "unknown - The data to process (required for type 'data')",
+ code: 'string - JavaScript function source to execute against the input',
},
},
],
};
-function isRunRequestBody(value: unknown): value is RunRequestBody {
- if (value == null || typeof value !== 'object') return false;
- const object = value as Record;
- return (
- (object.type === 'test' || object.type === 'run')
- && typeof object.url === 'string'
- && typeof object.code === 'string'
- );
+/**
+ * Validates a parsed POST /run body.
+ *
+ * Checks run in a fixed order (`type`, `mode`, `code`, then the field specific
+ * to the type) and the first failure wins.
+ *
+ * @param raw - The JSON-parsed request body.
+ * @returns A normalized RunRequestBody containing only the known fields, or a
+ *   human-readable error message describing the first invalid field.
+ */
+function validateRunRequestBody(raw: unknown): RunRequestBody | string {
+  if (raw == null || typeof raw !== 'object') {
+    return 'Request body must be a JSON object';
+  }
+  const object = raw as Record<string, unknown>;
+  // Narrow through locals so the returned object needs no type assertions.
+  const { type, mode, code, url } = object;
+
+  if (type !== 'web' && type !== 'data') {
+    return "Field 'type' must be 'web' or 'data'";
+  }
+
+  if (mode !== 'test' && mode !== 'run') {
+    return "Field 'mode' must be 'test' or 'run'";
+  }
+
+  if (typeof code !== 'string') {
+    return "Field 'code' is required and must be a string";
+  }
+
+  if (type === 'web') {
+    if (typeof url !== 'string') {
+      return "Field 'url' is required and must be a string when type is 'web'";
+    }
+    return { type, mode, url, code };
+  }
+
+  // type === 'data': the key must be present, but any JSON value (including
+  // null) is accepted as the data itself.
+  if (!('data' in object)) {
+    return "Field 'data' is required when type is 'data'";
+  }
+  return { type, mode, data: object.data, code };
}
async function handleRun(body: string | undefined, context: ResponseContext): Promise {
@@ -109,116 +251,76 @@ async function handleRun(body: string | undefined, context: ResponseContext): Pr
return errorResponse('invalid_request', 'Request body must be valid JSON', 400, context);
}
- if (!isRunRequestBody(raw)) {
- const object = raw != null && typeof raw === 'object' ? raw as Record : {};
- if (!object.type || (object.type !== 'test' && object.type !== 'run')) {
- return errorResponse('invalid_request', "Field 'type' must be 'test' or 'run'", 400, context);
- }
- if (!object.url || typeof object.url !== 'string') {
- return errorResponse('invalid_request', "Field 'url' is required and must be a string", 400, context);
- }
- return errorResponse('invalid_request', "Field 'code' is required and must be a string", 400, context);
+ const validated = validateRunRequestBody(raw);
+ if (typeof validated === 'string') {
+ return errorResponse('invalid_request', validated, 400, context);
}
- const parsed = raw;
+ const parsed = validated;
if (parsed.code.length > MAX_CODE_LENGTH) {
return errorResponse('invalid_request', `Field 'code' exceeds maximum length of ${MAX_CODE_LENGTH} characters`, 400, context);
}
- let targetURL: URL;
- try {
- targetURL = new URL(parsed.url);
- } catch {
- return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context);
- }
+ if (parsed.type === 'web') {
+ let targetURL: URL;
+ try {
+ targetURL = new URL(parsed.url);
+ } catch {
+ return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context);
+ }
- const ssrfError = await validateTargetURL(targetURL, context);
- if (ssrfError) {
- return ssrfError;
- }
+ const ssrfError = await validateTargetURL(targetURL, context);
+ if (ssrfError) {
+ return ssrfError;
+ }
- let responseText: string;
- try {
- const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS);
- logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' });
- const fetchResponse = await fetch(targetURL.toString(), { signal });
- responseText = await fetchResponse.text();
- logger.info('Target URL fetched', {
- status: fetchResponse.status,
- contentLength: responseText.length,
- }, { function: 'handleRun' });
- } catch (fetchError) {
- if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') {
- logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' });
- return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context);
+ let responseText: string;
+ try {
+ const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS);
+ logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' });
+ const fetchResponse = await fetch(targetURL.toString(), { signal });
+ responseText = await fetchResponse.text();
+ logger.info('Target URL fetched', {
+ status: fetchResponse.status,
+ contentLength: responseText.length,
+ }, { function: 'handleRun' });
+ } catch (fetchError) {
+ if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') {
+ logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' });
+ return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context);
+ }
+ logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' });
+ return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context);
}
- logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' });
- return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context);
- }
- let result: unknown;
- try {
- const sandbox = createContext({
- Array,
- Boolean,
- Date,
- Error,
- JSON,
- Map,
- Math,
- Number,
- Object,
- Promise,
- RegExp,
- Set,
- String,
- TypeError,
- RangeError,
- parseInt,
- parseFloat,
- isNaN,
- isFinite,
- encodeURIComponent,
- decodeURIComponent,
- encodeURI,
- decodeURI,
- undefined,
- NaN,
- Infinity,
- });
- const script = new Script(`(${parsed.code})`);
- const userFunction = script.runInContext(sandbox, { timeout: CODE_EXECUTION_TIMEOUT_MILLISECONDS });
- let timer: ReturnType | undefined;
- const asyncTimeout = new Promise((_, reject) => {
- timer = setTimeout(() => reject(new Error('Async execution timed out')), CODE_EXECUTION_TIMEOUT_MILLISECONDS);
- timer.unref();
- });
try {
- result = await Promise.race([userFunction(responseText), asyncTimeout]);
- } finally {
- clearTimeout(timer);
+ const { result } = await executeInSandbox(parsed.code, responseText, logger);
+ return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context);
+ } catch (executionError) {
+ if (executionError instanceof SandboxTimeoutError) {
+ return errorResponse('execution_timeout', executionError.message, 422, context);
+ }
+ if (executionError instanceof SandboxExecutionError) {
+ return errorResponse('execution_failed', executionError.message, 422, context);
+ }
+ return errorResponse('execution_failed', 'Unknown execution error', 422, context);
}
+ }
+
+ // type === 'data'
+ try {
+ const { result } = await executeInSandbox(parsed.code, parsed.data, logger);
+ return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context);
} catch (executionError) {
- logger.error('Code execution failed', executionError, { function: 'handleRun' });
- if (
- executionError != null
- && typeof executionError === 'object'
- && 'code' in executionError
- && executionError.code === 'ERR_SCRIPT_EXECUTION_TIMEOUT'
- ) {
- return errorResponse('execution_timeout', `Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`, 422, context);
+ if (executionError instanceof SandboxTimeoutError) {
+ return errorResponse('execution_timeout', executionError.message, 422, context);
}
- const message = executionError instanceof Error
- ? executionError.message
- : 'Unknown execution error';
- if (message === 'Async execution timed out') {
- return errorResponse('execution_timeout', `Code execution timed out after ${CODE_EXECUTION_TIMEOUT_MILLISECONDS}ms`, 422, context);
+ if (executionError instanceof SandboxExecutionError) {
+ return errorResponse('execution_failed', executionError.message, 422, context);
}
- return errorResponse('execution_failed', message, 422, context);
+ return errorResponse('execution_failed', 'Unknown execution error', 422, context);
}
-
- return jsonResponse({ type: parsed.type, result }, 200, context);
}
export async function handler(event: LambdaEvent): Promise {
diff --git a/functions/crawler-code-runner-function/tests/index.test.ts b/functions/crawler-code-runner-function/tests/index.test.ts
index b498c9a..fae66d5 100644
--- a/functions/crawler-code-runner-function/tests/index.test.ts
+++ b/functions/crawler-code-runner-function/tests/index.test.ts
@@ -109,7 +109,7 @@ describe('crawler-code-runner-function', () => {
path: '/run',
origin: 'https://example.com',
contentType: 'application/json',
- body: JSON.stringify({ url: 'https://example.com', code: '(x) => x' }),
+ body: JSON.stringify({ mode: 'test', url: 'https://example.com', code: '(x) => x' }),
});
const response = await handler(event);
@@ -125,22 +125,55 @@ describe('crawler-code-runner-function', () => {
path: '/run',
origin: 'https://example.com',
contentType: 'application/json',
- body: JSON.stringify({ type: 'invalid', url: 'https://example.com', code: '(x) => x' }),
+ body: JSON.stringify({ type: 'invalid', mode: 'test', url: 'https://example.com', code: '(x) => x' }),
});
const response = await handler(event);
expect(response.statusCode).toBe(400);
const body = JSON.parse(response.body);
expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('type');
});
- it('returns 400 for missing url field', async () => {
+ it('returns 400 for missing mode field', async () => {
const event = createEvent({
method: 'POST',
path: '/run',
origin: 'https://example.com',
contentType: 'application/json',
- body: JSON.stringify({ type: 'run', code: '(x) => x' }),
+ body: JSON.stringify({ type: 'web', url: 'https://example.com', code: '(x) => x' }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(400);
+ const body = JSON.parse(response.body);
+ expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('mode');
+ });
+
+ it('returns 400 for invalid mode value', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({ type: 'web', mode: 'invalid', url: 'https://example.com', code: '(x) => x' }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(400);
+ const body = JSON.parse(response.body);
+ expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('mode');
+ });
+
+ it('returns 400 for missing url field when type is web', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({ type: 'web', mode: 'run', code: '(x) => x' }),
});
const response = await handler(event);
@@ -150,13 +183,29 @@ describe('crawler-code-runner-function', () => {
expect(body.error_description).toContain('url');
});
+ it('returns 400 for missing data field when type is data', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({ type: 'data', mode: 'run', code: '(x) => x' }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(400);
+ const body = JSON.parse(response.body);
+ expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('data');
+ });
+
it('returns 400 for missing code field', async () => {
const event = createEvent({
method: 'POST',
path: '/run',
origin: 'https://example.com',
contentType: 'application/json',
- body: JSON.stringify({ type: 'run', url: 'https://example.com' }),
+ body: JSON.stringify({ type: 'web', mode: 'run', url: 'https://example.com' }),
});
const response = await handler(event);
@@ -172,7 +221,7 @@ describe('crawler-code-runner-function', () => {
path: '/run',
origin: 'https://example.com',
contentType: 'application/json',
- body: JSON.stringify({ type: 'run', url: 'not-a-url', code: '(x) => x' }),
+ body: JSON.stringify({ type: 'web', mode: 'run', url: 'not-a-url', code: '(x) => x' }),
});
const response = await handler(event);
@@ -183,7 +232,7 @@ describe('crawler-code-runner-function', () => {
});
});
- describe('POST /run fetch failures', () => {
+ describe('POST /run web type - fetch failures', () => {
it('returns 502 when target URL fetch fails', async () => {
vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('Network error')));
@@ -193,7 +242,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '(text) => text',
}),
@@ -208,7 +258,7 @@ describe('crawler-code-runner-function', () => {
});
});
- describe('POST /run execution failures', () => {
+ describe('POST /run web type - execution failures', () => {
beforeEach(() => {
vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
status: 200,
@@ -227,7 +277,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '(text) => { throw new Error("intentional error"); }',
}),
@@ -247,7 +298,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '((( invalid syntax',
}),
@@ -260,12 +312,12 @@ describe('crawler-code-runner-function', () => {
});
});
- describe('POST /run successful execution', () => {
+ describe('POST /run web type - successful execution', () => {
afterEach(() => {
vi.unstubAllGlobals();
});
- it('executes code and returns result with type test', async () => {
+ it('executes code and returns result with mode test', async () => {
vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
status: 200,
text: () => Promise.resolve('hello world'),
@@ -277,7 +329,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'test',
+ type: 'web',
+ mode: 'test',
url: 'https://target.example.com/data',
code: '(text) => text.toUpperCase()',
}),
@@ -286,11 +339,12 @@ describe('crawler-code-runner-function', () => {
expect(response.statusCode).toBe(200);
const body = JSON.parse(response.body);
- expect(body.type).toBe('test');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('test');
expect(body.result).toBe('HELLO WORLD');
});
- it('executes code and returns result with type run', async () => {
+ it('executes code and returns result with mode run', async () => {
vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
status: 200,
text: () => Promise.resolve('hello world'),
@@ -302,7 +356,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '(text) => text.length',
}),
@@ -311,7 +366,8 @@ describe('crawler-code-runner-function', () => {
expect(response.statusCode).toBe(200);
const body = JSON.parse(response.body);
- expect(body.type).toBe('run');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('run');
expect(body.result).toBe(11);
});
@@ -327,7 +383,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/page',
code: '(text) => ({ length: text.length, hasTitle: text.includes("") })',
}),
@@ -336,7 +393,8 @@ describe('crawler-code-runner-function', () => {
expect(response.statusCode).toBe(200);
const body = JSON.parse(response.body);
- expect(body.type).toBe('run');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('run');
expect(body.result.length).toBe(43);
expect(body.result.hasTitle).toBe(true);
});
@@ -353,7 +411,8 @@ describe('crawler-code-runner-function', () => {
origin: 'https://example.com',
contentType: 'application/json',
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: 'async (text) => text.split(" ")',
}),
@@ -362,7 +421,8 @@ describe('crawler-code-runner-function', () => {
expect(response.statusCode).toBe(200);
const body = JSON.parse(response.body);
- expect(body.type).toBe('run');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('run');
expect(body.result).toEqual(['async', 'test']);
});
@@ -373,7 +433,8 @@ describe('crawler-code-runner-function', () => {
}));
const rawBody = JSON.stringify({
- type: 'test',
+ type: 'web',
+ mode: 'test',
url: 'https://target.example.com/data',
code: '(text) => text.toUpperCase()',
});
@@ -389,9 +450,229 @@ describe('crawler-code-runner-function', () => {
expect(response.statusCode).toBe(200);
const body = JSON.parse(response.body);
- expect(body.type).toBe('test');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('test');
expect(body.result).toBe('HELLO');
});
+
+ it('normalizes undefined result to null', async () => {
+ vi.stubGlobal('fetch', vi.fn().mockResolvedValue({
+ status: 200,
+ text: () => Promise.resolve('hello'),
+ }));
+
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'web',
+ mode: 'run',
+ url: 'https://target.example.com/data',
+ code: '(text) => undefined',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.result).toBeNull();
+ });
+ });
+
+ describe('POST /run data type - successful execution', () => {
+ it('executes code against provided data object', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: { items: [1, 2, 3] },
+ code: '(data) => data.items.length',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('run');
+ expect(body.result).toBe(3);
+ });
+
+ it('executes code against provided data array', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'test',
+ data: [10, 20, 30],
+ code: '(data) => data.map((x) => x * 2)',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('test');
+ expect(body.result).toEqual([20, 40, 60]);
+ });
+
+ it('executes code against provided string data', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: 'hello world',
+ code: '(data) => data.toUpperCase()',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('run');
+ expect(body.result).toBe('HELLO WORLD');
+ });
+
+ it('executes code against null data', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: null,
+ code: '(data) => data === null ? "was null" : "not null"',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('run');
+ expect(body.result).toBe('was null');
+ });
+
+ it('ensures Array.isArray works for array data in sandbox', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'test',
+ data: [1, 2, 3],
+ code: '(data) => Array.isArray(data)',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.result).toBe(true);
+ });
+
+ it('normalizes undefined result to null for data type', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: 'anything',
+ code: '(data) => undefined',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.result).toBeNull();
+ });
+
+ it('does not perform SSRF check for data type', async () => {
+ // data type should not trigger any fetch or DNS lookup
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: { value: 42 },
+ code: '(data) => data.value + 1',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(200);
+ const body = JSON.parse(response.body);
+ expect(body.result).toBe(43);
+ });
+ });
+
+ describe('POST /run data type - execution failures', () => {
+ it('returns 422 when code execution throws an error', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: { value: 1 },
+ code: '(data) => { throw new Error("data processing error"); }',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(422);
+ const body = JSON.parse(response.body);
+ expect(body.error).toBe('execution_failed');
+ expect(body.error_description).toContain('data processing error');
+ });
+
+ it('returns 422 when code is syntactically invalid', async () => {
+ const event = createEvent({
+ method: 'POST',
+ path: '/run',
+ origin: 'https://example.com',
+ contentType: 'application/json',
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: 'test',
+ code: '((( invalid syntax',
+ }),
+ });
+ const response = await handler(event);
+
+ expect(response.statusCode).toBe(422);
+ const body = JSON.parse(response.body);
+ expect(body.error).toBe('execution_failed');
+ });
});
describe('unknown routes', () => {
diff --git a/packages/supabase-connector/migrations/006_add_active_run_unique_constraint.sql b/packages/supabase-connector/migrations/006_add_active_run_unique_constraint.sql
new file mode 100644
index 0000000..798190b
--- /dev/null
+++ b/packages/supabase-connector/migrations/006_add_active_run_unique_constraint.sql
@@ -0,0 +1,19 @@
+-- Migration: Prevent concurrent runs for the same scheduler
+-- Only one run with status 'pending' or 'running' can exist per scheduler at a time.
+-- When a run completes (completed/failed/partially_failed), the index no longer blocks new runs.
+
+-- Clean up duplicate active runs before creating the unique index.
+-- Keeps the most recent active run per scheduler, marks others as failed.
+-- The subquery uses DISTINCT ON (scheduler_id) with created_at DESC to pick
+-- the newest active row per scheduler; every other active row is updated.
+-- NOTE(review): presumably this file runs inside a single transaction so no
+-- new active run can appear between the UPDATE and the CREATE INDEX; confirm
+-- against the migration runner.
+UPDATE scheduler_runs
+SET status = 'failed', error = 'Cleaned up by migration 006', completed_at = NOW()
+WHERE id NOT IN (
+ SELECT DISTINCT ON (scheduler_id) id
+ FROM scheduler_runs
+ WHERE status IN ('pending', 'running')
+ ORDER BY scheduler_id, created_at DESC
+)
+AND status IN ('pending', 'running');
+
+-- Partial unique index: uniqueness on scheduler_id is enforced only among rows
+-- whose status is 'pending' or 'running'.
+CREATE UNIQUE INDEX scheduler_runs_one_active_per_scheduler
+ ON scheduler_runs (scheduler_id)
+ WHERE status IN ('pending', 'running');
diff --git a/packages/supabase-connector/migrations/007_create_crawler_permissions.sql b/packages/supabase-connector/migrations/007_create_crawler_permissions.sql
new file mode 100644
index 0000000..2a520a4
--- /dev/null
+++ b/packages/supabase-connector/migrations/007_create_crawler_permissions.sql
@@ -0,0 +1,26 @@
+-- Migration: Crawler permission system
+-- Controls who can use a crawler in their scheduler stages.
+-- Extensible for marketplace subscriptions.
+
+CREATE TYPE crawler_permission_level AS ENUM ('owner', 'subscriber');
+
+-- One row per (crawler, user) pair; rows are removed automatically when the
+-- referenced crawler or user is deleted (ON DELETE CASCADE).
+CREATE TABLE crawler_permissions (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ crawler_id UUID NOT NULL REFERENCES crawlers(id) ON DELETE CASCADE,
+ user_uuid UUID NOT NULL REFERENCES users(uuid) ON DELETE CASCADE,
+ level crawler_permission_level NOT NULL,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ UNIQUE(crawler_id, user_uuid)
+);
+
+CREATE INDEX crawler_permissions_user_uuid_index ON crawler_permissions(user_uuid);
+-- NOTE(review): the UNIQUE(crawler_id, user_uuid) constraint above is backed
+-- by an index whose leading column is crawler_id, so this separate index may
+-- be redundant; confirm before keeping both.
+CREATE INDEX crawler_permissions_crawler_id_index ON crawler_permissions(crawler_id);
+
+COMMENT ON TABLE crawler_permissions IS 'Controls access to crawlers. Owner = creator, subscriber = marketplace user.';
+COMMENT ON COLUMN crawler_permissions.level IS 'Permission level: owner (full control), subscriber (can use in stages)';
+
+-- Backfill: grant owner permission to existing crawler creators
+-- ON CONFLICT DO NOTHING skips pairs that already have a permission row.
+INSERT INTO crawler_permissions (crawler_id, user_uuid, level)
+SELECT id, user_uuid, 'owner'
+FROM crawlers
+ON CONFLICT (crawler_id, user_uuid) DO NOTHING;
diff --git a/packages/supabase-connector/migrations/008_add_fan_out_strategy.sql b/packages/supabase-connector/migrations/008_add_fan_out_strategy.sql
new file mode 100644
index 0000000..4f8abcc
--- /dev/null
+++ b/packages/supabase-connector/migrations/008_add_fan_out_strategy.sql
@@ -0,0 +1,8 @@
+-- Add fan_out_strategy column to scheduler_stages
+-- 'compact' (default): remove failed items from results
+-- 'preserve': keep failed items as null, preserving positional alignment
+
+CREATE TYPE fan_out_strategy AS ENUM ('compact', 'preserve');
+
+-- NOT NULL with a DEFAULT means existing scheduler_stages rows read as 'compact'.
+ALTER TABLE scheduler_stages
+ ADD COLUMN fan_out_strategy fan_out_strategy NOT NULL DEFAULT 'compact';
diff --git a/packages/supabase-connector/sources/crawler-permissions.ts b/packages/supabase-connector/sources/crawler-permissions.ts
new file mode 100644
index 0000000..1e5d6ab
--- /dev/null
+++ b/packages/supabase-connector/sources/crawler-permissions.ts
@@ -0,0 +1,97 @@
+import type { SupabaseClient } from '@supabase/supabase-js';
+import { traceDatabaseOperation, SpanStatusCode } from '@audio-underview/axiom-logger/tracers';
+import type {
+ Database,
+ CrawlerPermissionRow,
+ CrawlerPermissionLevel,
+} from './types/index.ts';
+
+type SupabaseClientType = SupabaseClient<Database>; // schema-typed client; without the type argument table access is untyped and the Database import is unused
+
+export async function createCrawlerPermission( // Inserts one crawler_permissions row and resolves to the stored row.
+ client: SupabaseClientType,
+ parameters: {
+ crawler_id: string;
+ user_uuid: string;
+ level: CrawlerPermissionLevel;
+ },
+): Promise<CrawlerPermissionRow> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'insert', table: 'crawler_permissions' },
+ async (span) => {
+ span.setAttribute('db.insert.crawler_id', parameters.crawler_id);
+ span.setAttribute('db.insert.user_uuid', parameters.user_uuid);
+ span.setAttribute('db.insert.level', parameters.level);
+
+ const { data, error } = await client
+ .from('crawler_permissions')
+ .insert(parameters)
+ .select()
+ .single(); // one row is inserted, so exactly one row is read back
+
+ if (error) { // any Supabase error is recorded on the span and rethrown with context
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to create crawler permission: ${error.message}`);
+ }
+
+ span.setAttribute('db.rows_affected', 1);
+ return data as CrawlerPermissionRow;
+ },
+ );
+}
+
+export async function getCrawlerPermission( // Looks up the permission a user holds on a crawler; resolves to undefined when there is none.
+ client: SupabaseClientType,
+ crawlerID: string,
+ userUUID: string,
+): Promise<CrawlerPermissionRow | undefined> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'select', table: 'crawler_permissions' },
+ async (span) => {
+ span.setAttribute('db.query.crawler_id', crawlerID);
+ span.setAttribute('db.query.user_uuid', userUUID);
+
+ const { data, error } = await client
+ .from('crawler_permissions')
+ .select()
+ .eq('crawler_id', crawlerID)
+ .eq('user_uuid', userUUID)
+ .maybeSingle(); // zero matching rows yields data === null instead of an error
+
+ if (error) {
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to get crawler permission: ${error.message}`);
+ }
+
+ span.setAttribute('db.rows_affected', data === null ? 0 : 1);
+ return (data as CrawlerPermissionRow | null) ?? undefined; // normalise "not found" from null to undefined
+ },
+ );
+}
+
+export async function deleteCrawlerPermission( // Removes a user's permission on a crawler; deleting a non-existent permission is not an error.
+ client: SupabaseClientType,
+ crawlerID: string,
+ userUUID: string,
+): Promise<void> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'delete', table: 'crawler_permissions' },
+ async (span) => {
+ span.setAttribute('db.query.crawler_id', crawlerID);
+ span.setAttribute('db.query.user_uuid', userUUID);
+
+ const { error, count } = await client
+ .from('crawler_permissions')
+ .delete({ count: 'exact' }) // ask PostgREST how many rows were actually deleted
+ .eq('crawler_id', crawlerID)
+ .eq('user_uuid', userUUID);
+
+ if (error) {
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to delete crawler permission: ${error.message}`);
+ }
+
+ span.setAttribute('db.rows_affected', count ?? 0); // 0 when no permission matched, instead of always reporting 1
+ },
+ );
+}
diff --git a/packages/supabase-connector/sources/crawlers.ts b/packages/supabase-connector/sources/crawlers.ts
index 2d3b030..6dd7546 100644
--- a/packages/supabase-connector/sources/crawlers.ts
+++ b/packages/supabase-connector/sources/crawlers.ts
@@ -92,6 +92,44 @@ export async function listCrawlersByUser(
);
}
+/**
+ * Gets a single crawler by ID without ownership check.
+ * Used by the execution engine where the crawler may belong to any user.
+ *
+ * @param client - Supabase client
+ * @param id - Crawler ID
+ * @returns Crawler row if found, null otherwise
+ */
+export async function getCrawlerByID(
+ client: SupabaseClientType,
+ id: string,
+): Promise<CrawlerRow | null> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'select', table: 'crawlers' },
+ async (span) => {
+ span.setAttribute('db.query.id', id);
+
+ const { data, error } = await client
+ .from('crawlers')
+ .select('*')
+ .eq('id', id)
+ .single();
+
+ if (error) {
+ if (error.code === 'PGRST116') { // PostgREST could not return exactly one row; treated here as "not found"
+ span.setAttribute('db.rows_affected', 0);
+ return null;
+ }
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to get crawler by ID: ${error.message}`);
+ }
+
+ span.setAttribute('db.rows_affected', 1);
+ return data as CrawlerRow;
+ }
+ );
+}
+
/**
* Gets a single crawler by ID, verifying ownership.
*
diff --git a/packages/supabase-connector/sources/index.ts b/packages/supabase-connector/sources/index.ts
index 592ac59..e7a4477 100644
--- a/packages/supabase-connector/sources/index.ts
+++ b/packages/supabase-connector/sources/index.ts
@@ -7,6 +7,7 @@ export type {
AccountRow,
CrawlerRow,
SchedulerRow,
+ FanOutStrategy,
SchedulerStageRow,
SchedulerRunRow,
SchedulerStageRunRow,
@@ -25,6 +26,8 @@ export type {
SocialLoginInput,
SocialLoginResult,
LinkAccountResult,
+ CrawlerPermissionLevel,
+ CrawlerPermissionRow,
SupabaseConnectorConfiguration,
Database,
} from './types/index.ts';
@@ -52,10 +55,18 @@ export {
createCrawler,
listCrawlersByUser,
getCrawler,
+ getCrawlerByID,
updateCrawler,
deleteCrawler,
} from './crawlers.ts';
+// Crawler permission operations
+export {
+ createCrawlerPermission,
+ getCrawlerPermission,
+ deleteCrawlerPermission,
+} from './crawler-permissions.ts';
+
// Scheduler operations
export type { PaginatedSchedulers } from './schedulers.ts';
export {
@@ -84,3 +95,10 @@ export {
updateSchedulerRun,
listSchedulerRuns,
} from './scheduler-runs.ts';
+
+// Scheduler stage run operations
+export {
+ createSchedulerStageRun,
+ updateSchedulerStageRun,
+ listSchedulerStageRunsByRun,
+} from './scheduler-stage-runs.ts';
diff --git a/packages/supabase-connector/sources/scheduler-runs.ts b/packages/supabase-connector/sources/scheduler-runs.ts
index 2e98261..399c48e 100644
--- a/packages/supabase-connector/sources/scheduler-runs.ts
+++ b/packages/supabase-connector/sources/scheduler-runs.ts
@@ -2,6 +2,7 @@ import type { SupabaseClient } from '@supabase/supabase-js';
import { traceDatabaseOperation, SpanStatusCode } from '@audio-underview/axiom-logger/tracers';
import type {
Database,
+ SchedulerRunStatus,
SchedulerRunRow,
SchedulerRunsInsert,
SchedulerRunsUpdate,
@@ -79,6 +80,7 @@ export async function updateSchedulerRun(
id: string,
schedulerID: string,
input: SchedulerRunsUpdate,
+ options?: { onlyIfStatus?: readonly SchedulerRunStatus[] },
): Promise {
return traceDatabaseOperation(
{ serviceName: 'supabase-connector', operation: 'update', table: 'scheduler_runs' },
@@ -86,11 +88,17 @@ export async function updateSchedulerRun(
span.setAttribute('db.update.id', id);
span.setAttribute('db.update.scheduler_id', schedulerID);
- const { data, error } = await client
+ let query = client
.from('scheduler_runs')
.update(input)
.eq('id', id)
- .eq('scheduler_id', schedulerID)
+ .eq('scheduler_id', schedulerID);
+
+ if (options?.onlyIfStatus !== undefined) {
+ query = query.in('status', options.onlyIfStatus);
+ }
+
+ const { data, error } = await query
.select()
.single();
diff --git a/packages/supabase-connector/sources/scheduler-stage-runs.ts b/packages/supabase-connector/sources/scheduler-stage-runs.ts
new file mode 100644
index 0000000..85756fc
--- /dev/null
+++ b/packages/supabase-connector/sources/scheduler-stage-runs.ts
@@ -0,0 +1,101 @@
+import type { SupabaseClient } from '@supabase/supabase-js';
+import { traceDatabaseOperation, SpanStatusCode } from '@audio-underview/axiom-logger/tracers';
+import type {
+ Database,
+ SchedulerStageRunRow,
+ SchedulerStageRunsInsert,
+ SchedulerStageRunsUpdate,
+} from './types/index.ts';
+
+type SupabaseClientType = SupabaseClient<Database>; // schema-typed client; without the type argument table access is untyped and the Database import is unused
+
+export async function createSchedulerStageRun( // Inserts one scheduler_stage_runs row and resolves to the stored row.
+ client: SupabaseClientType,
+ input: SchedulerStageRunsInsert,
+): Promise<SchedulerStageRunRow> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'insert', table: 'scheduler_stage_runs' },
+ async (span) => {
+ span.setAttribute('db.insert.run_id', input.run_id);
+ span.setAttribute('db.insert.stage_id', input.stage_id);
+ span.setAttribute('db.insert.stage_order', input.stage_order);
+
+ const { data, error } = await client
+ .from('scheduler_stage_runs')
+ .insert(input)
+ .select()
+ .single(); // one row is inserted, so exactly one row is read back
+
+ if (error) { // any Supabase error is recorded on the span and rethrown with context
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to create scheduler stage run: ${error.message}`);
+ }
+
+ span.setAttribute('db.rows_affected', 1);
+ span.setAttribute('db.created_id', (data as SchedulerStageRunRow).id);
+ return data as SchedulerStageRunRow;
+ },
+ );
+}
+
+export async function updateSchedulerStageRun( // Updates the stage run matching both id and run_id; resolves to null when no such row exists.
+ client: SupabaseClientType,
+ id: string,
+ runID: string,
+ input: SchedulerStageRunsUpdate,
+): Promise<SchedulerStageRunRow | null> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'update', table: 'scheduler_stage_runs' },
+ async (span) => {
+ span.setAttribute('db.update.id', id);
+ span.setAttribute('db.update.run_id', runID);
+
+ const { data, error } = await client
+ .from('scheduler_stage_runs')
+ .update(input)
+ .eq('id', id)
+ .eq('run_id', runID)
+ .select()
+ .single();
+
+ if (error) {
+ if (error.code === 'PGRST116') { // PostgREST could not return exactly one row; treated here as "nothing updated"
+ span.setAttribute('db.rows_affected', 0);
+ return null;
+ }
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to update scheduler stage run: ${error.message}`);
+ }
+
+ span.setAttribute('db.rows_affected', 1);
+ return data as SchedulerStageRunRow;
+ },
+ );
+}
+
+export async function listSchedulerStageRunsByRun( // Lists every stage run of a scheduler run, ordered by ascending stage_order.
+ client: SupabaseClientType,
+ runID: string,
+): Promise<SchedulerStageRunRow[]> {
+ return traceDatabaseOperation(
+ { serviceName: 'supabase-connector', operation: 'select', table: 'scheduler_stage_runs' },
+ async (span) => {
+ span.setAttribute('db.query.run_id', runID);
+
+ const { data, error } = await client
+ .from('scheduler_stage_runs')
+ .select('*')
+ .eq('run_id', runID)
+ .order('stage_order', { ascending: true });
+
+ if (error) {
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+ throw new Error(`Failed to list scheduler stage runs: ${error.message}`);
+ }
+
+ const stageRuns = (data ?? []) as SchedulerStageRunRow[]; // a null payload is returned to callers as an empty list
+ span.setAttribute('db.rows_affected', stageRuns.length);
+ return stageRuns;
+ },
+ );
+}
diff --git a/packages/supabase-connector/sources/types/database.ts b/packages/supabase-connector/sources/types/database.ts
index 6eaaeaa..0aba278 100644
--- a/packages/supabase-connector/sources/types/database.ts
+++ b/packages/supabase-connector/sources/types/database.ts
@@ -97,7 +97,12 @@ export interface SchedulerRow {
* (e.g. { url: { type: "string", default: "https://..." } }).
* output_schema is derived from the crawler's output_schema.
* fan_out_field names the array field in previous output to fan-out over.
+ * fan_out_strategy controls how failed items are handled:
+ * 'compact' (default) — remove failed items from results
+ * 'preserve' — keep failed items as null, preserving positional alignment
*/
+export type FanOutStrategy = 'compact' | 'preserve';
+
export interface SchedulerStageRow {
[key: string]: unknown;
id: string;
@@ -107,6 +112,7 @@ export interface SchedulerStageRow {
input_schema: Record;
output_schema: Record;
fan_out_field: string | null;
+ fan_out_strategy: FanOutStrategy;
created_at: string;
}
@@ -126,7 +132,7 @@ export interface SchedulerRunRow {
status: SchedulerRunStatus;
started_at: string | null;
completed_at: string | null;
- result: Record | null;
+ result: unknown;
error: string | null;
created_at: string;
}
@@ -144,8 +150,8 @@ export interface SchedulerStageRunRow {
status: SchedulerRunStatus;
started_at: string | null;
completed_at: string | null;
- input: Record | null;
- output: Record | null;
+ input: unknown;
+ output: unknown;
error: string | null;
items_total: number | null;
items_succeeded: number | null;
@@ -153,6 +159,24 @@ export interface SchedulerStageRunRow {
created_at: string;
}
+/**
+ * Crawler permission level type (values of the crawler_permission_level database enum)
+ */
+export type CrawlerPermissionLevel = 'owner' | 'subscriber';
+
+/**
+ * Crawler permission table row type
+ * Controls who can use a crawler in their scheduler stages.
+ */
+export interface CrawlerPermissionRow {
+ [key: string]: unknown;
+ id: string;
+ crawler_id: string; // references crawlers.id
+ user_uuid: string; // references users.uuid
+ level: CrawlerPermissionLevel;
+ created_at: string;
+}
+
/**
* Supabase connector configuration
*/
@@ -249,6 +273,7 @@ export interface Database {
input_schema: Record;
output_schema?: Record;
fan_out_field?: string | null;
+ fan_out_strategy?: FanOutStrategy;
};
Update: Partial>;
Relationships: [
@@ -277,7 +302,7 @@ export interface Database {
status?: SchedulerRunStatus;
started_at?: string | null;
completed_at?: string | null;
- result?: Record | null;
+ result?: unknown;
error?: string | null;
};
Update: Partial>;
@@ -302,8 +327,8 @@ export interface Database {
status?: SchedulerRunStatus;
started_at?: string | null;
completed_at?: string | null;
- input?: Record | null;
- output?: Record | null;
+ input?: unknown;
+ output?: unknown;
error?: string | null;
items_total?: number | null;
items_succeeded?: number | null;
@@ -327,6 +352,33 @@ export interface Database {
},
];
};
+ crawler_permissions: {
+ Row: CrawlerPermissionRow;
+ Insert: {
+ [key: string]: unknown;
+ id?: string;
+ crawler_id: string;
+ user_uuid: string;
+ level: CrawlerPermissionLevel;
+ };
+ Update: Partial>;
+ Relationships: [
+ {
+ foreignKeyName: 'crawler_permissions_crawler_id_fkey';
+ columns: ['crawler_id'];
+ isOneToOne: false;
+ referencedRelation: 'crawlers';
+ referencedColumns: ['id'];
+ },
+ {
+ foreignKeyName: 'crawler_permissions_user_uuid_fkey';
+ columns: ['user_uuid'];
+ isOneToOne: false;
+ referencedRelation: 'users';
+ referencedColumns: ['uuid'];
+ },
+ ];
+ };
};
Views: Record;
Functions: {
@@ -342,6 +394,7 @@ export interface Database {
provider_type: ProviderType;
crawler_type: CrawlerType;
scheduler_run_status: SchedulerRunStatus;
+ crawler_permission_level: CrawlerPermissionLevel;
};
CompositeTypes: Record;
};
diff --git a/packages/supabase-connector/sources/types/index.ts b/packages/supabase-connector/sources/types/index.ts
index 8f28dcd..c6dc7a2 100644
--- a/packages/supabase-connector/sources/types/index.ts
+++ b/packages/supabase-connector/sources/types/index.ts
@@ -6,6 +6,7 @@ export type {
AccountRow,
CrawlerRow,
SchedulerRow,
+ FanOutStrategy,
SchedulerStageRow,
SchedulerRunRow,
SchedulerStageRunRow,
@@ -24,6 +25,8 @@ export type {
SocialLoginInput,
SocialLoginResult,
LinkAccountResult,
+ CrawlerPermissionLevel,
+ CrawlerPermissionRow,
SupabaseConnectorConfiguration,
Database,
} from './database.ts';
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index a950cda..043d55a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -496,6 +496,9 @@ importers:
'@audio-underview/worker-tools':
specifier: workspace:*
version: link:../tools
+ safe-regex2:
+ specifier: ^5.1.0
+ version: 5.1.0
devDependencies:
'@cloudflare/vitest-pool-workers':
specifier: catalog:worker
@@ -4164,6 +4167,10 @@ packages:
engines: {node: '>= 0.4'}
hasBin: true
+ ret@0.5.0:
+ resolution: {integrity: sha512-I1XxrZSQ+oErkRR4jYbAyEEu2I0avBvvMM5JN+6EBprOGRCs63ENqZ3vjavq8fBw2+62G5LF5XelKwuJpcvcxw==}
+ engines: {node: '>=10'}
+
rettime@0.10.1:
resolution: {integrity: sha512-uyDrIlUEH37cinabq0AX4QbgV4HbFZ/gqoiunWQ1UqBtRvTTytwhNYjE++pO/MjPTZL5KQCf2bEoJ/BJNVQ5Kw==}
@@ -4176,6 +4183,10 @@ packages:
resolution: {integrity: sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==}
engines: {node: '>=18'}
+ safe-regex2@5.1.0:
+ resolution: {integrity: sha512-pNHAuBW7TrcleFHsxBr5QMi/Iyp0ENjUKz7GCcX1UO7cMh+NmVK6HxQckNL1tJp1XAJVjG6B8OKIPqodqj9rtw==}
+ hasBin: true
+
sass@1.97.3:
resolution: {integrity: sha512-fDz1zJpd5GycprAbu4Q2PV/RprsRtKC/0z82z0JLgdytmcq0+ujJbJ/09bPGDxCLkKY3Np5cRAOcWiVkLXJURg==}
engines: {node: '>=14.0.0'}
@@ -8181,6 +8192,8 @@ snapshots:
path-parse: 1.0.7
supports-preserve-symlinks-flag: 1.0.0
+ ret@0.5.0: {}
+
rettime@0.10.1: {}
rollup@4.53.5:
@@ -8213,6 +8226,10 @@ snapshots:
run-applescript@7.1.0: {}
+ safe-regex2@5.1.0:
+ dependencies:
+ ret: 0.5.0
+
sass@1.97.3:
dependencies:
chokidar: 4.0.3
diff --git a/workers/crawler-code-runner-worker/sources/create-code-runner.ts b/workers/crawler-code-runner-worker/sources/create-code-runner.ts
index 3c5c00c..768a966 100644
--- a/workers/crawler-code-runner-worker/sources/create-code-runner.ts
+++ b/workers/crawler-code-runner-worker/sources/create-code-runner.ts
@@ -1,5 +1,5 @@
export interface CodeRunner {
- execute(data: string): Promise;
+ execute(data: unknown): Promise;
}
export const MAX_CODE_LENGTH = 10_000;
diff --git a/workers/crawler-code-runner-worker/sources/index.ts b/workers/crawler-code-runner-worker/sources/index.ts
index ca5f783..80d768c 100644
--- a/workers/crawler-code-runner-worker/sources/index.ts
+++ b/workers/crawler-code-runner-worker/sources/index.ts
@@ -12,13 +12,24 @@ interface Environment {
LOADER: WorkerLoader;
}
-interface RunRequestBody {
- type: 'test' | 'run';
+interface WebRunRequestBody {
+ type: 'web';
+ mode: 'test' | 'run';
url: string;
code: string;
}
+interface DataRunRequestBody {
+ type: 'data';
+ mode: 'test' | 'run';
+ data: unknown;
+ code: string;
+}
+
+type RunRequestBody = WebRunRequestBody | DataRunRequestBody;
+
const FETCH_TIMEOUT_MILLISECONDS = 10_000;
+const MAX_RESPONSE_BYTES = 10 * 1024 * 1024; // 10MB
const logger = createWorkerLogger({
defaultContext: {
@@ -34,80 +45,150 @@ const HELP = {
{
method: 'POST',
path: '/run',
- description: 'Fetch a URL and run code against the response body',
+ description: 'Run code against a fetched URL response (web) or provided data (data)',
body: {
- type: "'test' | 'run'",
- url: 'string - The URL to fetch',
- code: 'string - JavaScript function source to execute against the fetched response body',
+ type: "'web' | 'data'",
+ mode: "'test' | 'run'",
+ url: "string - The URL to fetch (required for type 'web')",
+ data: "unknown - The data to process (required for type 'data')",
+ code: 'string - JavaScript function source to execute against the input',
},
},
],
};
+function validateRunRequestBody(raw: unknown): RunRequestBody | string { // Returns the typed request body, or an error message string when validation fails.
+ if (raw == null || typeof raw !== 'object') {
+ return 'Request body must be a JSON object';
+ }
+ const object = raw as Record<string, unknown>;
+
+ if (object.type !== 'web' && object.type !== 'data') {
+ return "Field 'type' must be 'web' or 'data'";
+ }
+
+ if (object.mode !== 'test' && object.mode !== 'run') {
+ return "Field 'mode' must be 'test' or 'run'";
+ }
+
+ if (typeof object.code !== 'string' || object.code === '') { // empty code counts as missing, as the previous validation did
+ return "Field 'code' is required and must be a string";
+ }
+
+ if (object.type === 'web') {
+ if (typeof object.url !== 'string') {
+ return "Field 'url' is required and must be a string when type is 'web'";
+ }
+ return {
+ type: 'web',
+ mode: object.mode as 'test' | 'run',
+ url: object.url as string,
+ code: object.code as string,
+ };
+ }
+
+ // type === 'data'
+ if (!('data' in object)) { // presence check only: an explicit null is a valid data payload
+ return "Field 'data' is required when type is 'data'";
+ }
+ return {
+ type: 'data',
+ mode: object.mode as 'test' | 'run',
+ data: object.data,
+ code: object.code as string,
+ };
+}
+
async function handleRun(
request: Request,
environment: Environment,
context: ResponseContext,
): Promise {
- let body: RunRequestBody;
+ let raw: unknown;
try {
- body = await request.json() as RunRequestBody;
+ raw = await request.json();
} catch {
return errorResponse('invalid_request', 'Request body must be valid JSON', 400, context);
}
- if (!body.type || (body.type !== 'test' && body.type !== 'run')) {
- return errorResponse('invalid_request', "Field 'type' must be 'test' or 'run'", 400, context);
- }
-
- if (!body.url || typeof body.url !== 'string') {
- return errorResponse('invalid_request', "Field 'url' is required and must be a string", 400, context);
+ const validated = validateRunRequestBody(raw);
+ if (typeof validated === 'string') {
+ return errorResponse('invalid_request', validated, 400, context);
}
- if (!body.code || typeof body.code !== 'string') {
- return errorResponse('invalid_request', "Field 'code' is required and must be a string", 400, context);
- }
+ const parsed = validated;
- if (body.code.length > MAX_CODE_LENGTH) {
+ if (parsed.code.length > MAX_CODE_LENGTH) {
return errorResponse('invalid_request', `Field 'code' exceeds maximum length of ${MAX_CODE_LENGTH} characters`, 400, context);
}
- let targetURL: URL;
- try {
- targetURL = new URL(body.url);
- } catch {
- return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context);
- }
+ if (parsed.type === 'web') {
+ let targetURL: URL;
+ try {
+ targetURL = new URL(parsed.url);
+ } catch {
+ return errorResponse('invalid_request', "Field 'url' must be a valid URL", 400, context);
+ }
- let responseText: string;
- try {
- const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS);
- logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' });
- const fetchResponse = await fetch(targetURL.toString(), { signal });
- responseText = await fetchResponse.text();
- logger.info('Target URL fetched', {
- status: fetchResponse.status,
- contentLength: responseText.length,
- }, { function: 'handleRun' });
- } catch (fetchError) {
- if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') {
- logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' });
- return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context);
+ let responseText: string;
+ try {
+ const signal = AbortSignal.timeout(FETCH_TIMEOUT_MILLISECONDS);
+ logger.info('Fetching target URL', { url: targetURL.toString() }, { function: 'handleRun' });
+ const fetchResponse = await fetch(targetURL.toString(), { signal });
+
+ const contentLength = fetchResponse.headers.get('Content-Length');
+ if (contentLength !== null && Number(contentLength) > MAX_RESPONSE_BYTES) {
+ logger.warn('Response too large', { url: targetURL.toString(), contentLength }, { function: 'handleRun' });
+ return errorResponse('response_too_large', `Response exceeds maximum size of ${MAX_RESPONSE_BYTES} bytes`, 413, context);
+ }
+
+ responseText = await fetchResponse.text();
+ logger.info('Target URL fetched', {
+ status: fetchResponse.status,
+ contentLength: responseText.length,
+ }, { function: 'handleRun' });
+ } catch (fetchError) {
+ if (fetchError instanceof DOMException && fetchError.name === 'TimeoutError') {
+ logger.error('Fetch timed out', { url: targetURL.toString(), timeoutMilliseconds: FETCH_TIMEOUT_MILLISECONDS }, { function: 'handleRun' });
+ return errorResponse('fetch_timeout', `Fetch timed out after ${FETCH_TIMEOUT_MILLISECONDS}ms`, 504, context);
+ }
+ logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' });
+ return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context);
+ }
+
+ let result: unknown;
+ try {
+ const runner = createCodeRunner(environment.LOADER, parsed.code);
+ result = await runner.execute(responseText);
+ } catch (executionError) {
+ logger.error('Code execution failed', executionError, { function: 'handleRun' });
+ return errorResponse('execution_failed', 'Code execution failed', 422, context);
}
- logger.error('Failed to fetch target URL', { error: fetchError, url: targetURL.toString() }, { function: 'handleRun' });
- return errorResponse('fetch_failed', 'Failed to fetch the target URL', 502, context);
+
+ // Normalize undefined to null to prevent JSON.stringify field drop
+ if (result === undefined) {
+ result = null;
+ }
+
+ return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context);
}
+ // type === 'data'
let result: unknown;
try {
- const runner = createCodeRunner(environment.LOADER, body.code);
- result = await runner.execute(responseText);
+ const runner = createCodeRunner(environment.LOADER, parsed.code);
+ result = await runner.execute(parsed.data);
} catch (executionError) {
logger.error('Code execution failed', executionError, { function: 'handleRun' });
return errorResponse('execution_failed', 'Code execution failed', 422, context);
}
- return jsonResponse({ type: body.type, result }, 200, context);
+ // Normalize undefined to null to prevent JSON.stringify field drop
+ if (result === undefined) {
+ result = null;
+ }
+
+ return jsonResponse({ type: parsed.type, mode: parsed.mode, result }, 200, context);
}
export default {
diff --git a/workers/crawler-code-runner-worker/tests/index.test.ts b/workers/crawler-code-runner-worker/tests/index.test.ts
index b343f4a..6a04715 100644
--- a/workers/crawler-code-runner-worker/tests/index.test.ts
+++ b/workers/crawler-code-runner-worker/tests/index.test.ts
@@ -98,7 +98,7 @@ describe('crawler-code-runner-worker', () => {
const request = new Request(`${WORKER_URL}/run`, {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
- body: JSON.stringify({ url: 'https://example.com', code: '(x) => x' }),
+ body: JSON.stringify({ mode: 'test', url: 'https://example.com', code: '(x) => x' }),
});
const response = await worker.fetch(request, env);
@@ -112,20 +112,49 @@ describe('crawler-code-runner-worker', () => {
const request = new Request(`${WORKER_URL}/run`, {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
- body: JSON.stringify({ type: 'invalid', url: 'https://example.com', code: '(x) => x' }),
+ body: JSON.stringify({ type: 'invalid', mode: 'test', url: 'https://example.com', code: '(x) => x' }),
});
const response = await worker.fetch(request, env);
expect(response.status).toBe(400);
const body = await response.json();
expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('type');
});
- it('returns 400 for missing url field', async () => {
+ it('returns 400 for missing mode field', async () => {
const request = new Request(`${WORKER_URL}/run`, {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
- body: JSON.stringify({ type: 'run', code: '(x) => x' }),
+ body: JSON.stringify({ type: 'web', url: 'https://example.com', code: '(x) => x' }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(400);
+ const body = await response.json();
+ expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('mode');
+ });
+
+ it('returns 400 for invalid mode value', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({ type: 'web', mode: 'invalid', url: 'https://example.com', code: '(x) => x' }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(400);
+ const body = await response.json();
+ expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('mode');
+ });
+
+ it('returns 400 for missing url field when type is web', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({ type: 'web', mode: 'run', code: '(x) => x' }),
});
const response = await worker.fetch(request, env);
@@ -135,11 +164,25 @@ describe('crawler-code-runner-worker', () => {
expect(body.error_description).toContain('url');
});
+ it('returns 400 for missing data field when type is data', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({ type: 'data', mode: 'run', code: '(x) => x' }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(400);
+ const body = await response.json();
+ expect(body.error).toBe('invalid_request');
+ expect(body.error_description).toContain('data');
+ });
+
it('returns 400 for missing code field', async () => {
const request = new Request(`${WORKER_URL}/run`, {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
- body: JSON.stringify({ type: 'run', url: 'https://example.com' }),
+ body: JSON.stringify({ type: 'web', mode: 'run', url: 'https://example.com' }),
});
const response = await worker.fetch(request, env);
@@ -154,7 +197,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: 'x'.repeat(MAX_CODE_LENGTH + 1),
}),
@@ -171,7 +215,7 @@ describe('crawler-code-runner-worker', () => {
const request = new Request(`${WORKER_URL}/run`, {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
- body: JSON.stringify({ type: 'run', url: 'not-a-url', code: '(x) => x' }),
+ body: JSON.stringify({ type: 'web', mode: 'run', url: 'not-a-url', code: '(x) => x' }),
});
const response = await worker.fetch(request, env);
@@ -182,7 +226,7 @@ describe('crawler-code-runner-worker', () => {
});
});
- describe('POST /run fetch failures', () => {
+ describe('POST /run web type - fetch failures', () => {
it('returns 502 when target URL fetch fails', async () => {
fetchMock
.get('https://target.example.com')
@@ -193,7 +237,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '(text) => text',
}),
@@ -206,7 +251,7 @@ describe('crawler-code-runner-worker', () => {
});
});
- describe('POST /run execution failures', () => {
+ describe('POST /run web type - execution failures', () => {
it('returns 422 when code execution throws an error', async () => {
fetchMock
.get('https://target.example.com')
@@ -217,7 +262,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '(text) => { throw new Error("intentional error"); }',
}),
@@ -240,7 +286,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '((( invalid syntax',
}),
@@ -253,8 +300,8 @@ describe('crawler-code-runner-worker', () => {
});
});
- describe('POST /run successful execution', () => {
- it('executes code and returns result with type test', async () => {
+ describe('POST /run web type - successful execution', () => {
+ it('executes code and returns result with mode test', async () => {
fetchMock
.get('https://target.example.com')
.intercept({ path: '/data', method: 'GET' })
@@ -264,7 +311,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'test',
+ type: 'web',
+ mode: 'test',
url: 'https://target.example.com/data',
code: '(text) => text.toUpperCase()',
}),
@@ -273,11 +321,12 @@ describe('crawler-code-runner-worker', () => {
expect(response.status).toBe(200);
const body = await response.json();
- expect(body.type).toBe('test');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('test');
expect(body.result).toBe('HELLO WORLD');
});
- it('executes code and returns result with type run', async () => {
+ it('executes code and returns result with mode run', async () => {
fetchMock
.get('https://target.example.com')
.intercept({ path: '/data', method: 'GET' })
@@ -287,7 +336,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: '(text) => text.length',
}),
@@ -296,7 +346,8 @@ describe('crawler-code-runner-worker', () => {
expect(response.status).toBe(200);
const body = await response.json();
- expect(body.type).toBe('run');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('run');
expect(body.result).toBe(11);
});
@@ -310,7 +361,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/page',
code: '(text) => ({ length: text.length, hasTitle: text.includes("") })',
}),
@@ -319,7 +371,8 @@ describe('crawler-code-runner-worker', () => {
expect(response.status).toBe(200);
const body = await response.json();
- expect(body.type).toBe('run');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('run');
expect(body.result.length).toBe(43);
expect(body.result.hasTitle).toBe(true);
});
@@ -334,7 +387,8 @@ describe('crawler-code-runner-worker', () => {
method: 'POST',
headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
body: JSON.stringify({
- type: 'run',
+ type: 'web',
+ mode: 'run',
url: 'https://target.example.com/data',
code: 'async (text) => text.split(" ")',
}),
@@ -343,11 +397,132 @@ describe('crawler-code-runner-worker', () => {
expect(response.status).toBe(200);
const body = await response.json();
- expect(body.type).toBe('run');
+ expect(body.type).toBe('web');
+ expect(body.mode).toBe('run');
expect(body.result).toEqual(['async', 'test']);
});
});
+ describe('POST /run data type - successful execution', () => {
+ it('executes code against provided data object', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: { items: [1, 2, 3] },
+ code: '(data) => data.items.length',
+ }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(200);
+ const body = await response.json();
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('run');
+ expect(body.result).toBe(3);
+ });
+
+ it('executes code against provided data array', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'test',
+ data: [10, 20, 30],
+ code: '(data) => data.map((x) => x * 2)',
+ }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(200);
+ const body = await response.json();
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('test');
+ expect(body.result).toEqual([20, 40, 60]);
+ });
+
+ it('executes code against provided string data', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: 'hello world',
+ code: '(data) => data.toUpperCase()',
+ }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(200);
+ const body = await response.json();
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('run');
+ expect(body.result).toBe('HELLO WORLD');
+ });
+
+ it('executes code against null data', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: null,
+ code: '(data) => data === null ? "was null" : "not null"',
+ }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(200);
+ const body = await response.json();
+ expect(body.type).toBe('data');
+ expect(body.mode).toBe('run');
+ expect(body.result).toBe('was null');
+ });
+ });
+
+ describe('POST /run data type - execution failures', () => {
+ it('returns 422 when code execution throws an error', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: { value: 1 },
+ code: '(data) => { throw new Error("data processing error"); }',
+ }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(422);
+ const body = await response.json();
+ expect(body.error).toBe('execution_failed');
+ });
+
+ it('returns 422 when code is syntactically invalid', async () => {
+ const request = new Request(`${WORKER_URL}/run`, {
+ method: 'POST',
+ headers: { Origin: 'https://example.com', 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ type: 'data',
+ mode: 'run',
+ data: 'test',
+ code: '((( invalid syntax',
+ }),
+ });
+ const response = await worker.fetch(request, env);
+
+ expect(response.status).toBe(422);
+ const body = await response.json();
+ expect(body.error).toBe('execution_failed');
+ });
+ });
+
describe('unknown routes', () => {
it('returns 404 for unknown path', async () => {
const request = new Request(`${WORKER_URL}/unknown`, {
diff --git a/workers/crawler-manager-worker/package.json b/workers/crawler-manager-worker/package.json
index 7ebde66..294f65f 100644
--- a/workers/crawler-manager-worker/package.json
+++ b/workers/crawler-manager-worker/package.json
@@ -13,8 +13,9 @@
"packageManager": "pnpm@10.26.1",
"dependencies": {
"@audio-underview/logger": "workspace:*",
+ "@audio-underview/supabase-connector": "workspace:*",
"@audio-underview/worker-tools": "workspace:*",
- "@audio-underview/supabase-connector": "workspace:*"
+ "safe-regex2": "^5.1.0"
},
"devDependencies": {
"@cloudflare/vitest-pool-workers": "catalog:worker",
diff --git a/workers/crawler-manager-worker/sources/code-runner-client.ts b/workers/crawler-manager-worker/sources/code-runner-client.ts
new file mode 100644
index 0000000..f8ab8ee
--- /dev/null
+++ b/workers/crawler-manager-worker/sources/code-runner-client.ts
@@ -0,0 +1,178 @@
+export interface CodeRunnerResult { // shape of a successful code-runner response after validation
+ type: 'web' | 'data'; // kind of run reported by the runner
+ mode: 'test' | 'run';
+ result: unknown; // value produced by the executed code; not inspected by this module
+}
+
+/** Narrows an untrusted code-runner payload to CodeRunnerResult; throws CodeRunnerExecutionError('invalid_response', …, 0) on any shape mismatch. */
+export function validateCodeRunnerResult(value: unknown): CodeRunnerResult {
+  if (value == null || typeof value !== 'object') {
+    throw new CodeRunnerExecutionError('invalid_response', 'Expected object from code-runner', 0);
+  }
+  const record = value as Record<string, unknown>;
+  if (record.type !== 'web' && record.type !== 'data') {
+    throw new CodeRunnerExecutionError('invalid_response', `Expected type 'web' or 'data', got '${String(record.type)}'`, 0);
+  }
+  if (record.mode !== 'test' && record.mode !== 'run') {
+    throw new CodeRunnerExecutionError('invalid_response', `Expected mode 'test' or 'run', got '${String(record.mode)}'`, 0);
+  }
+  // Presence check (`in`) rather than truthiness, so a null or otherwise falsy result is still accepted.
+  if (!('result' in record)) throw new CodeRunnerExecutionError('invalid_response', 'Missing result field', 0);
+  return { type: record.type, mode: record.mode, result: record.result };
+}
+
+export interface CodeRunnerClient { // contract for executing crawler code; resolves to the validated runner result
+  run(
+    type: 'web' | 'data',
+    url: string | undefined, // target URL, supplied for 'web' runs
+    data: unknown | undefined, // input payload, supplied for 'data' runs
+    code: string, // source of the function the runner should execute
+  ): Promise<CodeRunnerResult>;
+}
+
+export class CodeRunnerExecutionError extends Error { // error type used for code-runner failures in this module
+ readonly errorCode: string; // machine-readable code, e.g. 'network_error', 'server_error', 'invalid_response'
+ readonly errorDescription: string; // human-readable detail, also embedded in `message`
+ readonly statusCode: number; // HTTP status of the response, or 0 when thrown without one
+
+ constructor(errorCode: string, errorDescription: string, statusCode: number) {
+ super(`CodeRunner error ${statusCode}: [${errorCode}] ${errorDescription}`);
+ this.name = 'CodeRunnerExecutionError'; // Error subclasses report name 'Error' unless overridden
+ this.errorCode = errorCode;
+ this.errorDescription = errorDescription;
+ this.statusCode = statusCode;
+ }
+}
+
+const MAX_RETRY_ATTEMPTS = 2; // retries after the first attempt, so at most 3 requests per run
+const INITIAL_BACKOFF_MILLISECONDS = 1_000; // wait before the first retry; doubled for each later retry
+const REQUEST_TIMEOUT_MILLISECONDS = 30_000; // abort timeout applied to each individual request
+
+interface CodeRunnerRequestBody { // JSON body POSTed to the code runner's /run endpoint
+ type: 'web' | 'data';
+ mode: 'run'; // this client only ever issues 'run' requests
+ url?: string; // set for 'web' requests
+ data?: unknown; // set for 'data' requests
+ code: string;
+}
+
+interface CodeRunnerErrorResponse { // fields read from a non-OK response body; both optional because the body is untrusted
+ error_code?: string; // NOTE(review): confirm this key matches the runner's error payload (its tests assert on `error`)
+ error_description?: string;
+}
+
+function isRetryableStatusCode(statusCode: number): boolean { // true only for 5xx (server-side) statuses
+  return 500 <= statusCode && statusCode < 600;
+}
+
+/** Builds the /run payload: 'web' requests carry `url`, 'data' requests carry `data`; mode is always 'run'. */
+function buildRequestBody(
+  type: 'web' | 'data',
+  url: string | undefined,
+  data: unknown | undefined,
+  code: string,
+): CodeRunnerRequestBody {
+  return type === 'web'
+    ? { type: 'web', mode: 'run', url, code }
+    : { type: 'data', mode: 'run', data, code };
+}
+
+async function delay(milliseconds: number): Promise<void> { // resolves once the timer fires; used to space out retries
+  return new Promise((resolve) => setTimeout(resolve, milliseconds));
+}
+
+/** CodeRunnerClient that POSTs to `<baseURL>/run`, retrying network failures and 5xx responses with exponential backoff. */
+export class HTTPCodeRunnerClient implements CodeRunnerClient {
+  private readonly baseURL: string;
+
+  constructor(baseURL: string) {
+    this.baseURL = baseURL.replace(/\/+$/, ''); // strip trailing slashes so the endpoint never contains `//run`
+  }
+
+  async run(
+    type: 'web' | 'data',
+    url: string | undefined,
+    data: unknown | undefined,
+    code: string,
+  ): Promise<CodeRunnerResult> {
+    const requestBody = buildRequestBody(type, url, data, code);
+    const endpoint = `${this.baseURL}/run`;
+
+    let lastError: unknown;
+
+    for (let attempt = 0; attempt <= MAX_RETRY_ATTEMPTS; attempt++) {
+      if (attempt > 0) {
+        const backoffMilliseconds = INITIAL_BACKOFF_MILLISECONDS * Math.pow(2, attempt - 1);
+        await delay(backoffMilliseconds);
+      }
+
+      let response: Response;
+      try {
+        response = await fetch(endpoint, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify(requestBody),
+          signal: AbortSignal.timeout(REQUEST_TIMEOUT_MILLISECONDS),
+        });
+      } catch (error: unknown) {
+        lastError = error;
+        if (attempt < MAX_RETRY_ATTEMPTS) {
+          continue;
+        }
+        throw new CodeRunnerExecutionError(
+          'network_error',
+          error instanceof Error ? error.message : 'Unknown network error',
+          0,
+        );
+      }
+
+      if (response.ok) {
+        try {
+          return validateCodeRunnerResult(await response.json());
+        } catch (error: unknown) {
+          // Validation errors already carry a clean description; reuse it instead of nesting their full message.
+          const description = error instanceof CodeRunnerExecutionError
+            ? error.errorDescription
+            : error instanceof Error ? error.message : 'Failed to parse code runner response';
+          throw new CodeRunnerExecutionError('invalid_response', description, response.status);
+        }
+      }
+
+      if (isRetryableStatusCode(response.status)) {
+        lastError = new CodeRunnerExecutionError(
+          'server_error',
+          `Server returned ${response.status}`,
+          response.status,
+        );
+        if (attempt < MAX_RETRY_ATTEMPTS) {
+          continue;
+        }
+        throw lastError;
+      }
+
+      // Non-retryable statuses (typically 4xx, i.e. user code or request errors) fail immediately — no retry
+      let errorCode = 'execution_error';
+      let errorDescription = `Code runner returned HTTP ${response.status}`;
+
+      try {
+        const errorBody = (await response.json()) as CodeRunnerErrorResponse & { error?: string };
+        errorCode = errorBody.error_code ?? errorBody.error ?? errorCode; // accept either key for the error code
+        errorDescription = errorBody.error_description ?? errorDescription;
+      } catch {
+        // Response body is not valid JSON; use defaults
+      }
+
+      throw new CodeRunnerExecutionError(errorCode, errorDescription, response.status);
+    }
+
+    // Safety net: every loop path above returns, continues, or throws, but keep a terminal throw for the type checker
+    if (lastError instanceof CodeRunnerExecutionError) {
+      throw lastError;
+    }
+    throw new CodeRunnerExecutionError(
+      'network_error',
+      lastError instanceof Error ? lastError.message : 'Unknown error after retries',
+      0,
+    );
+  }
+}
diff --git a/workers/crawler-manager-worker/sources/crawler-executor.ts b/workers/crawler-manager-worker/sources/crawler-executor.ts
new file mode 100644
index 0000000..30a1c27
--- /dev/null
+++ b/workers/crawler-manager-worker/sources/crawler-executor.ts
@@ -0,0 +1,79 @@
+import type { Logger } from '@audio-underview/logger';
+import type { CrawlerRow } from '@audio-underview/supabase-connector';
+import type { CrawlerExecuteResult } from '@audio-underview/worker-tools';
+import type { CodeRunnerClient } from './code-runner-client.ts';
+import { isSafeURLPattern } from './safe-url-pattern.ts';
+
+export type { CrawlerExecuteResult };
+
+/**
+ * Picks the crawl URL: a non-empty string `input.url` first, else a non-empty string `input_schema.url.default`, else null.
+ */
+function resolveURL(input: unknown, crawler: CrawlerRow): string | null {
+  // 1. input.url if present
+  if (input !== null && input !== undefined && typeof input === 'object' && 'url' in input) {
+    const url = (input as Record<string, unknown>).url;
+    if (typeof url === 'string' && url.length > 0) {
+      return url;
+    }
+  }
+
+  // 2. input_schema url default
+  const inputSchema = crawler.input_schema;
+  if (inputSchema.url !== null && inputSchema.url !== undefined && typeof inputSchema.url === 'object' && 'default' in inputSchema.url) {
+    const defaultURL = (inputSchema.url as Record<string, unknown>).default;
+    if (typeof defaultURL === 'string' && defaultURL.length > 0) {
+      return defaultURL;
+    }
+  }
+
+  return null;
+}
+
+/** Runs a crawler via the code runner: 'web' crawlers against a resolved URL, every other type against `input` as data. */
+export async function executeCrawler(
+  codeRunnerClient: CodeRunnerClient,
+  crawler: CrawlerRow,
+  input: unknown,
+  logger: Logger,
+): Promise<CrawlerExecuteResult> {
+  if (crawler.type === 'web') {
+    const url = resolveURL(input, crawler);
+    if (!url) {
+      throw new Error(`Crawler ${crawler.id}: no URL available. Provide url in input or set a default in input_schema.`);
+    }
+
+    // url_pattern is advisory here: an unsafe, invalid, or non-matching pattern only logs a warning and the run proceeds.
+    if (crawler.url_pattern) {
+      if (!isSafeURLPattern(crawler.url_pattern)) {
+        logger.warn('Skipping url_pattern validation: potential ReDoS pattern detected', {
+          urlPattern: crawler.url_pattern,
+          crawlerID: crawler.id,
+        }, { function: 'executeCrawler' });
+      } else {
+        try {
+          const pattern = new RegExp(crawler.url_pattern);
+          if (!pattern.test(url)) {
+            logger.warn('URL does not match crawler url_pattern', {
+              url,
+              urlPattern: crawler.url_pattern,
+              crawlerID: crawler.id,
+            }, { function: 'executeCrawler' });
+          }
+        } catch (error: unknown) {
+          logger.warn('Invalid url_pattern regex', {
+            urlPattern: crawler.url_pattern,
+            crawlerID: crawler.id,
+            error: error instanceof Error ? error.message : String(error),
+          }, { function: 'executeCrawler' });
+        }
+      }
+    }
+
+    const response = await codeRunnerClient.run('web', url, undefined, crawler.code);
+    return { type: 'web', result: response.result };
+  }
+
+  const response = await codeRunnerClient.run('data', undefined, input, crawler.code);
+  return { type: 'data', result: response.result };
+}
diff --git a/workers/crawler-manager-worker/sources/index.ts b/workers/crawler-manager-worker/sources/index.ts
index 5fc03fd..f0b4c80 100644
--- a/workers/crawler-manager-worker/sources/index.ts
+++ b/workers/crawler-manager-worker/sources/index.ts
@@ -1,3 +1,5 @@
+import { WorkerEntrypoint } from 'cloudflare:workers';
+import { isSafeURLPattern } from './safe-url-pattern.ts';
import { createWorkerLogger } from '@audio-underview/logger';
import {
type ResponseContext,
@@ -11,16 +13,24 @@ import {
createCrawler,
listCrawlersByUser,
getCrawler,
+ getCrawlerByID,
updateCrawler,
deleteCrawler,
+ createCrawlerPermission,
} from '@audio-underview/supabase-connector';
import { handleTokenExchange } from './token-exchange.ts';
+import { HTTPCodeRunnerClient } from './code-runner-client.ts';
+import { executeCrawler } from './crawler-executor.ts';
+import type { CrawlerExecuteResult } from './crawler-executor.ts';
+
+export type { CrawlerExecuteResult };
interface Environment {
ALLOWED_ORIGINS: string;
SUPABASE_URL: string;
SUPABASE_SECRET_KEY: string;
JWT_SECRET: string;
+ CODE_RUNNER_FUNCTION_URL: string;
}
interface CreateCrawlerRequestBody {
@@ -51,10 +61,6 @@ const HELP = {
],
};
-function hasNestedQuantifiers(pattern: string): boolean {
- // Detect patterns like (a+)+, (.*)*, (a{2,})+, etc.
- return /(\([^)]*[+*][^)]*\))[+*]|\(\?:[^)]*[+*][^)]*\)[+*]/.test(pattern);
-}
async function validateCrawlerBody(
request: Request,
@@ -115,7 +121,7 @@ async function validateCrawlerBody(
return errorResponse('invalid_request', `Field 'url_pattern' must not exceed ${MAX_URL_PATTERN_LENGTH} characters`, 400, context);
}
- if (hasNestedQuantifiers(body.url_pattern)) {
+ if (!isSafeURLPattern(body.url_pattern)) {
return errorResponse('invalid_request', "Field 'url_pattern' contains potentially unsafe regex pattern", 400, context);
}
@@ -166,6 +172,12 @@ async function handleCreateCrawler(
output_schema: body.output_schema,
});
+ await createCrawlerPermission(supabaseClient, {
+ crawler_id: crawler.id,
+ user_uuid: userUUID,
+ level: 'owner',
+ });
+
return jsonResponse(crawler, 201, context);
}
@@ -293,8 +305,9 @@ function parseCrawlerID(pathname: string): string | null {
return id;
}
-export default {
- async fetch(request: Request, environment: Environment): Promise {
+export default class CrawlerManagerWorker extends WorkerEntrypoint {
+ async fetch(request: Request): Promise {
+ const environment = this.env;
const url = new URL(request.url);
const origin = request.headers.get('Origin') ?? '';
@@ -404,5 +417,27 @@ export default {
logger.error('Unhandled worker error', error, { function: 'fetch' });
return errorResponse('server_error', 'An unexpected error occurred', 500, context);
}
- },
-};
+ }
+
+  // Service Binding RPC — called by scheduler-manager-worker only.
+  // No user ownership check: binding declaration itself is the access control.
+  async executeCrawler(crawlerID: string, input: unknown): Promise<CrawlerExecuteResult> { // rejects if the crawler row is missing
+    const rpcLogger = logger.createChild({
+      function: 'executeCrawler',
+      metadata: { crawlerID },
+    });
+
+    const supabaseClient = createSupabaseClient({
+      supabaseURL: this.env.SUPABASE_URL,
+      supabaseSecretKey: this.env.SUPABASE_SECRET_KEY,
+    });
+
+    const crawler = await getCrawlerByID(supabaseClient, crawlerID);
+    if (!crawler) {
+      throw new Error(`Crawler ${crawlerID} not found`);
+    }
+
+    const codeRunnerClient = new HTTPCodeRunnerClient(this.env.CODE_RUNNER_FUNCTION_URL);
+    return executeCrawler(codeRunnerClient, crawler, input, rpcLogger); // module-level helper imported from crawler-executor, not this method
+  }
+}
diff --git a/workers/crawler-manager-worker/sources/safe-url-pattern.ts b/workers/crawler-manager-worker/sources/safe-url-pattern.ts
new file mode 100644
index 0000000..c26ca3f
--- /dev/null
+++ b/workers/crawler-manager-worker/sources/safe-url-pattern.ts
@@ -0,0 +1,5 @@
+import isSafeRegex from 'safe-regex2';
+
+export function isSafeURLPattern(pattern: string): boolean { // single shared gate for url_pattern regex safety
+ return isSafeRegex(pattern); // delegates the verdict entirely to safe-regex2
+}
diff --git a/workers/crawler-manager-worker/tests/crawler-executor.test.ts b/workers/crawler-manager-worker/tests/crawler-executor.test.ts
new file mode 100644
index 0000000..55e2074
--- /dev/null
+++ b/workers/crawler-manager-worker/tests/crawler-executor.test.ts
@@ -0,0 +1,205 @@
+import { describe, it, expect, vi } from 'vitest';
+import { executeCrawler } from '../sources/crawler-executor.ts';
+import { validateCodeRunnerResult } from '../sources/code-runner-client.ts';
+import type { CodeRunnerClient } from '../sources/code-runner-client.ts';
+import type { CrawlerRow } from '@audio-underview/supabase-connector';
+
+function createMockCodeRunnerClient( // stub client whose run() resolves to a 'web'/'run' response wrapping `result`
+  result: unknown = { extracted: 'data' },
+): CodeRunnerClient & { run: ReturnType<typeof vi.fn> } {
+  return {
+    run: vi.fn().mockResolvedValue({ type: 'web', mode: 'run', result }),
+  };
+}
+
+function createMockLogger() { // logger double: every level is a spy so tests can assert on calls
+ return {
+ info: vi.fn(),
+ warn: vi.fn(),
+ error: vi.fn(),
+ debug: vi.fn(),
+ createChild: vi.fn().mockReturnThis(), // child logger is the same mock object
+ } as any; // cast away typing: only the methods above are stubbed
+}
+
+function createMockCrawler(overrides: Partial<CrawlerRow> = {}): CrawlerRow { // default 'web' crawler fixture
+  return {
+    id: '00000000-0000-0000-0000-000000000001',
+    user_uuid: '00000000-0000-0000-0000-000000000002',
+    name: 'Test Crawler',
+    type: 'web',
+    url_pattern: '.*\\.example\\.com',
+    code: '(text) => ({ title: "test" })',
+    input_schema: { body: 'string' },
+    output_schema: {},
+    created_at: '2026-01-01T00:00:00Z',
+    updated_at: '2026-01-01T00:00:00Z',
+    ...overrides, // spread last so any field supplied by the test wins over the defaults
+  };
+}
+
+describe('executeCrawler', () => { // unit tests for executeCrawler using a mocked client and logger (no network)
+ describe('web crawler', () => {
+ it('resolves URL from input.url and calls codeRunnerClient.run', async () => {
+ const client = createMockCodeRunnerClient({ title: 'Hello' });
+ const logger = createMockLogger();
+ const crawler = createMockCrawler({ code: '(text) => ({ title: text })' });
+ const input = { url: 'https://www.example.com/page' };
+
+ const result = await executeCrawler(client, crawler, input, logger);
+
+ expect(result).toEqual({ type: 'web', result: { title: 'Hello' } });
+ expect(client.run).toHaveBeenCalledOnce();
+ expect(client.run).toHaveBeenCalledWith(
+ 'web',
+ 'https://www.example.com/page',
+ undefined,
+ crawler.code,
+ );
+ });
+
+ it('resolves URL from input_schema.url.default when input has no url', async () => {
+ const client = createMockCodeRunnerClient();
+ const logger = createMockLogger();
+ const crawler = createMockCrawler({
+ input_schema: {
+ url: { default: 'https://fallback.example.com/default' },
+ },
+ });
+ const input = {};
+
+ await executeCrawler(client, crawler, input, logger);
+
+ expect(client.run).toHaveBeenCalledWith(
+ 'web',
+ 'https://fallback.example.com/default',
+ undefined,
+ crawler.code,
+ );
+ });
+
+ it('throws when no URL is available', async () => {
+ const client = createMockCodeRunnerClient();
+ const logger = createMockLogger();
+ const crawler = createMockCrawler({ input_schema: {} });
+ const input = {};
+
+ await expect(executeCrawler(client, crawler, input, logger)).rejects.toThrow(
+ /no URL available/,
+ );
+ expect(client.run).not.toHaveBeenCalled(); // the failure must happen before any runner call
+ });
+
+ it('warns when URL does not match url_pattern but still executes', async () => {
+ const client = createMockCodeRunnerClient();
+ const logger = createMockLogger();
+ const crawler = createMockCrawler({ url_pattern: '^https://only\\.allowed\\.com' });
+ const input = { url: 'https://different.com/page' };
+
+ const result = await executeCrawler(client, crawler, input, logger);
+
+ expect(logger.warn).toHaveBeenCalledOnce();
+ expect(logger.warn).toHaveBeenCalledWith(
+ 'URL does not match crawler url_pattern',
+ expect.objectContaining({
+ url: 'https://different.com/page',
+ urlPattern: '^https://only\\.allowed\\.com',
+ crawlerID: crawler.id,
+ }),
+ { function: 'executeCrawler' },
+ );
+ expect(result.type).toBe('web');
+ expect(client.run).toHaveBeenCalledOnce(); // a mismatch is a warning only; the run still goes ahead
+ });
+
+ it('skips url_pattern validation on unsafe regex and logs warning', async () => {
+ const client = createMockCodeRunnerClient();
+ const logger = createMockLogger();
+ const crawler = createMockCrawler({ url_pattern: '[invalid(' }); // pattern the safety check is expected to reject
+ const input = { url: 'https://www.example.com/page' };
+
+ const result = await executeCrawler(client, crawler, input, logger);
+
+ expect(logger.warn).toHaveBeenCalledWith(
+ 'Skipping url_pattern validation: potential ReDoS pattern detected',
+ expect.objectContaining({
+ urlPattern: '[invalid(',
+ crawlerID: crawler.id,
+ }),
+ expect.objectContaining({ function: 'executeCrawler' }),
+ );
+ expect(result.type).toBe('web');
+ expect(client.run).toHaveBeenCalledOnce();
+ });
+ });
+
+ describe('data crawler', () => {
+ it('calls codeRunnerClient.run with data type and input', async () => {
+ const client = createMockCodeRunnerClient({ processed: true });
+ client.run.mockResolvedValue({ type: 'data', mode: 'run', result: { processed: true } }); // override the helper's default 'web' response
+ const logger = createMockLogger();
+ const crawler = createMockCrawler({ type: 'data', url_pattern: null });
+ const input = { items: [1, 2, 3] };
+
+ const result = await executeCrawler(client, crawler, input, logger);
+
+ expect(result).toEqual({ type: 'data', result: { processed: true } });
+ expect(client.run).toHaveBeenCalledOnce();
+ expect(client.run).toHaveBeenCalledWith(
+ 'data',
+ undefined,
+ input,
+ crawler.code,
+ );
+ });
+ });
+
+ describe('error propagation', () => {
+ it('propagates errors from codeRunnerClient.run', async () => {
+ const client = createMockCodeRunnerClient();
+ client.run.mockRejectedValue(new Error('Code execution failed'));
+ const logger = createMockLogger();
+ const crawler = createMockCrawler();
+ const input = { url: 'https://www.example.com/page' };
+
+ await expect(executeCrawler(client, crawler, input, logger)).rejects.toThrow(
+ 'Code execution failed',
+ );
+ });
+ });
+});
+
+describe('validateCodeRunnerResult', () => { // shape checks on the runner response validator
+ it('accepts valid web result', () => {
+ const result = validateCodeRunnerResult({ type: 'web', mode: 'run', result: { data: 'ok' } });
+ expect(result.type).toBe('web');
+ expect(result.mode).toBe('run');
+ expect(result.result).toEqual({ data: 'ok' });
+ });
+
+ it('accepts valid result with null', () => {
+ const result = validateCodeRunnerResult({ type: 'data', mode: 'test', result: null }); // null is a present, valid result
+ expect(result.type).toBe('data');
+ expect(result.result).toBeNull();
+ });
+
+ it('throws on null input', () => {
+ expect(() => validateCodeRunnerResult(null)).toThrow('Expected object from code-runner');
+ });
+
+ it('throws on non-object input', () => {
+ expect(() => validateCodeRunnerResult('string')).toThrow('Expected object from code-runner');
+ });
+
+ it('throws on invalid type', () => {
+ expect(() => validateCodeRunnerResult({ type: 'unknown', mode: 'run', result: {} })).toThrow("Expected type 'web' or 'data'");
+ });
+
+ it('throws on invalid mode', () => {
+ expect(() => validateCodeRunnerResult({ type: 'web', mode: 'unknown', result: {} })).toThrow("Expected mode 'test' or 'run'");
+ });
+
+ it('throws on missing result field', () => {
+ expect(() => validateCodeRunnerResult({ type: 'web', mode: 'run' })).toThrow('Missing result field'); // key absent, as opposed to null
+ });
+});
diff --git a/workers/crawler-manager-worker/tests/index.test.ts b/workers/crawler-manager-worker/tests/index.test.ts
index ac6560e..cc36141 100644
--- a/workers/crawler-manager-worker/tests/index.test.ts
+++ b/workers/crawler-manager-worker/tests/index.test.ts
@@ -1,7 +1,6 @@
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
-import { env, fetchMock } from 'cloudflare:test';
+import { env, fetchMock, SELF } from 'cloudflare:test';
import { signJWT } from '@audio-underview/worker-tools';
-import worker from '../sources/index.ts';
const WORKER_URL = 'https://worker.example.com';
const MOCK_USER_UUID = '00000000-0000-0000-0000-000000000001';
@@ -99,7 +98,7 @@ describe('crawler-manager-worker', () => {
method: 'OPTIONS',
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(204);
expect(response.headers.get('Access-Control-Allow-Origin')).toBe('https://example.com');
@@ -111,7 +110,7 @@ describe('crawler-manager-worker', () => {
method: 'OPTIONS',
headers: { Origin: 'https://unknown.example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(204);
expect(response.headers.get('Access-Control-Allow-Origin')).toBeNull();
@@ -124,7 +123,7 @@ describe('crawler-manager-worker', () => {
method: 'HEAD',
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
expect(response.headers.get('Content-Type')).toBe('application/json');
@@ -137,7 +136,7 @@ describe('crawler-manager-worker', () => {
const request = new Request(WORKER_URL, {
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -150,7 +149,7 @@ describe('crawler-manager-worker', () => {
const request = new Request(`${WORKER_URL}/help`, {
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -161,7 +160,7 @@ describe('crawler-manager-worker', () => {
const request = new Request(WORKER_URL, {
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
const body = await response.json();
const tokenEndpoint = body.endpoints.find((endpoint: { path: string }) => endpoint.path === '/authentication/token');
@@ -175,7 +174,7 @@ describe('crawler-manager-worker', () => {
const request = new Request(`${WORKER_URL}/crawlers`, {
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(401);
const body = await response.json();
@@ -189,7 +188,7 @@ describe('crawler-manager-worker', () => {
Authorization: 'Bearer invalid-token',
},
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(401);
const body = await response.json();
@@ -209,7 +208,7 @@ describe('crawler-manager-worker', () => {
Authorization: `Bearer ${expiredToken}`,
},
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(401);
const body = await response.json();
@@ -229,7 +228,7 @@ describe('crawler-manager-worker', () => {
Authorization: `Bearer ${wrongSecretToken}`,
},
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(401);
});
@@ -255,7 +254,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ provider: 'google', access_token: 'valid-google-token' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -283,7 +282,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ provider: 'github', access_token: 'valid-github-token' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -300,7 +299,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ access_token: 'some-token' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -316,7 +315,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ provider: 'twitter', access_token: 'some-token' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
});
@@ -330,7 +329,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ provider: 'google' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
});
@@ -343,7 +342,7 @@ describe('crawler-manager-worker', () => {
},
body: 'not json',
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
});
@@ -362,7 +361,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ provider: 'google', access_token: 'bad-token' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(401);
});
@@ -391,7 +390,7 @@ describe('crawler-manager-worker', () => {
},
body: JSON.stringify({ provider: 'google', access_token: 'valid-but-unregistered' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(401);
});
@@ -403,7 +402,7 @@ describe('crawler-manager-worker', () => {
method: 'POST',
body: 'not json',
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -417,7 +416,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url_pattern: '.*', code: '(x) => x' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -430,7 +429,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: ' ', url_pattern: '.*', code: '(x) => x' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -443,7 +442,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'x'.repeat(256), url_pattern: '.*', code: '(x) => x' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -456,7 +455,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'test', url_pattern: '(((', code: '(x) => x' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -469,7 +468,7 @@ describe('crawler-manager-worker', () => {
body: JSON.stringify({ name: 'test', url_pattern: '(a+)+', code: '(x) => x' }),
headers: { 'Content-Type': 'application/json' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json() as Record;
expect(body.error_description).toContain('unsafe regex pattern');
@@ -481,7 +480,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'test', type: 'invalid', code: '(x) => x', url_pattern: '.*' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json() as Record;
expect(body.error_description).toContain('type');
@@ -493,7 +492,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'test', type: 'web', code: '(x) => x' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json() as Record;
expect(body.error_description).toContain('url_pattern');
@@ -505,7 +504,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'test', type: 'data', code: '(x) => x' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json() as Record;
expect(body.error_description).toContain('input_schema');
@@ -517,7 +516,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'test', url_pattern: '.*', code: '(x) => x', output_schema: 'invalid' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json() as Record;
expect(body.error_description).toContain('output_schema');
@@ -534,7 +533,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(crawlerData),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(201);
const body = await response.json();
@@ -555,7 +554,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(crawlerData),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(201);
const body = await response.json();
@@ -569,7 +568,7 @@ describe('crawler-manager-worker', () => {
mockSupabaseCrawlerList();
const request = await authenticatedRequest('/crawlers');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -585,7 +584,7 @@ describe('crawler-manager-worker', () => {
mockSupabaseCrawlerList();
const request = await authenticatedRequest('/crawlers?offset=0&limit=10');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -596,7 +595,7 @@ describe('crawler-manager-worker', () => {
it('returns 400 for negative offset', async () => {
const request = await authenticatedRequest('/crawlers?offset=-1');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -606,7 +605,7 @@ describe('crawler-manager-worker', () => {
it('returns 400 for zero limit', async () => {
const request = await authenticatedRequest('/crawlers?limit=0');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -616,7 +615,7 @@ describe('crawler-manager-worker', () => {
it('returns 400 for limit exceeding maximum', async () => {
const request = await authenticatedRequest('/crawlers?limit=101');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -626,7 +625,7 @@ describe('crawler-manager-worker', () => {
it('returns 400 for non-numeric offset', async () => {
const request = await authenticatedRequest('/crawlers?offset=abc');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -636,7 +635,7 @@ describe('crawler-manager-worker', () => {
it('returns 400 for non-integer limit', async () => {
const request = await authenticatedRequest('/crawlers?limit=1.5');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -650,7 +649,7 @@ describe('crawler-manager-worker', () => {
mockSupabaseCrawlerGet();
const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`);
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -661,7 +660,7 @@ describe('crawler-manager-worker', () => {
mockSupabaseCrawlerNotFound();
const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`);
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
});
@@ -673,7 +672,7 @@ describe('crawler-manager-worker', () => {
method: 'PUT',
body: 'not json',
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
const body = await response.json();
@@ -686,7 +685,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: 'test' }),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(400);
});
@@ -707,7 +706,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(updatedData),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -730,7 +729,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(validCrawlerBody()),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
});
@@ -746,7 +745,7 @@ describe('crawler-manager-worker', () => {
const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`, {
method: 'DELETE',
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(200);
const body = await response.json();
@@ -762,7 +761,7 @@ describe('crawler-manager-worker', () => {
const request = await authenticatedRequest(`/crawlers/${MOCK_CRAWLER_ID}`, {
method: 'DELETE',
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
const body = await response.json();
@@ -773,7 +772,7 @@ describe('crawler-manager-worker', () => {
describe('invalid UUID format', () => {
it('returns 404 for invalid crawler ID in GET', async () => {
const request = await authenticatedRequest('/crawlers/abc');
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
});
@@ -784,7 +783,7 @@ describe('crawler-manager-worker', () => {
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(validCrawlerBody()),
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
});
@@ -793,7 +792,7 @@ describe('crawler-manager-worker', () => {
const request = await authenticatedRequest('/crawlers/abc', {
method: 'DELETE',
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
});
@@ -804,7 +803,7 @@ describe('crawler-manager-worker', () => {
const request = new Request(`${WORKER_URL}/unknown`, {
headers: { Origin: 'https://example.com' },
});
- const response = await worker.fetch(request, env);
+ const response = await SELF.fetch(request);
expect(response.status).toBe(404);
const body = await response.json();
diff --git a/workers/crawler-manager-worker/vitest.config.ts b/workers/crawler-manager-worker/vitest.config.ts
index 08b2372..5e4a918 100644
--- a/workers/crawler-manager-worker/vitest.config.ts
+++ b/workers/crawler-manager-worker/vitest.config.ts
@@ -12,6 +12,7 @@ export default defineWorkersConfig({
SUPABASE_URL: 'https://supabase.example.com',
SUPABASE_SECRET_KEY: 'test-secret-key',
JWT_SECRET: 'test-jwt-secret-key-for-testing-only',
+ CODE_RUNNER_FUNCTION_URL: 'https://code-runner.example.com',
},
},
},
diff --git a/workers/crawler-manager-worker/wrangler.toml b/workers/crawler-manager-worker/wrangler.toml
index f0e0d52..662bad9 100644
--- a/workers/crawler-manager-worker/wrangler.toml
+++ b/workers/crawler-manager-worker/wrangler.toml
@@ -8,6 +8,8 @@ ALLOWED_ORIGINS = "http://localhost:5173,https://audio-underview.pages.dev"
# Secrets (set via wrangler secret put):
# SUPABASE_URL
# SUPABASE_SECRET_KEY
+# JWT_SECRET
+# CODE_RUNNER_FUNCTION_URL
[observability.logs]
enabled = true
diff --git a/workers/scheduler-manager-worker/sources/crawler-execution-client.ts b/workers/scheduler-manager-worker/sources/crawler-execution-client.ts
new file mode 100644
index 0000000..e8c402a
--- /dev/null
+++ b/workers/scheduler-manager-worker/sources/crawler-execution-client.ts
@@ -0,0 +1,27 @@
+import {
+ type CrawlerExecuteResult,
+ validateCrawlerExecuteResult,
+} from '@audio-underview/worker-tools';
+
+export type { CrawlerExecuteResult };
+
+/**
+ * Method surface the service binding is cast to so that `executeCrawler` can be called.
+ * The resolved value is typed `unknown` because it is validated before being used.
+ */
+interface CrawlerManagerRPC {
+  executeCrawler(crawlerID: string, input: unknown): Promise<unknown>;
+}
+
+/**
+ * Abstraction over "run crawler `crawlerID` with `input`".
+ * Resolves with the validated execution result.
+ */
+export interface CrawlerExecutionClient {
+  execute(crawlerID: string, input: unknown): Promise<CrawlerExecuteResult>;
+}
+
+/**
+ * `CrawlerExecutionClient` that forwards each call to a service binding.
+ */
+export class ServiceBindingCrawlerExecutionClient implements CrawlerExecutionClient {
+  private readonly binding: Service;
+
+  constructor(binding: Service) {
+    this.binding = binding;
+  }
+
+  /**
+   * Calls `executeCrawler` on the binding and passes the raw reply through
+   * `validateCrawlerExecuteResult` before returning it.
+   *
+   * @param crawlerID - Crawler to execute.
+   * @param input - Value forwarded to the crawler unchanged.
+   * @returns The validated execution result.
+   */
+  async execute(crawlerID: string, input: unknown): Promise<CrawlerExecuteResult> {
+    // NOTE(review): the double cast presumably exists because `Service` does not
+    // declare `executeCrawler` — confirm against the binding's type declaration.
+    const raw = await (this.binding as unknown as CrawlerManagerRPC).executeCrawler(crawlerID, input);
+    return validateCrawlerExecuteResult(raw);
+  }
+}
diff --git a/workers/scheduler-manager-worker/sources/handlers/scheduler-execution.ts b/workers/scheduler-manager-worker/sources/handlers/scheduler-execution.ts
new file mode 100644
index 0000000..432bed3
--- /dev/null
+++ b/workers/scheduler-manager-worker/sources/handlers/scheduler-execution.ts
@@ -0,0 +1,122 @@
+import {
+ type ResponseContext,
+ jsonResponse,
+} from '@audio-underview/worker-tools';
+import {
+ type SchedulerRunStatus,
+ createSupabaseClient,
+ createSchedulerRun,
+ getSchedulerRun,
+ updateSchedulerRun,
+} from '@audio-underview/supabase-connector';
+import { createWorkerLogger } from '@audio-underview/logger';
+import type { Environment } from '../index.ts';
+import { ServiceBindingCrawlerExecutionClient } from '../crawler-execution-client.ts';
+import { executeScheduler } from '../scheduler-executor.ts';
+import { verifySchedulerOwnership } from './tools.ts';
+
+/**
+ * Maps a run's final `status` and `error` text to the HTTP status of the response.
+ *
+ * Anything other than a `failed` run that carries an error message yields 200.
+ * For a failed run the error text is matched against known substrings, in
+ * priority order: timeout (408), schema / fan-out problems (422), code-runner
+ * or result-validation problems (502), storage problems (503). Unmatched
+ * errors yield 500.
+ *
+ * @param status - Final run status.
+ * @param error - Error message recorded on the run, if any.
+ * @returns The HTTP status code to send.
+ */
+export function resolveHTTPStatus(status: string, error: string | null | undefined): number {
+  if (status !== 'failed' || error === null || error === undefined) {
+    return 200;
+  }
+
+  const message: string = error;
+  // Order matters: the first rule with a matching substring wins.
+  const rules: ReadonlyArray<readonly [number, readonly string[]]> = [
+    [408, ['timed out']],
+    [422, ['Invalid input_schema', 'fan_out_field']],
+    [502, ['CodeRunner error', 'Invalid CrawlerExecuteResult']],
+    [503, ['Supabase', 'database']],
+  ];
+
+  for (const [code, needles] of rules) {
+    if (needles.some((needle) => message.includes(needle))) {
+      return code;
+    }
+  }
+
+  return 500;
+}
+
+// Module-level logger created with `module: 'scheduler-execution-handler'` as its default context.
+const logger = createWorkerLogger({
+ defaultContext: {
+ module: 'scheduler-execution-handler',
+ },
+});
+
+/**
+ * Executes a scheduler pipeline within the request and responds with the final run state.
+ *
+ * Steps: verify the caller owns the scheduler, create a `pending` run (a
+ * violation of the `scheduler_runs_one_active_per_scheduler` index is reported
+ * as 409), race the pipeline against a 5-minute timeout, then re-read the run
+ * and derive the HTTP status from its status and error.
+ *
+ * @param environment - Worker environment (Supabase settings and the `CRAWLER_MANAGER` binding).
+ * @param context - Response context forwarded to `jsonResponse`.
+ * @param schedulerID - Scheduler to execute.
+ * @param userUUID - Caller's user ID, used for the ownership check.
+ * @returns JSON response carrying `run_id`, `status`, `result`, `error`, `started_at`, `completed_at`.
+ * @throws Re-throws any `createSchedulerRun` error other than the active-run conflict.
+ */
+export async function handleExecuteScheduler(
+  environment: Environment,
+  context: ResponseContext,
+  schedulerID: string,
+  userUUID: string,
+): Promise<Response> {
+  const supabaseClient = createSupabaseClient({
+    supabaseURL: environment.SUPABASE_URL,
+    supabaseSecretKey: environment.SUPABASE_SECRET_KEY,
+  });
+
+  const ownershipError = await verifySchedulerOwnership(supabaseClient, schedulerID, userUUID, context);
+  if (ownershipError) return ownershipError;
+
+  // Atomic concurrent run guard via DB unique partial index
+  // (scheduler_runs_one_active_per_scheduler: only one pending/running run per scheduler)
+  let run;
+  try {
+    run = await createSchedulerRun(supabaseClient, {
+      scheduler_id: schedulerID,
+      status: 'pending',
+    });
+  } catch (error: unknown) {
+    const message = error instanceof Error ? error.message : String(error);
+    if (message.includes('scheduler_runs_one_active_per_scheduler')) {
+      return jsonResponse({
+        error: 'conflict',
+        error_description: 'A run is already in progress',
+      }, 409, context);
+    }
+    throw error;
+  }
+  const runID = run.id;
+
+  const crawlerExecutionClient = new ServiceBindingCrawlerExecutionClient(environment.CRAWLER_MANAGER);
+
+  // Pipeline timeout: 5 minutes. Prevents run stuck in 'running' on client disconnect or hang.
+  // AbortController signals executeScheduler to stop updating run status after timeout.
+  const PIPELINE_TIMEOUT_MILLISECONDS = 300_000;
+  // Must keep the words "timed out": resolveHTTPStatus maps that substring to 408.
+  const timeoutMessage = 'Pipeline execution timed out after 5 minutes';
+  const abortController = new AbortController();
+  // Set only by the timer, so a timeout is never inferred from an error's wording.
+  let timedOut = false;
+  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
+
+  try {
+    await Promise.race([
+      executeScheduler(
+        { supabaseClient, crawlerExecutionClient, logger },
+        schedulerID,
+        userUUID,
+        runID,
+        abortController.signal,
+      ),
+      new Promise<never>((_, reject) => {
+        timeoutHandle = setTimeout(() => {
+          timedOut = true;
+          reject(new Error(timeoutMessage));
+        }, PIPELINE_TIMEOUT_MILLISECONDS);
+      }),
+    ]);
+  } catch (error: unknown) {
+    if (timedOut) {
+      abortController.abort();
+      // Only overwrite a run that is still active; a run that finished in the meantime wins.
+      await updateSchedulerRun(supabaseClient, runID, schedulerID, {
+        status: 'failed',
+        completed_at: new Date().toISOString(),
+        error: timeoutMessage,
+      }, { onlyIfStatus: ['pending', 'running'] satisfies SchedulerRunStatus[] }).catch((updateError: unknown) => {
+        logger.error('Failed to update run status after timeout', updateError, {
+          function: 'handleExecuteScheduler',
+          metadata: { schedulerID, runID },
+        });
+      });
+    } else {
+      // The response is still built from the stored run below, but the rejection
+      // is logged so it does not disappear without a trace.
+      logger.error('Scheduler execution rejected unexpectedly', error, {
+        function: 'handleExecuteScheduler',
+        metadata: { schedulerID, runID },
+      });
+    }
+  } finally {
+    // Without this the timer stays pending for the full 5 minutes after a fast run.
+    clearTimeout(timeoutHandle);
+  }
+
+  // Fetch final run state
+  const completedRun = await getSchedulerRun(supabaseClient, runID, schedulerID);
+  const finalRun = completedRun ?? run;
+
+  const responseBody = {
+    run_id: finalRun.id,
+    status: finalRun.status,
+    result: finalRun.result,
+    error: finalRun.error,
+    started_at: finalRun.started_at,
+    completed_at: finalRun.completed_at,
+  };
+
+  const httpStatus = resolveHTTPStatus(finalRun.status, finalRun.error);
+  return jsonResponse(responseBody, httpStatus, context);
+}
diff --git a/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts b/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts
index a5a8629..cefb133 100644
--- a/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts
+++ b/workers/scheduler-manager-worker/sources/handlers/scheduler-stages.ts
@@ -5,6 +5,7 @@ import {
} from '@audio-underview/worker-tools';
import {
createSupabaseClient,
+ getCrawlerPermission,
createSchedulerStage,
listSchedulerStages,
getSchedulerStage,
@@ -15,12 +16,15 @@ import {
import type { Environment } from '../index.ts';
import { verifySchedulerOwnership, UUID_PATTERN } from './tools.ts';
+type FanOutStrategy = 'compact' | 'preserve';
+
interface CreateStageRequestBody {
crawler_id: string;
stage_order: number;
input_schema: Record;
output_schema?: Record;
fan_out_field?: string;
+ fan_out_strategy?: FanOutStrategy;
}
interface UpdateStageRequestBody {
@@ -28,6 +32,7 @@ interface UpdateStageRequestBody {
input_schema?: Record;
output_schema?: Record;
fan_out_field?: string | null;
+ fan_out_strategy?: FanOutStrategy;
}
function isPlainObject(value: unknown): value is Record {
@@ -64,6 +69,11 @@ export async function handleCreateStage(
return errorResponse('invalid_request', "Field 'crawler_id' is required and must be a valid UUID", 400, context);
}
+ const crawlerPermission = await getCrawlerPermission(supabaseClient, body.crawler_id, userUUID);
+ if (crawlerPermission === undefined) {
+ return errorResponse('forbidden', 'You do not have permission to use this crawler', 403, context);
+ }
+
if (typeof body.stage_order !== 'number' || !Number.isInteger(body.stage_order) || body.stage_order < 0) {
return errorResponse('invalid_request', "Field 'stage_order' is required and must be a non-negative integer", 400, context);
}
@@ -82,6 +92,12 @@ export async function handleCreateStage(
}
}
+ if (body.fan_out_strategy !== undefined) {
+ if (body.fan_out_strategy !== 'compact' && body.fan_out_strategy !== 'preserve') {
+ return errorResponse('invalid_request', "Field 'fan_out_strategy' must be 'compact' or 'preserve'", 400, context);
+ }
+ }
+
try {
const stage = await createSchedulerStage(supabaseClient, {
scheduler_id: schedulerID,
@@ -90,6 +106,7 @@ export async function handleCreateStage(
input_schema: body.input_schema,
output_schema: body.output_schema,
fan_out_field: body.fan_out_field,
+ fan_out_strategy: body.fan_out_strategy,
});
return jsonResponse(stage, 201, context);
@@ -177,6 +194,11 @@ export async function handleUpdateStage(
if (typeof body.crawler_id !== 'string' || !UUID_PATTERN.test(body.crawler_id)) {
return errorResponse('invalid_request', "Field 'crawler_id' must be a valid UUID", 400, context);
}
+
+ const crawlerPermission = await getCrawlerPermission(supabaseClient, body.crawler_id, userUUID);
+ if (crawlerPermission === undefined) {
+ return errorResponse('forbidden', 'You do not have permission to use this crawler', 403, context);
+ }
}
if (body.input_schema !== undefined && !isPlainObject(body.input_schema)) {
@@ -193,7 +215,13 @@ export async function handleUpdateStage(
}
}
- if (body.crawler_id === undefined && body.input_schema === undefined && body.output_schema === undefined && body.fan_out_field === undefined) {
+ if (body.fan_out_strategy !== undefined) {
+ if (body.fan_out_strategy !== 'compact' && body.fan_out_strategy !== 'preserve') {
+ return errorResponse('invalid_request', "Field 'fan_out_strategy' must be 'compact' or 'preserve'", 400, context);
+ }
+ }
+
+ if (body.crawler_id === undefined && body.input_schema === undefined && body.output_schema === undefined && body.fan_out_field === undefined && body.fan_out_strategy === undefined) {
return errorResponse('invalid_request', 'At least one field must be provided for update', 400, context);
}
@@ -202,6 +230,7 @@ export async function handleUpdateStage(
if (body.input_schema !== undefined) updatePayload.input_schema = body.input_schema;
if (body.output_schema !== undefined) updatePayload.output_schema = body.output_schema;
if (body.fan_out_field !== undefined) updatePayload.fan_out_field = body.fan_out_field;
+ if (body.fan_out_strategy !== undefined) updatePayload.fan_out_strategy = body.fan_out_strategy;
try {
const stage = await updateSchedulerStage(supabaseClient, stageID, schedulerID, updatePayload);
diff --git a/workers/scheduler-manager-worker/sources/index.ts b/workers/scheduler-manager-worker/sources/index.ts
index bff7015..095eff3 100644
--- a/workers/scheduler-manager-worker/sources/index.ts
+++ b/workers/scheduler-manager-worker/sources/index.ts
@@ -27,6 +27,7 @@ import {
handleListRuns,
handleGetRun,
} from './handlers/scheduler-runs.ts';
+import { handleExecuteScheduler } from './handlers/scheduler-execution.ts';
import { UUID_PATTERN } from './handlers/tools.ts';
export interface Environment {
@@ -34,6 +35,7 @@ export interface Environment {
SUPABASE_URL: string;
SUPABASE_SECRET_KEY: string;
JWT_SECRET: string;
+ CRAWLER_MANAGER: Service;
}
const logger = createWorkerLogger({
@@ -60,12 +62,14 @@ const HELP = {
{ method: 'PUT', path: '/schedulers/:id/stages/reorder', description: 'Reorder stages' },
{ method: 'GET', path: '/schedulers/:id/runs', description: 'List runs for a scheduler' },
{ method: 'GET', path: '/schedulers/:id/runs/:runID', description: 'Get a run by ID' },
+ { method: 'POST', path: '/schedulers/:id/execute', description: 'Execute a scheduler pipeline' },
],
};
interface ParsedRoute {
type: 'schedulers_collection'
| 'scheduler_single'
+ | 'scheduler_execute'
| 'stages_collection'
| 'stage_single'
| 'stages_reorder'
@@ -91,6 +95,14 @@ function parseRoute(pathname: string): ParsedRoute {
return { type: 'scheduler_single', schedulerID: id };
}
+ // /schedulers/:id/execute
+ const executeMatch = pathname.match(/^\/schedulers\/([0-9a-f-]+)\/execute$/i);
+ if (executeMatch) {
+ const id = executeMatch[1];
+ if (!UUID_PATTERN.test(id)) return { type: null };
+ return { type: 'scheduler_execute', schedulerID: id };
+ }
+
// /schedulers/:id/stages/reorder
const reorderMatch = pathname.match(/^\/schedulers\/([0-9a-f-]+)\/stages\/reorder$/i);
if (reorderMatch) {
@@ -237,6 +249,15 @@ export default {
return response;
}
+ case 'scheduler_execute': {
+ if (request.method === 'POST') {
+ return await handleExecuteScheduler(environment, context, route.schedulerID!, userUUID);
+ }
+ const response = errorResponse('method_not_allowed', 'Method not allowed', 405, context);
+ response.headers.set('Allow', 'POST');
+ return response;
+ }
+
case 'stages_reorder': {
if (request.method === 'PUT') {
return await handleReorderStages(request, environment, context, route.schedulerID!, userUUID);
diff --git a/workers/scheduler-manager-worker/sources/scheduler-executor.ts b/workers/scheduler-manager-worker/sources/scheduler-executor.ts
new file mode 100644
index 0000000..a6ed4cd
--- /dev/null
+++ b/workers/scheduler-manager-worker/sources/scheduler-executor.ts
@@ -0,0 +1,193 @@
+import type { SupabaseClient } from '@audio-underview/supabase-connector';
+import type { Logger } from '@audio-underview/logger';
+import {
+ listSchedulerStages,
+ updateSchedulerRun,
+ updateScheduler,
+ createSchedulerStageRun,
+ updateSchedulerStageRun,
+} from '@audio-underview/supabase-connector';
+import type { CrawlerExecutionClient } from './crawler-execution-client.ts';
+import {
+ executeStage,
+ executeFanOut,
+ resolveDefaultInput,
+} from './stage-runner.ts';
+
+/** Collaborators that `executeScheduler` needs: storage client, crawler execution client and logger. */
+export interface ExecutorDependencies {
+ supabaseClient: SupabaseClient;
+ crawlerExecutionClient: CrawlerExecutionClient;
+ logger: Logger;
+}
+
+/**
+ * Executes the stages of a scheduler in listed order and records progress on the run.
+ *
+ * The run is first marked `running`. The first stage receives the defaults
+ * resolved from its `input_schema`; every later stage receives the previous
+ * stage's output. A stage with `fan_out_field` runs its crawler once per
+ * element of that array field of the current input. The run ends as
+ * `completed`, `partially_failed` (some fan-out items failed) or `failed`
+ * (an error was thrown; its message is stored in `error`).
+ *
+ * Errors are caught and persisted rather than re-thrown. Once `signal` is
+ * aborted the function stops writing run and scheduler state.
+ *
+ * @param dependencies - Supabase client, crawler execution client and logger.
+ * @param schedulerID - Scheduler whose stages are executed.
+ * @param userUUID - Passed to `updateScheduler` when stamping `last_run_at`.
+ * @param runID - Existing run row to update.
+ * @param signal - Optional abort signal, checked between stages and before final writes.
+ */
+export async function executeScheduler(
+  dependencies: ExecutorDependencies,
+  schedulerID: string,
+  userUUID: string,
+  runID: string,
+  signal?: AbortSignal,
+): Promise<void> {
+  const { supabaseClient, logger } = dependencies;
+
+  try {
+    // Mark run as running
+    await updateSchedulerRun(supabaseClient, runID, schedulerID, {
+      status: 'running',
+      started_at: new Date().toISOString(),
+    });
+
+    const stages = await listSchedulerStages(supabaseClient, schedulerID);
+
+    if (stages.length === 0) {
+      await updateSchedulerRun(supabaseClient, runID, schedulerID, {
+        status: 'completed',
+        completed_at: new Date().toISOString(),
+        result: null,
+      });
+      return;
+    }
+
+    let currentInput: unknown = resolveDefaultInput(stages[0].input_schema);
+    let lastOutput: unknown = null;
+    let hasPartialFailure = false;
+
+    for (const stage of stages) {
+      if (signal?.aborted) break;
+
+      // Fan-out check
+      if (stage.fan_out_field) {
+        if (currentInput !== null && currentInput !== undefined && typeof currentInput !== 'object') {
+          throw new Error(
+            `Stage ${stage.stage_order}: fan_out_field "${stage.fan_out_field}" requires object input, got ${typeof currentInput}`,
+          );
+        }
+        const inputObject = currentInput as Record<string, unknown> | null;
+        const fanOutItems = inputObject?.[stage.fan_out_field];
+
+        if (fanOutItems === undefined || fanOutItems === null) {
+          throw new Error(
+            `Stage ${stage.stage_order}: fan_out_field "${stage.fan_out_field}" not found in input`,
+          );
+        }
+
+        if (!Array.isArray(fanOutItems)) {
+          throw new Error(
+            `Stage ${stage.stage_order}: fan_out_field "${stage.fan_out_field}" is not an array`,
+          );
+        }
+
+        // Create stage_run record for the fan-out stage
+        const stageRun = await createSchedulerStageRun(supabaseClient, {
+          run_id: runID,
+          stage_id: stage.id,
+          stage_order: stage.stage_order,
+          status: 'running',
+          started_at: new Date().toISOString(),
+          input: currentInput,
+        });
+
+        if (fanOutItems.length === 0) {
+          // Nothing to fan out over: record an empty, successful stage and move on.
+          await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, {
+            status: 'completed',
+            completed_at: new Date().toISOString(),
+            output: [],
+            items_total: 0,
+            items_succeeded: 0,
+            items_failed: 0,
+          });
+          currentInput = [];
+          lastOutput = [];
+          continue;
+        }
+
+        // Items are processed one at a time (concurrency 1).
+        const fanOutResult = await executeFanOut(
+          dependencies,
+          stage,
+          fanOutItems,
+          1,
+          signal,
+        );
+
+        await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, {
+          status: fanOutResult.status,
+          completed_at: new Date().toISOString(),
+          output: fanOutResult.results,
+          items_total: fanOutResult.itemsTotal,
+          items_succeeded: fanOutResult.itemsSucceeded,
+          items_failed: fanOutResult.itemsFailed,
+        });
+
+        if (fanOutResult.status === 'failed') {
+          throw new Error(
+            `Stage ${stage.stage_order}: all fan-out items failed`,
+          );
+        }
+
+        if (fanOutResult.status === 'partially_failed') {
+          hasPartialFailure = true;
+        }
+
+        currentInput = fanOutResult.results;
+        lastOutput = fanOutResult.results;
+      } else {
+        // Normal stage execution
+        const stageResult = await executeStage(
+          dependencies,
+          runID,
+          stage,
+          currentInput,
+          signal,
+        );
+
+        currentInput = stageResult.output;
+        lastOutput = stageResult.output;
+      }
+    }
+
+    // Pipeline completed successfully — skip if handler already timed out
+    if (signal?.aborted) return;
+
+    await updateSchedulerRun(supabaseClient, runID, schedulerID, {
+      status: hasPartialFailure ? 'partially_failed' : 'completed',
+      completed_at: new Date().toISOString(),
+      result: lastOutput,
+    });
+  } catch (error: unknown) {
+    // After an abort no further status is written here.
+    if (signal?.aborted) return;
+
+    const errorMessage = error instanceof Error ? error.message : String(error);
+
+    logger.error('Scheduler execution failed', error, {
+      function: 'executeScheduler',
+      metadata: { schedulerID, runID },
+    });
+
+    await updateSchedulerRun(supabaseClient, runID, schedulerID, {
+      status: 'failed',
+      completed_at: new Date().toISOString(),
+      error: errorMessage,
+    }).catch((updateError: unknown) => {
+      logger.error('Failed to update run status after error', updateError, {
+        function: 'executeScheduler',
+        metadata: { schedulerID, runID },
+      });
+    });
+  } finally {
+    // Stamp last_run_at on every outcome except an abort. Guarded with a plain
+    // `if` instead of `return` so the `finally` block never overrides control flow.
+    if (!signal?.aborted) {
+      await updateScheduler(supabaseClient, schedulerID, userUUID, {
+        last_run_at: new Date().toISOString(),
+      }).catch((updateError: unknown) => {
+        logger.error('Failed to update scheduler last_run_at', updateError, {
+          function: 'executeScheduler',
+          metadata: { schedulerID },
+        });
+      });
+    }
+  }
+}
diff --git a/workers/scheduler-manager-worker/sources/stage-runner.ts b/workers/scheduler-manager-worker/sources/stage-runner.ts
new file mode 100644
index 0000000..6f7b318
--- /dev/null
+++ b/workers/scheduler-manager-worker/sources/stage-runner.ts
@@ -0,0 +1,160 @@
+import type { SupabaseClient } from '@audio-underview/supabase-connector';
+import type { Logger } from '@audio-underview/logger';
+import type {
+ SchedulerStageRow,
+ SchedulerStageRunRow,
+} from '@audio-underview/supabase-connector';
+import {
+ createSchedulerStageRun,
+ updateSchedulerStageRun,
+} from '@audio-underview/supabase-connector';
+import type { CrawlerExecutionClient } from './crawler-execution-client.ts';
+
+/** Collaborators used by the stage helpers in this module: storage client, crawler execution client and logger. */
+export interface StageRunnerDependencies {
+ supabaseClient: SupabaseClient;
+ crawlerExecutionClient: CrawlerExecutionClient;
+ logger: Logger;
+}
+
+/**
+ * Outcome of `executeStage`.
+ * `output` is the crawler's `result`; `stageRun` is the stage-run row after the
+ * completion update, or the row as created when that update returned nothing.
+ */
+export interface StageResult {
+ output: unknown;
+ stageRun: SchedulerStageRunRow;
+}
+
+/**
+ * Aggregate outcome of `executeFanOut`.
+ * `results` holds the per-item outputs after the stage's fan-out strategy was
+ * applied; the counters summarize per-item outcomes; `status` is `completed`
+ * when nothing failed, `partially_failed` when some items succeeded and some
+ * failed, and `failed` when failures occurred and nothing succeeded.
+ */
+export interface FanOutResult {
+ results: unknown[];
+ itemsTotal: number;
+ itemsSucceeded: number;
+ itemsFailed: number;
+ status: 'completed' | 'partially_failed' | 'failed';
+}
+
+/**
+ * Builds a stage's initial input from the `default` values declared in its input schema.
+ *
+ * Each schema entry whose value is an object containing a `default` key
+ * contributes `key -> default`; all other entries are ignored.
+ *
+ * @param inputSchema - Expected to be a plain object keyed by field name.
+ * @returns An object holding only the fields that declare a default.
+ * @throws Error starting with `Invalid input_schema` when the schema is
+ *   `null`, `undefined`, an array, or not an object.
+ */
+export function resolveDefaultInput(inputSchema: unknown): Record<string, unknown> {
+  if (inputSchema === null || typeof inputSchema !== 'object' || Array.isArray(inputSchema)) {
+    // `typeof` reports 'object' for both null and arrays, which would make the
+    // message read "expected object, got object"; name those cases explicitly.
+    let actual: string = typeof inputSchema;
+    if (inputSchema === null) {
+      actual = 'null';
+    } else if (Array.isArray(inputSchema)) {
+      actual = 'array';
+    }
+    throw new Error(`Invalid input_schema: expected object, got ${actual}`);
+  }
+
+  const defaults: Record<string, unknown> = {};
+  for (const [key, value] of Object.entries(inputSchema)) {
+    if (value !== null && value !== undefined && typeof value === 'object' && 'default' in value) {
+      defaults[key] = (value as Record<string, unknown>).default;
+    }
+  }
+  return defaults;
+}
+
+/**
+ * Runs one non-fan-out stage: records a `running` stage-run row, executes the
+ * stage's crawler with `input`, then marks the row `completed` or `failed`.
+ *
+ * @param dependencies - Supabase client, crawler execution client and logger.
+ * @param runID - Run the stage-run row belongs to.
+ * @param stage - Stage definition; its `crawler_id` selects the crawler.
+ * @param input - Value passed to the crawler and stored on the stage-run row.
+ * @param signal - When already aborted, the stage is rejected before any row is created.
+ * @returns The crawler's `result` together with the stage-run row.
+ * @throws The original error after the row has been marked `failed`, or an
+ *   abort error when `signal` is already aborted.
+ */
+export async function executeStage(
+  dependencies: StageRunnerDependencies,
+  runID: string,
+  stage: SchedulerStageRow,
+  input: unknown,
+  signal?: AbortSignal,
+): Promise<StageResult> {
+  const { supabaseClient, crawlerExecutionClient, logger } = dependencies;
+
+  if (signal?.aborted) {
+    throw new Error('Stage execution aborted: pipeline timed out');
+  }
+
+  const stageRun = await createSchedulerStageRun(supabaseClient, {
+    run_id: runID,
+    stage_id: stage.id,
+    stage_order: stage.stage_order,
+    status: 'running',
+    started_at: new Date().toISOString(),
+    input,
+  });
+
+  try {
+    const response = await crawlerExecutionClient.execute(stage.crawler_id, input);
+
+    const updatedStageRun = await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, {
+      status: 'completed',
+      completed_at: new Date().toISOString(),
+      output: response.result,
+    });
+
+    // Fall back to the row as created when the update returned nothing.
+    return { output: response.result, stageRun: updatedStageRun ?? stageRun };
+  } catch (error: unknown) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
+
+    logger.error('Stage execution failed', error, {
+      function: 'executeStage',
+      metadata: { stageID: stage.id, stageOrder: stage.stage_order },
+    });
+
+    // Best effort: a failure to record the failure must not mask the original error.
+    await updateSchedulerStageRun(supabaseClient, stageRun.id, runID, {
+      status: 'failed',
+      completed_at: new Date().toISOString(),
+      error: errorMessage,
+    }).catch((updateError: unknown) => {
+      logger.error('Failed to update stage run status after error', updateError, {
+        function: 'executeStage',
+        metadata: { stageRunID: stageRun.id, runID },
+      });
+    });
+
+    throw error;
+  }
+}
+
+// Sentinel for result slots that never received a successful result.
+const FAN_OUT_FAILED = Symbol('fan-out-failed');
+
+/**
+ * Executes the stage's crawler once per item and aggregates the outcomes.
+ *
+ * Up to `concurrency` workers pull items from a shared cursor. A failing item
+ * is logged and does not stop the others. Items not processed because
+ * `signal` was aborted are counted as failed, so
+ * `itemsSucceeded + itemsFailed === itemsTotal` always holds.
+ *
+ * Result shaping follows `stage.fan_out_strategy` (default `compact`):
+ * `preserve` keeps positions and stores `null` for missing results, while
+ * `compact` drops them.
+ *
+ * @param dependencies - Crawler execution client and logger (the Supabase client is unused here).
+ * @param stage - Stage definition supplying `crawler_id`, `id` and `fan_out_strategy`.
+ * @param items - Inputs, one crawler execution each.
+ * @param concurrency - Desired number of parallel workers; values below 1 or `NaN` are treated as 1.
+ * @param signal - Optional abort signal; checked before each item is started.
+ * @returns Shaped results, counters and an overall status.
+ */
+export async function executeFanOut(
+  dependencies: StageRunnerDependencies,
+  stage: SchedulerStageRow,
+  items: unknown[],
+  concurrency: number = 1,
+  signal?: AbortSignal,
+): Promise<FanOutResult> {
+  const { crawlerExecutionClient, logger } = dependencies;
+
+  const results: unknown[] = new Array(items.length).fill(FAN_OUT_FAILED);
+  let itemsSucceeded = 0;
+  let nextIndex = 0;
+
+  async function worker(): Promise<void> {
+    while (nextIndex < items.length) {
+      if (signal?.aborted) break;
+      // Claim the index synchronously so no two workers take the same item.
+      const index = nextIndex++;
+      try {
+        const response = await crawlerExecutionClient.execute(stage.crawler_id, items[index]);
+        results[index] = response.result;
+        itemsSucceeded++;
+      } catch (error: unknown) {
+        logger.warn('Fan-out item failed', error, {
+          function: 'executeFanOut',
+          metadata: { stageID: stage.id, itemIndex: index },
+        });
+      }
+    }
+  }
+
+  // A non-positive or NaN concurrency would start no workers and silently
+  // report an untouched batch as completed; always run at least one worker.
+  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
+  const workerCount = Math.min(Math.max(requested, 1), items.length);
+  await Promise.all(Array.from({ length: workerCount }, () => worker()));
+
+  // Everything without a successful result counts as failed, including items
+  // skipped after an abort.
+  const itemsFailed = items.length - itemsSucceeded;
+
+  let status: FanOutResult['status'];
+  if (itemsFailed === 0) {
+    status = 'completed';
+  } else if (itemsSucceeded > 0) {
+    status = 'partially_failed';
+  } else {
+    status = 'failed';
+  }
+
+  const strategy = stage.fan_out_strategy ?? 'compact';
+  const finalResults = strategy === 'preserve'
+    ? results.map((result) => (result === FAN_OUT_FAILED ? null : result))
+    : results.filter((result) => result !== FAN_OUT_FAILED);
+
+  return {
+    results: finalResults,
+    itemsTotal: items.length,
+    itemsSucceeded,
+    itemsFailed,
+    status,
+  };
+}
diff --git a/workers/scheduler-manager-worker/tests/index.test.ts b/workers/scheduler-manager-worker/tests/index.test.ts
index 580570b..6d60f5a 100644
--- a/workers/scheduler-manager-worker/tests/index.test.ts
+++ b/workers/scheduler-manager-worker/tests/index.test.ts
@@ -47,6 +47,7 @@ function mockStageResponse(overrides: Record = {}) {
input_schema: { url: { type: 'string', default: 'https://example.com' } },
output_schema: {},
fan_out_field: null,
+ fan_out_strategy: 'compact',
created_at: '2026-01-01T00:00:00Z',
...overrides,
};
@@ -89,6 +90,19 @@ function mockSupabaseSchedulerGet(data: unknown = mockSchedulerResponse()) {
.reply(200, JSON.stringify(data));
}
+/**
+ * Registers a fetchMock interceptor for GET requests to the crawler_permissions
+ * table that answers 200 with an owner-level permission row for the mock user.
+ */
+function mockCrawlerPermission() {
+  const ownerPermission = {
+    id: '00000000-0000-0000-0000-000000000099',
+    crawler_id: MOCK_CRAWLER_ID,
+    user_uuid: MOCK_USER_UUID,
+    level: 'owner',
+    created_at: '2026-01-01T00:00:00Z',
+  };
+  const supabase = fetchMock.get('https://supabase.example.com');
+  supabase
+    .intercept({ path: /^\/rest\/v1\/crawler_permissions/, method: 'GET' })
+    .reply(200, JSON.stringify(ownerPermission));
+}
+
function mockSupabaseSchedulerNotFound() {
fetchMock
.get('https://supabase.example.com')
@@ -402,6 +416,8 @@ describe('scheduler-manager-worker', () => {
it('creates a stage and returns 201', async () => {
// Mock scheduler ownership check
mockSupabaseSchedulerGet();
+ // Mock crawler permission check
+ mockCrawlerPermission();
// Mock stage creation
fetchMock
.get('https://supabase.example.com')
diff --git a/workers/scheduler-manager-worker/tests/scheduler-execution.test.ts b/workers/scheduler-manager-worker/tests/scheduler-execution.test.ts
new file mode 100644
index 0000000..b44baa9
--- /dev/null
+++ b/workers/scheduler-manager-worker/tests/scheduler-execution.test.ts
@@ -0,0 +1,87 @@
+import { describe, it, expect } from 'vitest';
+import { resolveHTTPStatus } from '../sources/handlers/scheduler-execution.ts';
+import { validateCrawlerExecuteResult } from '@audio-underview/worker-tools';
+
+// Table-driven suite. Each row is [test title, run status, error message, expected HTTP status].
+describe('resolveHTTPStatus', () => {
+  const cases = [
+    ['returns 200 for completed', 'completed', null, 200],
+    ['returns 200 for partially_failed', 'partially_failed', null, 200],
+    ['returns 200 for failed with no error message', 'failed', null, 200],
+    ['returns 500 for failed with unknown error', 'failed', 'Something went wrong', 500],
+    ['returns 408 for pipeline timeout', 'failed', 'Pipeline execution timed out after 5 minutes', 408],
+    ['returns 422 for invalid input_schema', 'failed', 'Invalid input_schema: expected object, got string', 422],
+    ['returns 422 for fan_out_field error', 'failed', 'Stage 1: fan_out_field "items" not found in input', 422],
+    ['returns 502 for code-runner error', 'failed', 'CodeRunner error 500: [server_error] Server returned 500', 502],
+    ['returns 502 for invalid RPC response', 'failed', 'Invalid CrawlerExecuteResult: expected object', 502],
+    ['returns 503 for database error', 'failed', 'Supabase request failed', 503],
+    ['returns 503 for database connection error', 'failed', 'database connection refused', 503],
+    ['returns 200 for pending status', 'pending', null, 200],
+    ['returns 200 for running status', 'running', null, 200],
+  ] as const;
+
+  for (const [title, status, errorMessage, expectedStatus] of cases) {
+    it(title, () => {
+      expect(resolveHTTPStatus(status, errorMessage)).toBe(expectedStatus);
+    });
+  }
+});
+
+// Accepting cases are spelled out; rejecting cases share one table of
+// [test title, input, substring expected in the thrown message].
+describe('validateCrawlerExecuteResult', () => {
+  it('accepts valid web result', () => {
+    const validated = validateCrawlerExecuteResult({ type: 'web', result: { data: 'hello' } });
+    expect(validated.type).toBe('web');
+    expect(validated.result).toEqual({ data: 'hello' });
+  });
+
+  it('accepts valid data result with null', () => {
+    const validated = validateCrawlerExecuteResult({ type: 'data', result: null });
+    expect(validated.type).toBe('data');
+    expect(validated.result).toBeNull();
+  });
+
+  const rejected: [string, unknown, string][] = [
+    ['throws on null input', null, 'expected object'],
+    ['throws on non-object input', 'string', 'expected object'],
+    ['throws on invalid type', { type: 'unknown', result: {} }, "expected type 'web' or 'data'"],
+    ['throws on missing result field', { type: 'web' }, 'missing result field'],
+  ];
+
+  for (const [title, input, expectedMessage] of rejected) {
+    it(title, () => {
+      expect(() => validateCrawlerExecuteResult(input)).toThrow(expectedMessage);
+    });
+  }
+});
diff --git a/workers/scheduler-manager-worker/tests/scheduler-executor.test.ts b/workers/scheduler-manager-worker/tests/scheduler-executor.test.ts
new file mode 100644
index 0000000..12a6f29
--- /dev/null
+++ b/workers/scheduler-manager-worker/tests/scheduler-executor.test.ts
@@ -0,0 +1,603 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { env, fetchMock } from 'cloudflare:test';
+import { createSupabaseClient } from '@audio-underview/supabase-connector';
+import type { SchedulerStageRow } from '@audio-underview/supabase-connector';
+import type { CrawlerExecutionClient } from '../sources/crawler-execution-client.ts';
+import type { Logger } from '@audio-underview/logger';
+import { executeScheduler } from '../sources/scheduler-executor.ts';
+import type { ExecutorDependencies } from '../sources/scheduler-executor.ts';
+
+// Fixed UUID fixtures referenced by the row factories and test cases in this file.
+const SCHEDULER_ID = '00000000-0000-0000-0000-000000000010';
+const USER_UUID = '00000000-0000-0000-0000-000000000001';
+const RUN_ID = '00000000-0000-0000-0000-000000000040';
+const STAGE_ID = '00000000-0000-0000-0000-000000000020';
+const STAGE_ID_2 = '00000000-0000-0000-0000-000000000021';
+const CRAWLER_ID = '00000000-0000-0000-0000-000000000030';
+const CRAWLER_ID_2 = '00000000-0000-0000-0000-000000000031';
+const STAGE_RUN_ID = '00000000-0000-0000-0000-000000000050';
+const STAGE_RUN_ID_2 = '00000000-0000-0000-0000-000000000051';
+
+/**
+ * Builds a scheduler stage fixture: stage_order 0, no fan-out, and an input_schema
+ * that declares a default URL. Fields in `overrides` replace the defaults.
+ */
+function mockStage(overrides: Partial<SchedulerStageRow> = {}): SchedulerStageRow {
+  return {
+    id: STAGE_ID,
+    scheduler_id: SCHEDULER_ID,
+    crawler_id: CRAWLER_ID,
+    stage_order: 0,
+    input_schema: { url: { type: 'string', default: 'https://example.com' } },
+    output_schema: {},
+    fan_out_field: null,
+    fan_out_strategy: 'compact',
+    created_at: '2026-01-01T00:00:00Z',
+    ...overrides,
+  } as SchedulerStageRow;
+}
+
+/** Builds a scheduler_runs row fixture in the 'running' state; `overrides` replace individual fields. */
+function mockRunRow(overrides: Record<string, unknown> = {}) {
+  return {
+    id: RUN_ID,
+    scheduler_id: SCHEDULER_ID,
+    status: 'running',
+    started_at: '2026-01-01T00:00:00Z',
+    completed_at: null,
+    result: null,
+    error: null,
+    created_at: '2026-01-01T00:00:00Z',
+    ...overrides,
+  };
+}
+
+/** Builds an enabled schedulers row fixture with no cron expression and no previous run; `overrides` replace individual fields. */
+function mockSchedulerRow(overrides: Record<string, unknown> = {}) {
+  return {
+    id: SCHEDULER_ID,
+    user_uuid: USER_UUID,
+    name: 'Test Scheduler',
+    cron_expression: null,
+    is_enabled: true,
+    last_run_at: null,
+    created_at: '2026-01-01T00:00:00Z',
+    updated_at: '2026-01-01T00:00:00Z',
+    ...overrides,
+  };
+}
+
+/** Builds a scheduler_stage_runs row fixture in the 'running' state with empty input, output and item counters; `overrides` replace individual fields. */
+function mockStageRunRow(overrides: Record<string, unknown> = {}) {
+  return {
+    id: STAGE_RUN_ID,
+    run_id: RUN_ID,
+    stage_id: STAGE_ID,
+    stage_order: 0,
+    status: 'running',
+    started_at: '2026-01-01T00:00:00Z',
+    completed_at: null,
+    input: null,
+    output: null,
+    error: null,
+    items_total: null,
+    items_succeeded: null,
+    items_failed: null,
+    created_at: '2026-01-01T00:00:00Z',
+    ...overrides,
+  };
+}
+
+// --- Supabase mock helpers ---
+
+/** Registers an interceptor answering a GET on scheduler_stages with the given stage list (status 200). */
+function mockListSchedulerStages(stages: SchedulerStageRow[]) {
+  const responseBody = JSON.stringify(stages);
+  const supabase = fetchMock.get('https://supabase.example.com');
+  supabase
+    .intercept({ path: /^\/rest\/v1\/scheduler_stages/, method: 'GET' })
+    .reply(200, responseBody);
+}
+
+/** Registers an interceptor answering a PATCH on scheduler_runs with a run row built from `overrides` (status 200). */
+function mockUpdateSchedulerRun(overrides: Record<string, unknown> = {}) {
+  fetchMock
+    .get('https://supabase.example.com')
+    .intercept({ path: /^\/rest\/v1\/scheduler_runs/, method: 'PATCH' })
+    .reply(200, JSON.stringify(mockRunRow(overrides)));
+}
+
+/** Registers an interceptor answering a PATCH on schedulers with a scheduler row built from `overrides` (status 200). */
+function mockUpdateScheduler(overrides: Record<string, unknown> = {}) {
+  fetchMock
+    .get('https://supabase.example.com')
+    .intercept({ path: /^\/rest\/v1\/schedulers/, method: 'PATCH' })
+    .reply(200, JSON.stringify(mockSchedulerRow(overrides)));
+}
+
+/** Registers an interceptor answering a POST on scheduler_stage_runs with a stage-run row built from `overrides` (status 201). */
+function mockCreateStageRun(overrides: Record<string, unknown> = {}) {
+  fetchMock
+    .get('https://supabase.example.com')
+    .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method: 'POST' })
+    .reply(201, JSON.stringify(mockStageRunRow(overrides)));
+}
+
+/** Registers an interceptor answering a PATCH on scheduler_stage_runs with a stage-run row built from `overrides` (status 200). */
+function mockUpdateStageRun(overrides: Record<string, unknown> = {}) {
+  fetchMock
+    .get('https://supabase.example.com')
+    .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method: 'PATCH' })
+    .reply(200, JSON.stringify(mockStageRunRow(overrides)));
+}
+
+/** Registers an interceptor that makes a PATCH on scheduler_runs fail with HTTP 500. */
+function mockUpdateSchedulerRunError() {
+  const failureBody = JSON.stringify({ message: 'Internal Server Error' });
+  const supabase = fetchMock.get('https://supabase.example.com');
+  supabase
+    .intercept({ path: /^\/rest\/v1\/scheduler_runs/, method: 'PATCH' })
+    .reply(500, failureBody);
+}
+
+// --- Crawler execution client mock ---
+
+/**
+ * Creates a crawler execution client whose `execute` is a vi.fn spy.
+ * The n-th call resolves to `{ type: 'data', result: results[n] }`; when that entry is
+ * null or undefined (including running past the end of the list) the result falls back
+ * to `{ extracted: 'data' }`.
+ */
+function createMockCrawlerExecutionClient(
+  results: unknown[] = [],
+): CrawlerExecutionClient & { execute: ReturnType<typeof vi.fn> } {
+  let callIndex = 0;
+  return {
+    execute: vi.fn().mockImplementation(async () => {
+      const result = results[callIndex] ?? { extracted: 'data' };
+      callIndex++;
+      return { type: 'data' as const, result };
+    }),
+  };
+}
+
+/**
+ * Creates a Logger stand-in whose info/warn/error/debug methods are vi.fn spies
+ * and whose createChild returns the same stub.
+ */
+function createMockLogger(): Logger {
+  const [info, warn, error, debug] = [vi.fn(), vi.fn(), vi.fn(), vi.fn()];
+  const createChild = vi.fn().mockReturnThis();
+  const stub = { info, warn, error, debug, createChild };
+  return stub as unknown as Logger;
+}
+
+/**
+ * Assembles the executor dependencies for a test: a Supabase client built from the
+ * test environment, a scripted crawler client and a spy logger. The two mocks are also
+ * returned separately so tests can reach their spy APIs.
+ *
+ * @param crawlerResults - Results handed out by successive `execute` calls.
+ */
+function createDependencies(crawlerResults: unknown[] = []): {
+  dependencies: ExecutorDependencies;
+  crawlerExecutionClient: ReturnType<typeof createMockCrawlerExecutionClient>;
+  logger: ReturnType<typeof createMockLogger>;
+} {
+  const crawlerExecutionClient = createMockCrawlerExecutionClient(crawlerResults);
+  const logger = createMockLogger();
+  const supabaseClient = createSupabaseClient({
+    supabaseURL: env.SUPABASE_URL,
+    supabaseSecretKey: env.SUPABASE_SECRET_KEY,
+  });
+  return {
+    dependencies: { supabaseClient, crawlerExecutionClient, logger },
+    crawlerExecutionClient,
+    logger,
+  };
+}
+
+// Before every test: switch fetchMock on and disallow unmocked network connections.
+beforeEach(() => {
+ fetchMock.activate();
+ fetchMock.disableNetConnect();
+});
+
+// After every test: switch fetchMock off again.
+afterEach(() => {
+ fetchMock.deactivate();
+});
+
+describe('executeScheduler', () => {
+ // An empty stage list means the crawler client must never be invoked.
+ it('marks run as completed with result null when stages are empty', async () => {
+ const { dependencies } = createDependencies();
+
+ // 1. updateSchedulerRun → status: 'running'
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages → empty array
+ mockListSchedulerStages([]);
+ // 3. updateSchedulerRun → status: 'completed', result: null
+ mockUpdateSchedulerRun({ status: 'completed', result: null });
+ // finally: updateScheduler → last_run_at
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ expect(dependencies.crawlerExecutionClient.execute).not.toHaveBeenCalled();
+ });
+
+ // A single stage is called exactly once, with the defaults declared in its input_schema.
+ it('executes a single stage and marks run completed with output', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies([
+ { title: 'Test Page' },
+ ]);
+
+ // 1. updateSchedulerRun → status: 'running'
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages → one stage
+ mockListSchedulerStages([mockStage()]);
+ // 3. executeStage internals:
+ // a. createSchedulerStageRun
+ mockCreateStageRun();
+ // b. crawlerExecutionClient.execute → via mock
+ // c. updateSchedulerStageRun
+ mockUpdateStageRun({ status: 'completed', output: { title: 'Test Page' } });
+ // 4. updateSchedulerRun → status: 'completed'
+ mockUpdateSchedulerRun({ status: 'completed', result: { title: 'Test Page' } });
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(1);
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledWith(CRAWLER_ID, {
+ url: 'https://example.com',
+ });
+ });
+
+ // Two non-fan-out stages: the second crawler call must receive the first call's result as its input.
+ it('chains multi-stage output: stage N output becomes stage N+1 input', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies([
+ { urls: ['https://a.com', 'https://b.com'] },
+ { results: [1, 2] },
+ ]);
+
+ const stage1 = mockStage({
+ id: STAGE_ID,
+ crawler_id: CRAWLER_ID,
+ stage_order: 0,
+ });
+ const stage2 = mockStage({
+ id: STAGE_ID_2,
+ crawler_id: CRAWLER_ID_2,
+ stage_order: 1,
+ input_schema: {},
+ });
+
+ // 1. updateSchedulerRun → running
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages
+ mockListSchedulerStages([stage1, stage2]);
+ // 3. stage1: createStageRun + updateStageRun
+ mockCreateStageRun();
+ mockUpdateStageRun({ status: 'completed' });
+ // 4. stage2: createStageRun + updateStageRun
+ mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 });
+ mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'completed' });
+ // 5. updateSchedulerRun → completed
+ mockUpdateSchedulerRun({ status: 'completed' });
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(2);
+ // Stage 1 gets default input from input_schema
+ expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(1, CRAWLER_ID, {
+ url: 'https://example.com',
+ });
+ // Stage 2 gets output of stage 1 as input
+ expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(2, CRAWLER_ID_2, {
+ urls: ['https://a.com', 'https://b.com'],
+ });
+ });
+
+ // Stage 1 yields a two-element `links` array; the fan-out stage must call its crawler once per element.
+ it('handles fan-out stage: validates fan_out_field exists and is array', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies([
+ { links: ['https://a.com', 'https://b.com'] },
+ ]);
+
+ const stage1 = mockStage({ stage_order: 0 });
+ const stage2 = mockStage({
+ id: STAGE_ID_2,
+ crawler_id: CRAWLER_ID_2,
+ stage_order: 1,
+ fan_out_field: 'links',
+ input_schema: {},
+ });
+
+ // 1. updateSchedulerRun → running
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages
+ mockListSchedulerStages([stage1, stage2]);
+ // 3. stage1 (normal): createStageRun + updateStageRun
+ mockCreateStageRun();
+ mockUpdateStageRun({ status: 'completed' });
+ // 4. stage2 (fan-out): createStageRun for the fan-out stage
+ mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 });
+ // 5. executeFanOut calls crawlerExecutionClient.execute for each item
+ // (2 items from links array → 2 crawler calls, producing results for items 2 and 3)
+ // 6. updateStageRun for the fan-out stage
+ mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'completed' });
+ // 7. updateSchedulerRun → completed
+ mockUpdateSchedulerRun({ status: 'completed' });
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ // Stage 1 + 2 fan-out items = 3 total crawler calls
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(3);
+ expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(2, CRAWLER_ID_2, 'https://a.com');
+ expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(3, CRAWLER_ID_2, 'https://b.com');
+ });
+
+ // Stage 1 yields an empty `links` array, so the fan-out stage must not call the crawler at all.
+ it('completes fan-out with empty array: output is []', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies([
+ { links: [] },
+ ]);
+
+ const stage1 = mockStage({ stage_order: 0 });
+ const stage2 = mockStage({
+ id: STAGE_ID_2,
+ stage_order: 1,
+ fan_out_field: 'links',
+ input_schema: {},
+ });
+
+ // 1. updateSchedulerRun → running
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages
+ mockListSchedulerStages([stage1, stage2]);
+ // 3. stage1 (normal): createStageRun + updateStageRun
+ mockCreateStageRun();
+ mockUpdateStageRun({ status: 'completed' });
+ // 4. stage2 (fan-out): createStageRun
+ mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 });
+ // 5. empty array → updateStageRun with output: []
+ mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'completed', output: [] });
+ // 6. updateSchedulerRun → completed (lastOutput = [])
+ mockUpdateSchedulerRun({ status: 'completed', result: [] });
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ // Only stage 1 executes via crawler; fan-out stage has empty array
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(1);
+ });
+
+  // Stage 1 must actually produce the `items` array that stage 2 fans out over.
+  // Without it the run fails earlier, on the missing fan_out_field, and this test
+  // would pass without ever reaching the fan-out path.
+  it('fails run when all fan-out items fail', async () => {
+    const { dependencies, crawlerExecutionClient, logger } = createDependencies();
+
+    crawlerExecutionClient.execute
+      .mockReset()
+      .mockImplementationOnce(async () => ({ type: 'data' as const, result: { items: ['a', 'b'] } }))
+      .mockImplementationOnce(async () => { throw new Error('fan-out item 1 failed'); })
+      .mockImplementationOnce(async () => { throw new Error('fan-out item 2 failed'); });
+
+    const stage1 = mockStage({ stage_order: 0 });
+    const stage2 = mockStage({
+      id: STAGE_ID_2,
+      stage_order: 1,
+      fan_out_field: 'items',
+      input_schema: {},
+    });
+
+    // 1. updateSchedulerRun → running
+    mockUpdateSchedulerRun({ status: 'running' });
+    // 2. listSchedulerStages
+    mockListSchedulerStages([stage1, stage2]);
+    // 3. stage1: createStageRun + updateStageRun
+    mockCreateStageRun();
+    mockUpdateStageRun({ status: 'completed' });
+    // 4. stage2 fan-out: createStageRun
+    mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 });
+    // 5. executeFanOut → all fail → status: 'failed'
+    // 6. updateStageRun with status: 'failed'
+    mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'failed' });
+    // 7. throw → catch block: updateSchedulerRun → status: 'failed'
+    mockUpdateSchedulerRun({ status: 'failed', error: 'Stage 1: all fan-out items failed' });
+    // finally: updateScheduler
+    mockUpdateScheduler();
+
+    await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+    // Stage 1 + 2 failing fan-out items = 3 crawler calls; proves the fan-out really ran.
+    expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(3);
+    expect(logger.error).toHaveBeenCalledWith(
+      'Scheduler execution failed',
+      expect.any(Error),
+      expect.objectContaining({ function: 'executeScheduler' }),
+    );
+  });
+
+ // One fan-out item succeeds and one throws; the run must still finish and make all three crawler calls.
+ it('sets run status to partially_failed when some fan-out items fail', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+
+ crawlerExecutionClient.execute
+ .mockReset()
+ .mockImplementationOnce(async () => ({ type: 'data' as const, result: { items: ['a', 'b'] } }))
+ .mockImplementationOnce(async () => ({ type: 'data' as const, result: 'ok-a' }))
+ .mockImplementationOnce(async () => { throw new Error('item b failed'); });
+
+ const stage1 = mockStage({ stage_order: 0 });
+ const stage2 = mockStage({
+ id: STAGE_ID_2,
+ stage_order: 1,
+ fan_out_field: 'items',
+ input_schema: {},
+ });
+
+ // 1. updateSchedulerRun → running
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages
+ mockListSchedulerStages([stage1, stage2]);
+ // 3. stage1: createStageRun + updateStageRun
+ mockCreateStageRun();
+ mockUpdateStageRun({ status: 'completed' });
+ // 4. stage2 fan-out: createStageRun
+ mockCreateStageRun({ id: STAGE_RUN_ID_2, stage_id: STAGE_ID_2, stage_order: 1 });
+ // 5. executeFanOut → partially_failed
+ // 6. updateStageRun with status: 'partially_failed'
+ mockUpdateStageRun({ id: STAGE_RUN_ID_2, status: 'partially_failed' });
+ // 7. updateSchedulerRun → partially_failed (hasPartialFailure = true)
+ mockUpdateSchedulerRun({ status: 'partially_failed' });
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(3);
+ });
+
+ // A rejected crawler call must be logged by executeScheduler together with the scheduler and run IDs.
+ it('marks run as failed when a stage throws an error', async () => {
+ const { dependencies, crawlerExecutionClient, logger } = createDependencies();
+
+ crawlerExecutionClient.execute
+ .mockReset()
+ .mockRejectedValueOnce(new Error('Crawler connection timeout'));
+
+ // 1. updateSchedulerRun → running
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages
+ mockListSchedulerStages([mockStage()]);
+ // 3. executeStage: createStageRun
+ mockCreateStageRun();
+ // 4. executeStage: crawler throws → updateStageRun with error
+ mockUpdateStageRun({ status: 'failed', error: 'Crawler connection timeout' });
+ // 5. catch: updateSchedulerRun → failed
+ mockUpdateSchedulerRun({ status: 'failed', error: 'Crawler connection timeout' });
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ expect(logger.error).toHaveBeenCalledWith(
+ 'Scheduler execution failed',
+ expect.any(Error),
+ expect.objectContaining({
+ function: 'executeScheduler',
+ metadata: { schedulerID: SCHEDULER_ID, runID: RUN_ID },
+ }),
+ );
+ });
+
+  // Even when the stage fails, the final PATCH on schedulers must carry a string last_run_at.
+  it('always updates scheduler.last_run_at in finally block', async () => {
+    const { dependencies, crawlerExecutionClient } = createDependencies();
+
+    crawlerExecutionClient.execute
+      .mockReset()
+      .mockRejectedValueOnce(new Error('Some error'));
+
+    // 1. updateSchedulerRun → running
+    mockUpdateSchedulerRun({ status: 'running' });
+    // 2. listSchedulerStages
+    mockListSchedulerStages([mockStage()]);
+    // 3. executeStage: createStageRun
+    mockCreateStageRun();
+    // 4. executeStage: crawler throws → updateStageRun with error
+    mockUpdateStageRun({ status: 'failed' });
+    // 5. catch: updateSchedulerRun → failed
+    mockUpdateSchedulerRun({ status: 'failed' });
+    // finally: updateScheduler → capture request body to verify last_run_at
+    let capturedBody: Record<string, unknown> | undefined;
+    fetchMock
+      .get('https://supabase.example.com')
+      .intercept({ path: /^\/rest\/v1\/schedulers/, method: 'PATCH' })
+      .reply(200, (request: { body: string }) => {
+        capturedBody = JSON.parse(request.body as string) as Record<string, unknown>;
+        // With reply(statusCode, callback) the callback's return value is the response
+        // body itself, so return the serialized row rather than a `{ data }` wrapper.
+        return JSON.stringify(mockSchedulerRow());
+      });
+
+    await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+    expect(capturedBody).toBeDefined();
+    expect(capturedBody!.last_run_at).toBeDefined();
+    expect(typeof capturedBody!.last_run_at).toBe('string');
+  });
+
+ // The failure-status PATCH itself answers 500; executeScheduler must still resolve and log both errors.
+ it('logs error but does not throw when run status update fails in catch block', async () => {
+ const { dependencies, crawlerExecutionClient, logger } = createDependencies();
+
+ crawlerExecutionClient.execute
+ .mockReset()
+ .mockRejectedValueOnce(new Error('Crawler failed'));
+
+ // 1. updateSchedulerRun → running
+ mockUpdateSchedulerRun({ status: 'running' });
+ // 2. listSchedulerStages
+ mockListSchedulerStages([mockStage()]);
+ // 3. executeStage: createStageRun
+ mockCreateStageRun();
+ // 4. executeStage: crawler throws → updateStageRun with error
+ mockUpdateStageRun({ status: 'failed' });
+ // 5. catch: updateSchedulerRun → HTTP 500 error
+ mockUpdateSchedulerRunError();
+ // finally: updateScheduler
+ mockUpdateScheduler();
+
+ // Should NOT throw even though the catch-block update failed
+ await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+ // Should log both the original error and the update failure
+ expect(logger.error).toHaveBeenCalledWith(
+ 'Scheduler execution failed',
+ expect.any(Error),
+ expect.objectContaining({ function: 'executeScheduler' }),
+ );
+ expect(logger.error).toHaveBeenCalledWith(
+ 'Failed to update run status after error',
+ expect.any(Error),
+ expect.objectContaining({ function: 'executeScheduler' }),
+ );
+ });
+
+  // Stage 1's output has no `links` field, so the run must fail before any stage-2
+  // stage run is created (no stage-2 interceptors are registered).
+  it('throws error when fan_out_field references a non-existent field', async () => {
+    const { dependencies, logger } = createDependencies([
+      { data: 'no-links-field' },
+    ]);
+
+    const stage1 = mockStage({ stage_order: 0 });
+    const stage2 = mockStage({
+      id: STAGE_ID_2,
+      stage_order: 1,
+      fan_out_field: 'links',
+      input_schema: {},
+    });
+
+    // 1. updateSchedulerRun → running
+    mockUpdateSchedulerRun({ status: 'running' });
+    // 2. listSchedulerStages
+    mockListSchedulerStages([stage1, stage2]);
+    // 3. stage1: createStageRun + updateStageRun
+    mockCreateStageRun();
+    mockUpdateStageRun({ status: 'completed' });
+    // 4. fan_out_field 'links' not found in input → throws
+    // 5. catch: updateSchedulerRun → failed
+    mockUpdateSchedulerRun({ status: 'failed' });
+    // finally: updateScheduler
+    mockUpdateScheduler();
+
+    await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+    expect(logger.error).toHaveBeenCalledWith(
+      'Scheduler execution failed',
+      expect.objectContaining({
+        message: expect.stringContaining('fan_out_field "links" not found in input'),
+      }),
+      expect.objectContaining({ function: 'executeScheduler' }),
+    );
+  });
+
+  // Stage 1's `links` is a string rather than an array, so the run must fail before
+  // any stage-2 stage run is created (no stage-2 interceptors are registered).
+  it('throws error when fan_out_field references a non-array value', async () => {
+    const { dependencies, logger } = createDependencies([
+      { links: 'not-an-array' },
+    ]);
+
+    const stage1 = mockStage({ stage_order: 0 });
+    const stage2 = mockStage({
+      id: STAGE_ID_2,
+      stage_order: 1,
+      fan_out_field: 'links',
+      input_schema: {},
+    });
+
+    // 1. updateSchedulerRun → running
+    mockUpdateSchedulerRun({ status: 'running' });
+    // 2. listSchedulerStages
+    mockListSchedulerStages([stage1, stage2]);
+    // 3. stage1: createStageRun + updateStageRun
+    mockCreateStageRun();
+    mockUpdateStageRun({ status: 'completed' });
+    // 4. fan_out_field 'links' is not array → throws
+    // 5. catch: updateSchedulerRun → failed
+    mockUpdateSchedulerRun({ status: 'failed' });
+    // finally: updateScheduler
+    mockUpdateScheduler();
+
+    await executeScheduler(dependencies, SCHEDULER_ID, USER_UUID, RUN_ID);
+
+    expect(logger.error).toHaveBeenCalledWith(
+      'Scheduler execution failed',
+      expect.objectContaining({
+        message: expect.stringContaining('fan_out_field "links" is not an array'),
+      }),
+      expect.objectContaining({ function: 'executeScheduler' }),
+    );
+  });
+});
diff --git a/workers/scheduler-manager-worker/tests/stage-runner.test.ts b/workers/scheduler-manager-worker/tests/stage-runner.test.ts
new file mode 100644
index 0000000..09aa840
--- /dev/null
+++ b/workers/scheduler-manager-worker/tests/stage-runner.test.ts
@@ -0,0 +1,419 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { env, fetchMock } from 'cloudflare:test';
+import { createSupabaseClient } from '@audio-underview/supabase-connector';
+import type { SchedulerStageRow } from '@audio-underview/supabase-connector';
+import type { CrawlerExecutionClient } from '../sources/crawler-execution-client.ts';
+import type { Logger } from '@audio-underview/logger';
+import {
+ resolveDefaultInput,
+ executeStage,
+ executeFanOut,
+} from '../sources/stage-runner.ts';
+import type { StageRunnerDependencies } from '../sources/stage-runner.ts';
+
+// Fixed UUID fixtures referenced by the row factories and test cases in this file.
+const MOCK_SCHEDULER_ID = '00000000-0000-0000-0000-000000000010';
+const MOCK_STAGE_ID = '00000000-0000-0000-0000-000000000020';
+const MOCK_CRAWLER_ID = '00000000-0000-0000-0000-000000000030';
+const MOCK_RUN_ID = '00000000-0000-0000-0000-000000000040';
+const MOCK_STAGE_RUN_ID = '00000000-0000-0000-0000-000000000050';
+
+/**
+ * Builds a scheduler stage fixture: stage_order 0, no fan-out, and an input_schema
+ * that declares a default URL. Fields in `overrides` replace the defaults.
+ */
+function mockStage(overrides: Partial<SchedulerStageRow> = {}): SchedulerStageRow {
+  return {
+    id: MOCK_STAGE_ID,
+    scheduler_id: MOCK_SCHEDULER_ID,
+    crawler_id: MOCK_CRAWLER_ID,
+    stage_order: 0,
+    input_schema: { url: { type: 'string', default: 'https://example.com' } },
+    output_schema: {},
+    fan_out_field: null,
+    fan_out_strategy: 'compact',
+    created_at: '2026-01-01T00:00:00Z',
+    ...overrides,
+  } as SchedulerStageRow;
+}
+
+/** Builds a scheduler_stage_runs row fixture in the 'running' state with empty input, output and item counters; `overrides` replace individual fields. */
+function mockStageRunRow(overrides: Record<string, unknown> = {}) {
+  return {
+    id: MOCK_STAGE_RUN_ID,
+    run_id: MOCK_RUN_ID,
+    stage_id: MOCK_STAGE_ID,
+    stage_order: 0,
+    status: 'running',
+    started_at: '2026-01-01T00:00:00Z',
+    completed_at: null,
+    input: null,
+    output: null,
+    error: null,
+    items_total: null,
+    items_succeeded: null,
+    items_failed: null,
+    created_at: '2026-01-01T00:00:00Z',
+    ...overrides,
+  };
+}
+
+/** Registers an interceptor answering a POST on scheduler_stage_runs with a stage-run row built from `overrides` (status 201). */
+function mockSupabaseStageRunCreate(overrides: Record<string, unknown> = {}) {
+  fetchMock
+    .get('https://supabase.example.com')
+    .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method: 'POST' })
+    .reply(201, JSON.stringify(mockStageRunRow(overrides)));
+}
+
+/** Registers an interceptor answering a PATCH on scheduler_stage_runs with a stage-run row built from `overrides` (status 200). */
+function mockSupabaseStageRunUpdate(overrides: Record<string, unknown> = {}) {
+  fetchMock
+    .get('https://supabase.example.com')
+    .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method: 'PATCH' })
+    .reply(200, JSON.stringify(mockStageRunRow(overrides)));
+}
+
+/** Creates a crawler execution client whose `execute` is a bare vi.fn spy; each test configures its behaviour. */
+function createMockCrawlerExecutionClient(): CrawlerExecutionClient & {
+  execute: ReturnType<typeof vi.fn>;
+} {
+  return {
+    execute: vi.fn(),
+  };
+}
+
+/**
+ * Creates a Logger stand-in whose info/warn/error/debug methods are vi.fn spies
+ * and whose createChild returns the same stub.
+ */
+function createMockLogger(): Logger {
+  const [info, warn, error, debug] = [vi.fn(), vi.fn(), vi.fn(), vi.fn()];
+  const createChild = vi.fn().mockReturnThis();
+  const stub = { info, warn, error, debug, createChild };
+  return stub as unknown as Logger;
+}
+
+/**
+ * Assembles the stage-runner dependencies for a test: a Supabase client built from the
+ * test environment, the given crawler client (or a fresh spy client when omitted) and a
+ * spy logger. The two mocks are also returned separately so tests can reach their spy APIs.
+ */
+function createDependencies(
+  crawlerExecutionClient?: CrawlerExecutionClient,
+): {
+  dependencies: StageRunnerDependencies;
+  crawlerExecutionClient: ReturnType<typeof createMockCrawlerExecutionClient>;
+  logger: ReturnType<typeof createMockLogger>;
+} {
+  const client =
+    (crawlerExecutionClient as ReturnType<typeof createMockCrawlerExecutionClient>)
+    ?? createMockCrawlerExecutionClient();
+  const logger = createMockLogger();
+  const supabaseClient = createSupabaseClient({
+    supabaseURL: env.SUPABASE_URL,
+    supabaseSecretKey: env.SUPABASE_SECRET_KEY,
+  });
+  return {
+    dependencies: { supabaseClient, crawlerExecutionClient: client, logger },
+    crawlerExecutionClient: client,
+    logger,
+  };
+}
+
+// Before every test: switch fetchMock on and disallow unmocked network connections.
+beforeEach(() => {
+ fetchMock.activate();
+ fetchMock.disableNetConnect();
+});
+
+// After every test: switch fetchMock off again.
+afterEach(() => {
+ fetchMock.deactivate();
+});
+
+// resolveDefaultInput turns a descriptor-style schema ({ field: { type, default } })
+// into an input object holding only the declared defaults.
+describe('resolveDefaultInput', () => {
+  it('extracts default values from descriptor format', () => {
+    const schema = {
+      url: { type: 'string', default: 'https://example.com' },
+      count: { type: 'number', default: 10 },
+    };
+    const result = resolveDefaultInput(schema);
+    expect(result).toEqual({
+      url: 'https://example.com',
+      count: 10,
+    });
+  });
+
+  it('skips fields without default', () => {
+    const schema = {
+      url: { type: 'string', default: 'https://example.com' },
+      query: { type: 'string' },
+    };
+    const result = resolveDefaultInput(schema);
+    expect(result).toEqual({ url: 'https://example.com' });
+    expect(result).not.toHaveProperty('query');
+  });
+
+  it('returns empty object for schema with no defaults', () => {
+    const schema = {
+      url: { type: 'string' },
+      query: { type: 'string' },
+    };
+    const result = resolveDefaultInput(schema);
+    expect(result).toEqual({});
+  });
+
+  it('handles empty schema', () => {
+    const result = resolveDefaultInput({});
+    expect(result).toEqual({});
+  });
+
+  it('ignores non-object field values', () => {
+    const schema = {
+      url: 'not-an-object',
+      count: 42,
+      flag: null,
+    };
+    // NOTE(review): the cast's type arguments were missing in this patch text;
+    // Record<string, unknown> is assumed — confirm against resolveDefaultInput's parameter type.
+    const result = resolveDefaultInput(schema as Record<string, unknown>);
+    expect(result).toEqual({});
+  });
+
+  // A declared default of null or false is still a default and must be kept.
+  it('handles default value of null', () => {
+    const schema = {
+      optional: { type: 'string', default: null },
+    };
+    const result = resolveDefaultInput(schema);
+    expect(result).toEqual({ optional: null });
+  });
+
+  it('handles default value of false', () => {
+    const schema = {
+      enabled: { type: 'boolean', default: false },
+    };
+    const result = resolveDefaultInput(schema);
+    expect(result).toEqual({ enabled: false });
+  });
+});
+
+describe('executeStage', () => {
+ it('calls crawlerExecutionClient.execute and returns output with stageRun', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+ const input = { url: 'https://example.com' };
+ const crawlerResult = { items: [1, 2, 3] };
+
+ crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: crawlerResult });
+
+ mockSupabaseStageRunCreate();
+ mockSupabaseStageRunUpdate({ status: 'completed', output: crawlerResult });
+
+ const result = await executeStage(dependencies, MOCK_RUN_ID, stage, input);
+
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledWith(MOCK_CRAWLER_ID, input);
+ expect(result.output).toEqual(crawlerResult);
+ expect(result.stageRun.status).toBe('completed');
+ });
+
+ it('creates stage_run with status running then updates to completed', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+ const crawlerResult = { data: 'test' };
+
+ crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: crawlerResult });
+
+ mockSupabaseStageRunCreate({ status: 'running' });
+ mockSupabaseStageRunUpdate({ status: 'completed', output: crawlerResult });
+
+ const result = await executeStage(dependencies, MOCK_RUN_ID, stage, {});
+
+ expect(result.stageRun.id).toBe(MOCK_STAGE_RUN_ID);
+ expect(result.output).toEqual(crawlerResult);
+ });
+
+ it('updates stage_run to failed and re-throws on error', async () => {
+ const { dependencies, crawlerExecutionClient, logger } = createDependencies();
+ const stage = mockStage();
+ const executionError = new Error('Crawler execution failed');
+
+ crawlerExecutionClient.execute.mockRejectedValue(executionError);
+
+ mockSupabaseStageRunCreate({ status: 'running' });
+ mockSupabaseStageRunUpdate({ status: 'failed', error: 'Crawler execution failed' });
+
+ await expect(executeStage(dependencies, MOCK_RUN_ID, stage, {})).rejects.toThrow(
+ 'Crawler execution failed',
+ );
+
+ expect(logger.error).toHaveBeenCalledWith(
+ 'Stage execution failed',
+ executionError,
+ expect.objectContaining({
+ function: 'executeStage',
+ metadata: { stageID: MOCK_STAGE_ID, stageOrder: 0 },
+ }),
+ );
+ });
+
+ it('passes stage.crawler_id to crawlerExecutionClient.execute', async () => {
+ const customCrawlerID = '00000000-0000-0000-0000-999999999999';
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage({ crawler_id: customCrawlerID });
+
+ crawlerExecutionClient.execute.mockResolvedValue({ type: 'web', result: {} });
+
+ mockSupabaseStageRunCreate();
+ mockSupabaseStageRunUpdate({ status: 'completed' });
+
+ await executeStage(dependencies, MOCK_RUN_ID, stage, { key: 'value' });
+
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledWith(customCrawlerID, { key: 'value' });
+ });
+
+ it('falls back to original stageRun if update returns null', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+
+ crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: 'output' });
+
+ mockSupabaseStageRunCreate({ status: 'running' });
+
+ // Simulate PGRST116 (no rows updated) → supabase returns 406
+ fetchMock
+ .get('https://supabase.example.com')
+ .intercept({ path: /^\/rest\/v1\/scheduler_stage_runs/, method: 'PATCH' })
+ .reply(406, JSON.stringify({
+ code: 'PGRST116',
+ details: 'The result contains 0 rows',
+ hint: null,
+ message: 'JSON object requested, multiple (or no) rows returned',
+ }));
+
+ const result = await executeStage(dependencies, MOCK_RUN_ID, stage, {});
+
+ // When updateSchedulerStageRun returns null, fallback to original stageRun
+ expect(result.stageRun.id).toBe(MOCK_STAGE_RUN_ID);
+ expect(result.stageRun.status).toBe('running');
+ });
+});
+
+describe('executeFanOut', () => {
+ it('executes each item sequentially and returns all results', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+ const items = [{ url: 'a' }, { url: 'b' }, { url: 'c' }];
+
+ crawlerExecutionClient.execute
+ .mockResolvedValueOnce({ type: 'data', result: 'result-a' })
+ .mockResolvedValueOnce({ type: 'data', result: 'result-b' })
+ .mockResolvedValueOnce({ type: 'data', result: 'result-c' });
+
+ const result = await executeFanOut(dependencies, stage, items);
+
+ expect(result.results).toEqual(['result-a', 'result-b', 'result-c']);
+ expect(result.itemsTotal).toBe(3);
+ expect(result.itemsSucceeded).toBe(3);
+ expect(result.itemsFailed).toBe(0);
+ expect(result.status).toBe('completed');
+ });
+
+ it('returns completed status when all items succeed', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+
+ crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: 'ok' });
+
+ const result = await executeFanOut(dependencies, stage, [{ a: 1 }, { a: 2 }]);
+
+ expect(result.status).toBe('completed');
+ expect(result.itemsFailed).toBe(0);
+ expect(result.itemsSucceeded).toBe(2);
+ });
+
+ it('returns partially_failed when some items fail', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+
+ crawlerExecutionClient.execute
+ .mockResolvedValueOnce({ type: 'data', result: 'ok' })
+ .mockRejectedValueOnce(new Error('item failed'))
+ .mockResolvedValueOnce({ type: 'data', result: 'ok' });
+
+ const result = await executeFanOut(dependencies, stage, ['a', 'b', 'c']);
+
+ expect(result.status).toBe('partially_failed');
+ expect(result.itemsSucceeded).toBe(2);
+ expect(result.itemsFailed).toBe(1);
+ expect(result.itemsTotal).toBe(3);
+ expect(result.results).toEqual(['ok', 'ok']);
+ });
+
+ it('returns failed when all items fail', async () => {
+ const { dependencies, crawlerExecutionClient, logger } = createDependencies();
+ const stage = mockStage();
+
+ crawlerExecutionClient.execute
+ .mockRejectedValueOnce(new Error('fail-1'))
+ .mockRejectedValueOnce(new Error('fail-2'));
+
+ const result = await executeFanOut(dependencies, stage, ['a', 'b']);
+
+ expect(result.status).toBe('failed');
+ expect(result.itemsSucceeded).toBe(0);
+ expect(result.itemsFailed).toBe(2);
+ expect(result.itemsTotal).toBe(2);
+ expect(result.results).toEqual([]);
+ });
+
+ it('preserves null results from successful crawlers', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+
+ crawlerExecutionClient.execute
+ .mockResolvedValueOnce({ type: 'web', result: 'first' })
+ .mockResolvedValueOnce({ type: 'web', result: null })
+ .mockResolvedValueOnce({ type: 'web', result: 'third' });
+
+ const result = await executeFanOut(dependencies, stage, ['a', 'b', 'c']);
+
+ expect(result.status).toBe('completed');
+ expect(result.itemsSucceeded).toBe(3);
+ expect(result.results).toEqual(['first', null, 'third']);
+ });
+
+ it('handles empty items array', async () => {
+ const { dependencies } = createDependencies();
+ const stage = mockStage();
+
+ const result = await executeFanOut(dependencies, stage, []);
+
+ expect(result.results).toEqual([]);
+ expect(result.itemsTotal).toBe(0);
+ expect(result.itemsSucceeded).toBe(0);
+ expect(result.itemsFailed).toBe(0);
+ // With 0 failed items, status is 'completed'
+ expect(result.status).toBe('completed');
+ });
+
+ it('logs warning for each failed item', async () => {
+ const { dependencies, crawlerExecutionClient, logger } = createDependencies();
+ const stage = mockStage();
+ const error1 = new Error('fail-1');
+ const error2 = new Error('fail-2');
+
+ crawlerExecutionClient.execute
+ .mockRejectedValueOnce(error1)
+ .mockRejectedValueOnce(error2);
+
+ await executeFanOut(dependencies, stage, ['a', 'b']);
+
+ expect(logger.warn).toHaveBeenCalledTimes(2);
+ expect(logger.warn).toHaveBeenCalledWith(
+ 'Fan-out item failed',
+ error1,
+ expect.objectContaining({
+ function: 'executeFanOut',
+ metadata: { stageID: MOCK_STAGE_ID, itemIndex: 0 },
+ }),
+ );
+ expect(logger.warn).toHaveBeenCalledWith(
+ 'Fan-out item failed',
+ error2,
+ expect.objectContaining({
+ function: 'executeFanOut',
+ metadata: { stageID: MOCK_STAGE_ID, itemIndex: 1 },
+ }),
+ );
+ });
+
+ it('calls execute with correct crawler_id for each item', async () => {
+ const { dependencies, crawlerExecutionClient } = createDependencies();
+ const stage = mockStage();
+
+ crawlerExecutionClient.execute.mockResolvedValue({ type: 'data', result: null });
+
+ const items = [{ url: 'x' }, { url: 'y' }];
+ await executeFanOut(dependencies, stage, items);
+
+ expect(crawlerExecutionClient.execute).toHaveBeenCalledTimes(2);
+ expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(1, MOCK_CRAWLER_ID, { url: 'x' });
+ expect(crawlerExecutionClient.execute).toHaveBeenNthCalledWith(2, MOCK_CRAWLER_ID, { url: 'y' });
+ });
+});
diff --git a/workers/scheduler-manager-worker/vitest.config.ts b/workers/scheduler-manager-worker/vitest.config.ts
index 08b2372..65a0dbe 100644
--- a/workers/scheduler-manager-worker/vitest.config.ts
+++ b/workers/scheduler-manager-worker/vitest.config.ts
@@ -13,6 +13,13 @@ export default defineWorkersConfig({
SUPABASE_SECRET_KEY: 'test-secret-key',
JWT_SECRET: 'test-jwt-secret-key-for-testing-only',
},
+ workers: [
+ {
+ name: 'audio-underview-crawler-manager-worker',
+ modules: true,
+ script: 'export default { async fetch() { return Response.json({ success: true, result: null }); } }',
+ },
+ ],
},
},
},
diff --git a/workers/scheduler-manager-worker/wrangler.toml b/workers/scheduler-manager-worker/wrangler.toml
index 65742d2..cbb218c 100644
--- a/workers/scheduler-manager-worker/wrangler.toml
+++ b/workers/scheduler-manager-worker/wrangler.toml
@@ -10,6 +10,10 @@ ALLOWED_ORIGINS = "http://localhost:5173,https://audio-underview.pages.dev"
# SUPABASE_SECRET_KEY
# JWT_SECRET
+[[services]]
+binding = "CRAWLER_MANAGER"
+service = "audio-underview-crawler-manager-worker"
+
[observability.logs]
enabled = true
head_sampling_rate = 1
diff --git a/workers/tools/sources/index.ts b/workers/tools/sources/index.ts
index 55c7a0b..0fb9e55 100644
--- a/workers/tools/sources/index.ts
+++ b/workers/tools/sources/index.ts
@@ -5,6 +5,11 @@ export type {
BaseEnvironment,
OAuthWorkerHandlers,
OAuthWorkerOptions,
+ CrawlerExecuteResult,
+} from './types.ts';
+
+export {
+ validateCrawlerExecuteResult,
} from './types.ts';
export {
diff --git a/workers/tools/sources/types.ts b/workers/tools/sources/types.ts
index 9bf12a2..982aeff 100644
--- a/workers/tools/sources/types.ts
+++ b/workers/tools/sources/types.ts
@@ -39,3 +39,22 @@ export interface OAuthWorkerOptions {
logger: Logger;
handlers: OAuthWorkerHandlers;
}
+
+export interface CrawlerExecuteResult {
+ type: 'web' | 'data';
+ result: unknown;
+}
+
+export function validateCrawlerExecuteResult(value: unknown): CrawlerExecuteResult {
+ if (value === null || value === undefined || typeof value !== 'object') {
+ throw new Error('Invalid CrawlerExecuteResult: expected object');
+ }
+ const record = value as Record<string, unknown>;
+ if (record.type !== 'web' && record.type !== 'data') {
+ throw new Error(`Invalid CrawlerExecuteResult: expected type 'web' or 'data', got '${String(record.type)}'`);
+ }
+ if (!('result' in record)) {
+ throw new Error('Invalid CrawlerExecuteResult: missing result field');
+ }
+ return { type: record.type, result: record.result };
+}