diff --git a/docker-compose-build.yml b/docker-compose-build.yml index 1d635c0394..3a6cde112a 100644 --- a/docker-compose-build.yml +++ b/docker-compose-build.yml @@ -211,10 +211,19 @@ services: volumes: - ./policy-service/tls:/usr/local/app/tls:ro - ./policy-service/configs:/usr/local/app/configs:ro + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # - /var/run/docker.sock:/var/run/docker.sock:ro <<: *service-template expose: - '5006' + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # python-sandbox: + # build: + # context: ./policy-service/docker/python-sandbox + # dockerfile: Dockerfile + # image: guardian/python-sandbox:latest + prometheus: image: prom/prometheus:v2.44.0 volumes: diff --git a/docker-compose-production-build.yml b/docker-compose-production-build.yml index 2b6dd59b68..8569071a70 100644 --- a/docker-compose-production-build.yml +++ b/docker-compose-production-build.yml @@ -194,10 +194,19 @@ services: volumes: - ./policy-service/tls:/usr/local/app/tls:ro - ./policy-service/configs:/usr/local/app/configs:ro + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # - /var/run/docker.sock:/var/run/docker.sock:ro <<: *service-template expose: - '5006' + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # python-sandbox: + # build: + # context: ./policy-service/docker/python-sandbox + # dockerfile: Dockerfile + # image: guardian/python-sandbox:latest + prometheus: image: prom/prometheus:v2.44.0 volumes: diff --git a/docker-compose-production.yml b/docker-compose-production.yml index cd6bd3427e..d530b18351 100644 --- a/docker-compose-production.yml +++ b/docker-compose-production.yml @@ -177,10 +177,19 @@ services: volumes: - ./policy-service/tls:/usr/local/app/tls:ro - ./policy-service/configs:/usr/local/app/configs:ro + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # - /var/run/docker.sock:/var/run/docker.sock:ro <<: *service-template expose: - '5006' + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # python-sandbox: + # build: + # context: 
./policy-service/docker/python-sandbox + # dockerfile: Dockerfile + # image: guardian/python-sandbox:latest + prometheus: image: prom/prometheus:v2.44.0 volumes: diff --git a/docker-compose-quickstart.yml b/docker-compose-quickstart.yml index 7658f8b2c7..f5e902a7e0 100644 --- a/docker-compose-quickstart.yml +++ b/docker-compose-quickstart.yml @@ -133,10 +133,19 @@ services: volumes: - ./policy-service/tls:/usr/local/app/tls:ro - ./policy-service/configs:/usr/local/app/configs:ro + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # - /var/run/docker.sock:/var/run/docker.sock:ro <<: *service-template expose: - '5006' + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # python-sandbox: + # build: + # context: ./policy-service/docker/python-sandbox + # dockerfile: Dockerfile + # image: guardian/python-sandbox:latest + queue-service: image: gcr.io/hedera-registry/queue-service:${GUARDIAN_VERSION:-latest} depends_on: diff --git a/docker-compose.yml b/docker-compose.yml index 5a6a504bba..8980b5a606 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -198,10 +198,19 @@ services: volumes: - ./policy-service/tls:/usr/local/app/tls:ro - ./policy-service/configs:/usr/local/app/configs:ro + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # - /var/run/docker.sock:/var/run/docker.sock:ro <<: *service-template expose: - '5006' + # Uncomment for PYTHON_SANDBOX_MODE=docker: + # python-sandbox: + # build: + # context: ./policy-service/docker/python-sandbox + # dockerfile: Dockerfile + # image: guardian/python-sandbox:latest + prometheus: image: prom/prometheus:v2.44.0 volumes: diff --git a/docs/guardian/standard-registry/policies/python-implementation-in-guardian.md b/docs/guardian/standard-registry/policies/python-implementation-in-guardian.md index 32c2cebe80..70183fd1ca 100644 --- a/docs/guardian/standard-registry/policies/python-implementation-in-guardian.md +++ b/docs/guardian/standard-registry/policies/python-implementation-in-guardian.md @@ -25,7 +25,7 @@ A new dropdown 
setting has been added to the Custom Logic block in the Policy Ed #### Use Case -Choose "Python" when you want to leverage Python’s expressive syntax and advanced computation libraries for policy logic. +Choose "Python" when you want to leverage Python's expressive syntax and advanced computation libraries for policy logic. ### 2. Python Scripting Support @@ -69,23 +69,172 @@ This field helps track the Guardian system version that was used to generate or * Python execution is subject to the limitations and security constraints defined in Guardian's runtime. {% endhint %} -### 4. Supported Python Libraries and its Versions - -| Library Name | Version | -| :----------: | :-----: | -| numpy | 1.26.4 | -| scipy | 1.12.0 | -| sympy | 1.12 | -| pandas | 2.2.0 | -| pint | 0.25.1 | -| duckdb | 1.0.0 | -| sqlalchemy | 2.0.29 | -| cftime | 1.6.3 | -| matplotlib | 3.5.2 | -| seaborn | 0.13.2 | -| bokeh | 3.4.1 | -| altair | 5.3.0 | -| cartopy | 0.23.0 | -| astropy | 6.0.1 | -| statsmodels | 0.14.2 | -| networkx | 3.3 | +### 4. Supported Python Libraries + +#### Installed Libraries + +| Library Name | Import Name | Version | +| :----------: | :---------: | :-----: | +| numpy | `numpy` | 1.26.4 | +| scipy | `scipy` | 1.12.0 | +| sympy | `sympy` | 1.12 | +| pandas | `pandas` | 2.2.0 | +| pint | `pint` | 0.25.3 | +| cftime | `cftime` | 1.6.3 | +| astropy | `astropy` | 6.0.1 | +| statsmodels | `statsmodels` | 0.14.2 | +| networkx | `networkx` | 3.3 | +| scikit-learn | `sklearn` | 1.4.2 | +| xarray | `xarray` | 2024.3.0 | +| geopandas | `geopandas` | 0.14.3 | + +{% hint style="info" %} +Library versions listed are for the default Pyodide mode. Docker mode may have newer versions as it uses native CPython with pip. 
+{% endhint %} + +#### Docker-Only Libraries + +These libraries require native C/C++ dependencies (GDAL) and are only available in Docker mode: + +| Library Name | Import Name | Purpose | +| :----------: | :---------: | :------ | +| rasterio | `rasterio` | Read/write raster geospatial data (GeoTIFF, satellite imagery) | +| rioxarray | `rioxarray` | Bridge between xarray and rasterio — CRS management, reprojection | + +#### Python Built-in Modules (always available) + +| Module | Purpose | +| :----: | :------ | +| `calendar` | Calendar rendering, weekday calculations | +| `datetime` | Date/time types and arithmetic | +| `collections` | OrderedDict, Counter, defaultdict, namedtuple | +| `math` | Basic math functions (sin, log, sqrt, pi) | +| `copy` | Deep/shallow copy of objects | + +#### Available as Transitive Dependencies (no explicit install needed) + +| Library | Import Name | Purpose | Installed via | +| :-----: | :---------: | :------ | :------------ | +| python-dateutil | `dateutil` | Smart date parsing, relative deltas | pandas | +| six | `six` | Python 2/3 compatibility | pandas → python-dateutil | +| matplotlib | `matplotlib` | Data visualization | networkx (transitive) | + +#### Removed Libraries (Issue #5505) + +The following libraries were removed as part of sandbox hardening. They are unnecessary for computation — their data processing features are covered by pandas, and they were designed to work with external resources (databases, networks, web servers) that are not available in the sandbox. + +| Library | Reason for Removal | +| :-----: | :----------------- | +| duckdb | SQL database engine; covered by pandas | +| sqlalchemy | SQL toolkit/ORM; covered by pandas | +| bokeh | Visualization; unnecessary for computation | +| altair | Visualization; unnecessary for computation | +| cartopy | Map visualization; unnecessary for computation | +| seaborn | Visualization; unnecessary for computation | + +### 5. 
Execution Modes + +Guardian supports two execution modes for Python custom logic blocks, controlled by the `PYTHON_SANDBOX_MODE` environment variable. + +#### Pyodide Mode (default) + +The default mode runs Python code using Pyodide (CPython compiled to WebAssembly) inside a Node.js Worker Thread. + +* **No additional infrastructure required** — works out of the box +* **Startup:** packages are pre-cached at policy-service startup for faster execution +* **Limitation:** some C-extension packages (rasterio, rioxarray) are unavailable in WASM + +**Configuration:** No env var needed (default), or explicitly set `PYTHON_SANDBOX_MODE=pyodide` + +#### Docker Mode (experimental) + +Runs Python code in an ephemeral Docker container using native CPython 3.12. Provides OS-level isolation. + +**Container security flags:** + +| Flag | Purpose | +| :--- | :------ | +| `--network=none` | All network access blocked | +| `--cap-drop=ALL` | No Linux capabilities | +| `--security-opt=no-new-privileges` | Prevent privilege escalation | +| `--read-only` | Read-only root filesystem | +| `--user=1001:1001` | Non-root execution | +| `--log-driver=none` | No container log storage | +| `--pull=never` | Never pull untrusted images | +| `--tmpfs /tmp` | Writable scratch space (noexec, destroyed on exit) | + +**Setup:** + +1. Build the sandbox image: +```bash +docker buildx build -t guardian/python-sandbox:latest policy-service/docker/python-sandbox +``` +Or via docker-compose: +```bash +docker compose -f docker-compose-build.yml build python-sandbox +``` + +2. Set the environment variable in policy-service configuration: +``` +PYTHON_SANDBOX_MODE=docker +``` + +3. Ensure the policy-service container has Docker socket access. 
For docker-compose deployments, uncomment the Docker socket volume mount and the `python-sandbox` image build definition in the relevant compose file: + - `docker-compose-build.yml`, `docker-compose.yml`, `docker-compose-production.yml`, `docker-compose-production-build.yml`, `docker-compose-quickstart.yml` — uncomment the Docker socket volume and `python-sandbox` image build + +{% hint style="warning" %} +Docker mode requires the Docker daemon to be available. The policy-service needs access to the Docker socket to spawn sandbox containers. For production deployments, consider using a Docker API proxy to restrict operations to sandbox container management only. +{% endhint %} + +### 6. Sandbox Security + +Python code in custom logic blocks runs in a sandboxed environment. The following restrictions are enforced: + +#### Pyodide Mode Restrictions + +| Restriction | Details | +| :---------- | :----- | +| JavaScript bridge (`from js import ...`) | Blocked via module stub + import hook | +| `pyodide.http` network access | Blocked via module stub + import hook | +| `os.system`, `os.popen`, `os.exec*`, `os.spawn*` | All replaced with blocked function | +| `subprocess.run`, `subprocess.Popen` | All execution functions replaced | +| `socket.socket`, `socket.create_connection` | All networking functions replaced | +| `os.environ` (secrets) | Cleared on startup (only HOME/PATH kept) | +| `importlib.reload` | Blocked to prevent undoing patches | +| `builtins.__import__` | Guarded via closure to prevent bypass | +| Execution timeout | Configurable via `PYTHON_SANDBOX_TIMEOUT_MS` (default 120s) | + +#### Docker Mode Restrictions + +All restrictions above are provided by Docker container isolation: + +* **Network:** `--network=none` blocks all connections (verified: HTTP requests fail) +* **File system:** `--read-only` + no host mounts — container sees only its own minimal filesystem +* **Processes:** commands run inside isolated container only, destroyed after execution +* 
**Environment:** `os.environ` cleared before user code runs +* **Resources:** container destroyed with `--rm` after each execution + +#### Vulnerability Comparison + +| Attack Vector | Pyodide Mode | Docker Mode | +| :------------ | :----------- | :---------- | +| Network requests | Blocked (Python-level) | Blocked (OS-level `--network=none`) | +| Host filesystem access | Blocked (WASM virtual FS) | Blocked (`--read-only`, no mounts) | +| Process execution | Blocked (functions replaced) | Runs inside isolated container | +| `os.environ` secrets | Cleared | Cleared + container has own env | +| `ctypes` C function calls | Not blocked (needed by pandas, harmless in WASM) | Runs inside isolated container | +| Python introspection bypass | Possible (known limitation) | Irrelevant — container is isolated | +| Memory/CPU exhaustion | Timeout only | Timeout + container destroyed (no `--memory`/`--cpus`/`--pids-limit` limits are set, so usage is bounded only by the timeout) | + +{% hint style="info" %} +* **Pyodide mode** is suitable when users are trusted or semi-trusted. It blocks common attack vectors but is vulnerable to sophisticated Python introspection attacks. +* **Docker mode** is suitable for untrusted code. OS-level isolation makes Python-level bypasses irrelevant — the container has no network, no host access, and is destroyed after execution. +{% endhint %} + +### 7. 
Configuration Reference + +| Environment Variable | Default | Description | +| :------------------- | :------ | :---------- | +| `PYTHON_SANDBOX_MODE` | `pyodide` | Execution mode: `pyodide` (default) or `docker` | +| `PYTHON_SANDBOX_TIMEOUT_MS` | `120000` | Execution timeout in milliseconds (both modes) | +| `PYTHON_SANDBOX_IMAGE` | `guardian/python-sandbox:latest` | Docker sandbox image name (Docker mode only) | diff --git a/policy-service/Dockerfile b/policy-service/Dockerfile index 46140db132..98b82aaf4a 100644 --- a/policy-service/Dockerfile +++ b/policy-service/Dockerfile @@ -54,6 +54,9 @@ COPY --link --from=deps /usr/local/app/node_modules node_modules/ COPY --link --from=deps /usr/local/app/package.json ./ COPY --link --from=build /usr/local/app/dist dist/ +# Allow node user to write Pyodide package cache (warmup downloads wheels at startup) +RUN chown -R node:node node_modules/pyodide/ 2>/dev/null || true + # Change the user to node USER node diff --git a/policy-service/docker/python-sandbox/.dockerignore b/policy-service/docker/python-sandbox/.dockerignore new file mode 100644 index 0000000000..73c1852558 --- /dev/null +++ b/policy-service/docker/python-sandbox/.dockerignore @@ -0,0 +1,6 @@ +node_modules +npm-debug.log +.git +.gitignore +README.md +*.md diff --git a/policy-service/docker/python-sandbox/Dockerfile b/policy-service/docker/python-sandbox/Dockerfile new file mode 100644 index 0000000000..f217c05727 --- /dev/null +++ b/policy-service/docker/python-sandbox/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.12.10-slim + +WORKDIR /sandbox + +# Install system dependencies for geospatial libraries +RUN apt-get update && apt-get install -y --no-install-recommends \ + gdal-bin libgdal-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages (constrained to the version ranges in requirements.txt; not exact pins) +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy entrypoint +COPY entrypoint.py ./ + +# Create non-root user and fix 
permissions +RUN adduser --disabled-password --uid 1001 sandbox && chown -R sandbox:sandbox /sandbox +USER sandbox + +ENTRYPOINT ["python3", "entrypoint.py"] diff --git a/policy-service/docker/python-sandbox/entrypoint.py b/policy-service/docker/python-sandbox/entrypoint.py new file mode 100644 index 0000000000..55436ee1d5 --- /dev/null +++ b/policy-service/docker/python-sandbox/entrypoint.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +""" +CPython entrypoint for the Guardian Python sandbox container. +Receives JSON input via stdin, executes user Python code, sends results via stdout. +Protocol: newline-delimited JSON messages. +""" +import sys +import json +import os + + +def _json_serializer(obj): + """Handle non-JSON-serializable types (numpy, pandas, datetime, etc.).""" + try: + import numpy as np + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, (np.integer,)): + return int(obj) + if isinstance(obj, (np.floating,)): + # Handle NaN and Inf + val = float(obj) + if val != val: # NaN + return None + if val == float('inf') or val == float('-inf'): + return None + return val + if isinstance(obj, (np.bool_,)): + return bool(obj) + except ImportError: + pass + try: + import pandas as pd + if isinstance(obj, pd.DataFrame): + return obj.to_dict(orient='records') + if isinstance(obj, pd.Series): + return obj.tolist() + except ImportError: + pass + import datetime + if isinstance(obj, (datetime.datetime, datetime.date)): + return obj.isoformat() + if isinstance(obj, datetime.timedelta): + return obj.total_seconds() + if isinstance(obj, set): + return list(obj) + if isinstance(obj, bytes): + return obj.decode('utf-8', errors='replace') + return str(obj) + + +def send_message(msg): + """Send a JSON message to stdout (protocol line).""" + sys.stdout.write(json.dumps(msg, default=_json_serializer) + '\n') + sys.stdout.flush() + + +def build_table_helper(tables_pack): + """Port of buildTableHelper from table-field-core.ts.""" + + def 
is_plain_object(value): + return isinstance(value, dict) + + def is_table_value(value): + return is_plain_object(value) and value.get('type') == 'table' + + def empty_table(): + return {'type': 'table', 'columnKeys': [], 'rows': []} + + def to_object(value): + if value is None: + return empty_table() + if isinstance(value, str): + try: + return json.loads(value) + except (json.JSONDecodeError, ValueError): + return empty_table() + return value + + def normalize(value): + maybe_table = to_object(value) + if not is_table_value(maybe_table): + return empty_table() + if tables_pack and isinstance(maybe_table.get('fileId'), str): + packed = tables_pack.get(maybe_table['fileId']) + if packed: + return { + 'type': 'table', + 'columnKeys': packed.get('columnKeys', []) if isinstance(packed.get('columnKeys'), list) else [], + 'rows': packed.get('rows', []) if isinstance(packed.get('rows'), list) else [], + 'fileId': maybe_table['fileId'] + } + return { + 'type': 'table', + 'columnKeys': maybe_table.get('columnKeys', []) if isinstance(maybe_table.get('columnKeys'), list) else [], + 'rows': maybe_table.get('rows', []) if isinstance(maybe_table.get('rows'), list) else [], + 'fileId': maybe_table.get('fileId') if isinstance(maybe_table.get('fileId'), str) else None + } + + def get_column_keys(value): + table = normalize(value) + if table['columnKeys']: + return table['columnKeys'] + rows = table['rows'] + if rows: + return list(rows[0].keys()) + return [] + + def get_rows(value): + return normalize(value)['rows'] + + def get_column_key_by_index(value, index): + keys = get_column_keys(value) + if 0 <= index < len(keys): + return keys[index] + return '' + + def get_cell(value, row_index, key_or_index): + rows = get_rows(value) + if row_index < 0 or row_index >= len(rows): + return None + row = rows[row_index] + column_key = get_column_key_by_index(value, key_or_index) if isinstance(key_or_index, int) else key_or_index + return row.get(column_key) + + def to_number(value): + if 
isinstance(value, (int, float)): + return value if value == value else 0 # NaN check + if isinstance(value, str): + try: + return float(value.replace(',', '.')) + except (ValueError, TypeError): + return 0 + return 0 + + def get_column_values(value, key_or_index): + column_key = get_column_key_by_index(value, key_or_index) if isinstance(key_or_index, int) else key_or_index + return [row.get(column_key) for row in get_rows(value)] + + class TableHelper: + """Wrapper to match the JS table helper interface.""" + pass + + helper = TableHelper() + helper.normalize = normalize + helper.keys = get_column_keys + helper.rows = get_rows + helper.cell = get_cell + helper.col = get_column_values + helper.num = to_number + return helper + + +def main(): + # Read JSON input from stdin + try: + raw = sys.stdin.read() + input_data = json.loads(raw) + except Exception as e: + send_message({'type': 'error', 'error': f'Failed to parse input: {e}'}) + sys.exit(1) + + exec_func = input_data.get('execFunc', '') + user = input_data.get('user', {}) + documents = input_data.get('documents', []) + artifacts = input_data.get('artifacts', []) + sources = input_data.get('sources', []) + tables_pack = input_data.get('tablesPack', {}) + + # Build callbacks + def done(result, final=True): + send_message({'type': 'done', 'result': result, 'final': final}) + + def debug(result): + send_message({'type': 'debug', 'result': result}) + + # Override print to send via JSON protocol + def sandbox_print(*args, **kwargs): + send_message({'type': 'stdout', 'message': ' '.join(str(a) for a in args)}) + + # Build table helper + table = build_table_helper(tables_pack) + + # Prepare user namespace + user_globals = { + '__builtins__': __builtins__, + 'user': user, + 'documents': documents, + 'artifacts': artifacts, + 'sources': sources, + 'done': done, + 'debug': debug, + 'print': sandbox_print, + 'table': table, + } + + # Clear sensitive env vars right before user code runs + # (after all library imports that 
may set their own vars) + keep_keys = {'HOME', 'PATH'} + for key in list(os.environ.keys()): + if key not in keep_keys: + del os.environ[key] + + # Execute user code + try: + exec(exec_func, user_globals) + except Exception as e: + send_message({'type': 'error', 'error': f'{type(e).__name__}: {e}'}) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/policy-service/docker/python-sandbox/requirements.txt b/policy-service/docker/python-sandbox/requirements.txt new file mode 100644 index 0000000000..3df98e8fa5 --- /dev/null +++ b/policy-service/docker/python-sandbox/requirements.txt @@ -0,0 +1,17 @@ +# Package list must match python-packages.json in policy-service/src/policy-engine/helpers/workers/ +# Common packages (same as Pyodide worker) +numpy>=1.26,<2 +scipy>=1.12,<2 +sympy>=1.12,<2 +pandas>=2.2,<3 +pint>=0.25,<1 +cftime>=1.6,<2 +astropy>=6.0,<7 +statsmodels>=0.14,<1 +networkx>=3.3,<4 +scikit-learn>=1.4,<2 +xarray>=2024.3,<2025 +geopandas>=0.14,<1 +# CPython-only packages (not available in Pyodide/WASM) +rasterio>=1.3,<2 +rioxarray>=0.15,<1 diff --git a/policy-service/src/app.ts b/policy-service/src/app.ts index 2acd8dc10c..6f20a9a32b 100644 --- a/policy-service/src/app.ts +++ b/policy-service/src/app.ts @@ -7,6 +7,7 @@ import { MikroORM } from '@mikro-orm/core'; import { MongoDriver } from '@mikro-orm/mongodb'; import { DEFAULT_MONGO } from '#constants'; import { BlockTreeGenerator } from './policy-engine/block-tree-generator.js'; +import { warmupPyodideCache } from './policy-engine/helpers/workers/pyodide-warmup.js'; export const obj = {}; @@ -62,6 +63,9 @@ Promise.all([ await state.updateState(ApplicationStates.READY); startMetricsServer(); + + // Pre-cache Pyodide packages in background (non-blocking) + warmupPyodideCache().catch(() => { /* errors logged inside */ }); }, (reason) => { console.log(reason); process.exit(0); diff --git a/policy-service/src/policy-engine/blocks/custom-logic-block.ts 
b/policy-service/src/policy-engine/blocks/custom-logic-block.ts index 4616b14a4b..bd775f8dee 100644 --- a/policy-service/src/policy-engine/blocks/custom-logic-block.ts +++ b/policy-service/src/policy-engine/blocks/custom-logic-block.ts @@ -172,6 +172,19 @@ export class CustomLogicBlock { actionStatus: RecordActionStep ): Promise { return new Promise(async (resolve, reject) => { + let settled = false; + const safeResolve = (value: any) => { + if (!settled) { + settled = true; + resolve(value); + } + }; + const safeReject = (err: any) => { + if (!settled) { + settled = true; + reject(err); + } + }; try { const ref = PolicyComponentsUtils.GetBlockRef(this); let documents: IPolicyDocument[]; @@ -200,7 +213,7 @@ export class CustomLogicBlock { // } - resolve(null); + safeResolve(null); } return; } @@ -227,7 +240,7 @@ export class CustomLogicBlock { // } - resolve(items); + safeResolve(items); } return; } else { @@ -239,7 +252,7 @@ export class CustomLogicBlock { } catch { // } - resolve(item); + safeResolve(item); } return; } @@ -275,37 +288,91 @@ export class CustomLogicBlock { const expression = ref.options.expression || ''; if (ref.options.selectedScriptLanguage === ScriptLanguageOption.PYTHON) { - const worker = new Worker( - path.join(path.dirname(filename), '..', 'helpers', 'workers', 'custom-logic-python-worker.js'), - { - workerData: { - execFunc: `${execCode}${expression}`, - user, - artifacts, - documents: context.documents, - sources: context.sources, - tablesPack - }, - }); - worker.on('error', (error) => { - reject(error); - }); - worker.on('message', async (data) => { - if (data?.error) { - reject(new Error(data.error)); - return; - } + const pythonWorkerData = { + execFunc: `${execCode}${expression}`, + user, + artifacts, + documents: context.documents, + sources: context.sources, + tablesPack + }; + + const pythonTimeoutMs = parseInt(process.env.PYTHON_SANDBOX_TIMEOUT_MS || '120000', 10); + + if (process.env.PYTHON_SANDBOX_MODE === 'docker') { + const { 
runPythonInDocker } = await import('../helpers/workers/custom-logic-python-docker-worker.js'); try { - if (data?.type === 'done') { - await done(data.result, data.final); - } - if (data?.type === 'debug') { - ref.debug(data.message); + const pendingDones: Promise[] = []; + await runPythonInDocker(pythonWorkerData, { + onDone: (result, final) => { + pendingDones.push(done(result, final).catch(safeReject)); + }, + onDebug: (result) => { + ref.debug(result); + } + }); + // Wait for all done() calls to complete before resolving + if (pendingDones.length > 0) { + await Promise.all(pendingDones); + } else { + try { disposeTables(); } catch { /* */ } + safeResolve(null); } } catch (error) { - reject(error); + try { disposeTables(); } catch { /* */ } + safeReject(error); } - }); + } else { + const worker = new Worker( + path.join(path.dirname(filename), '..', 'helpers', 'workers', 'custom-logic-python-worker.js'), + { workerData: pythonWorkerData }); + + const pendingDones: Promise[] = []; + + // Timeout for Pyodide worker + const workerTimer = setTimeout(() => { + worker.terminate(); + safeReject(new Error('Python sandbox execution timed out')); + }, pythonTimeoutMs); + + worker.on('exit', async (code) => { + clearTimeout(workerTimer); + // Wait for all pending done() calls to finish + if (pendingDones.length > 0) { + try { await Promise.all(pendingDones); } catch { /* already handled */ } + } else { + try { disposeTables(); } catch { /* */ } + } + if (code !== 0 && code !== null) { + safeReject(new Error(`Python worker exited with code ${code}`)); + } else { + safeResolve(null); + } + }); + worker.on('error', (error) => { + clearTimeout(workerTimer); + try { disposeTables(); } catch { /* */ } + safeReject(error); + }); + worker.on('message', (data) => { + if (data?.error) { + clearTimeout(workerTimer); + safeReject(new Error(data.error)); + return; + } + try { + if (data?.type === 'done') { + pendingDones.push(done(data.result, data.final).catch(safeReject)); + } + if 
(data?.type === 'debug') { + ref.debug(data.result); + } + } catch (error) { + clearTimeout(workerTimer); + safeReject(error); + } + }); + } } else { const worker = new Worker( path.join(path.dirname(filename), '..', 'helpers', 'workers', 'custom-logic-worker.js'), @@ -336,7 +403,7 @@ export class CustomLogicBlock { }); } } catch (error) { - reject(error); + safeReject(error); } }); } diff --git a/policy-service/src/policy-engine/helpers/workers/custom-logic-python-docker-worker.ts b/policy-service/src/policy-engine/helpers/workers/custom-logic-python-docker-worker.ts new file mode 100644 index 0000000000..05b6c0fa59 --- /dev/null +++ b/policy-service/src/policy-engine/helpers/workers/custom-logic-python-docker-worker.ts @@ -0,0 +1,203 @@ +import { spawn } from 'node:child_process'; +import crypto from 'node:crypto'; + +interface WorkerData { + execFunc: string; + user: any; + documents: any[]; + artifacts: any[]; + sources: any[]; + tablesPack: Record; +} + +interface DockerCallbacks { + onDone: (result: any, final: boolean) => Promise | void; + onDebug: (result: any) => void; +} + +/** + * Run Python code in an isolated Docker container. + * + * Communicates via newline-delimited JSON over stdin/stdout. + * Resolves when the container exits cleanly. + * Rejects on timeout, container error, error JSON from container, or spawn failure. 
+ */ +export function runPythonInDocker( + workerData: WorkerData, + callbacks: DockerCallbacks +): Promise { + return new Promise((resolve, reject) => { + let payload: string; + try { + payload = JSON.stringify(workerData); + } catch (err) { + reject(new Error('Failed to serialize Python sandbox payload: ' + (err as Error).message)); + return; + } + + const image = process.env.PYTHON_SANDBOX_IMAGE || 'guardian/python-sandbox:latest'; + if (!/^[a-zA-Z0-9][a-zA-Z0-9._\-/]*:[a-zA-Z0-9._\-]+$/.test(image)) { + reject(new Error(`Invalid sandbox image name: ${image}`)); + return; + } + const timeoutMs = parseInt(process.env.PYTHON_SANDBOX_TIMEOUT_MS || '120000', 10); + const containerName = `python-sandbox-${crypto.randomUUID()}`; + + const args = [ + 'run', '--rm', '-i', + `--name=${containerName}`, + '--network=none', + '--cap-drop=ALL', + '--security-opt=no-new-privileges', + '--read-only', + '--user=1001:1001', + '--log-driver=none', + '--pull=never', + '--tmpfs', '/tmp:rw,noexec,nosuid,size=64m', + image + ]; + + const container = spawn('docker', args, { + stdio: ['pipe', 'pipe', 'pipe'] + }); + + let settled = false; + let stdoutBuffer = ''; + let timer: ReturnType | null = null; + + const forceRemoveContainer = () => { + // Fire-and-forget: use spawn instead of execFileSync to avoid blocking the event loop + try { + const rm = spawn('docker', ['rm', '-f', containerName], { + stdio: 'ignore' + }); + rm.on('error', () => { /* ignore */ }); + } catch { + // container may already be removed by --rm + } + }; + + const settle = (err?: Error) => { + if (settled) { + return; + } + settled = true; + if (timer !== null) { + clearTimeout(timer); + timer = null; + } + if (err) { + reject(err); + } else { + resolve(); + } + }; + + timer = setTimeout(() => { + forceRemoveContainer(); + settle(new Error('Python sandbox execution timed out')); + }, timeoutMs); + + // --- Write input data to container stdin --- + container.stdin.on('error', (err) => { + 
console.error('[python-sandbox] stdin pipe error:', err.message); + // Don't settle here — the close event will handle the exit. + // The container may already be dead, which is the most common cause. + }); + + container.stdin.write(payload, (writeErr) => { + if (writeErr) { + console.error('[python-sandbox] stdin write error:', writeErr.message); + return; + } + container.stdin.end(); + }); + + /** + * Process a single newline-delimited JSON line. + * Returns true if the caller should stop processing further lines (i.e. settled). + */ + const processLine = (line: string): boolean => { + if (!line.trim() || settled) { + return settled; + } + try { + const msg = JSON.parse(line); + switch (msg.type) { + case 'done': + if (!settled) callbacks.onDone(msg.result, msg.final); + break; + case 'debug': + if (!settled) callbacks.onDebug(msg.result); + break; + case 'error': + settle(new Error(msg.error || 'Unknown error from Python sandbox')); + return true; + case 'stdout': + console.log('[python-sandbox stdout]', msg.message); + break; + case 'stderr': + console.error('[python-sandbox]', msg.message); + break; + default: + console.warn('[python-sandbox] Unknown message type:', msg.type); + break; + } + } catch { + console.error('[python-sandbox] Malformed output:', line.slice(0, 200)); + } + return false; + }; + + // --- Parse stdout line by line (newline-delimited JSON) --- + container.stdout.on('data', (chunk: Buffer) => { + if (settled) { + return; + } + stdoutBuffer += chunk.toString(); + const lines = stdoutBuffer.split('\n'); + stdoutBuffer = lines.pop() || ''; + + for (const line of lines) { + if (settled) { + break; + } + processLine(line); + } + }); + + container.stderr.on('data', (chunk: Buffer) => { + console.error('[python-sandbox stderr]', chunk.toString()); + }); + + container.on('error', (err) => { + forceRemoveContainer(); + settle(new Error('Failed to start Docker sandbox: ' + err.message)); + }); + + container.on('close', (code) => { + // Process any 
remaining data in the buffer (including debug messages). + if (stdoutBuffer.trim()) { + const remainingLines = stdoutBuffer.split('\n'); + for (const line of remainingLines) { + if (settled) { + break; + } + processLine(line); + } + stdoutBuffer = ''; + } + + if (settled) { + return; + } + + if (code !== 0 && code !== null) { + settle(new Error(`Python sandbox exited with code ${code}`)); + } else { + // Container exited cleanly. Resolve even if no explicit 'done' message was sent. + settle(); + } + }); + }); +} diff --git a/policy-service/src/policy-engine/helpers/workers/custom-logic-python-worker.ts b/policy-service/src/policy-engine/helpers/workers/custom-logic-python-worker.ts index 7d33a2506d..78818a78a3 100644 --- a/policy-service/src/policy-engine/helpers/workers/custom-logic-python-worker.ts +++ b/policy-service/src/policy-engine/helpers/workers/custom-logic-python-worker.ts @@ -1,6 +1,7 @@ import { workerData, parentPort } from 'node:worker_threads'; import { loadPyodide } from 'pyodide' import { buildTableHelper } from '../table-field-core.js'; +import { PYTHON_PACKAGES } from './python-packages.js'; /** * Execute function @@ -43,7 +44,7 @@ async function execute() { const { execFunc, user, documents, artifacts, sources, tablesPack } = workerData; pyodide.setStdout({ batched: console.log }); - pyodide.setStderr({ batched: console.error }) + pyodide.setStderr({ batched: console.error }); pyodide.globals.set('user', user); pyodide.globals.set('documents', documents); @@ -58,37 +59,92 @@ async function execute() { await pyodide.loadPackage('micropip'); const micropip = pyodide.pyimport('micropip'); - const libs = [ - 'numpy', - 'scipy', - 'sympy', - 'pandas', - 'pint', - 'duckdb', - 'sqlalchemy', - 'cftime', - 'matplotlib', - 'seaborn', - 'bokeh', - 'altair', - 'cartopy', - 'astropy', - 'statsmodels', - 'networkx' - ]; + const libs: string[] = PYTHON_PACKAGES; for (const lib of libs) { try { await micropip.install(lib); } catch (e) { - console.error(`Failed 
to install python lib: ${lib}`, e); + console.error(`Failed to install python lib: ${lib}:`, e?.message || e); } } + await pyodide.runPythonAsync(` +import sys +import os +import importlib +import builtins + +def _blocked(*args, **kwargs): + raise PermissionError("This operation is restricted in this sandbox") + +# 1. Replace js module with restricted stub +class _RestrictedModule: + def __init__(self, name): + self._name = name + def __getattr__(self, attr): + raise ImportError(f"Access to {self._name}.{attr} is restricted in this sandbox") + +sys.modules['js'] = _RestrictedModule('js') +sys.modules['pyodide.http'] = _RestrictedModule('pyodide.http') + +# 2. Block dangerous os functions +for attr in ['system', 'popen', 'execl', 'execle', 'execlp', 'execv', 'execve', + 'execvp', 'execvpe', 'spawnl', 'spawnle', 'spawnlp', 'spawnv', + 'spawnve', 'spawnvp', 'spawnvpe']: + if hasattr(os, attr): + setattr(os, attr, _blocked) + +# 3. Block subprocess dangerous functions +import subprocess as _subprocess +for attr in ['run', 'call', 'check_call', 'check_output', 'Popen', 'getoutput', 'getstatusoutput']: + if hasattr(_subprocess, attr): + setattr(_subprocess, attr, _blocked) + +# 5. Block socket networking functions +import socket as _socket +for attr in ['socket', 'create_connection', 'create_server', 'getaddrinfo', 'gethostbyname', 'gethostbyaddr']: + if hasattr(_socket, attr): + setattr(_socket, attr, _blocked) + +# 6. Block importlib.reload to prevent undoing patches +import importlib as _importlib +_importlib.reload = _blocked + +# 7. 
Install import hook to prevent bypassing module restrictions (PEP 451) +from importlib.abc import MetaPathFinder + +_blocked_modules = {'js', 'pyodide.http', 'cffi', '_posixsubprocess'} + +class _SandboxImportBlocker(MetaPathFinder): + def find_spec(self, fullname, path, target=None): + if fullname in _blocked_modules or fullname.startswith(('js.', 'pyodide.http.', 'cffi.')): + raise ImportError(f"Import of {fullname} is restricted in this sandbox") + return None + +sys.meta_path.insert(0, _SandboxImportBlocker()) + +# 8. Guard builtins.__import__ against bypass (closure hides _original_import from __globals__) +def _make_guarded_import(): + _orig = builtins.__import__ + def _guarded_import(name, *args, **kwargs): + if name in _blocked_modules or any(name.startswith(prefix + '.') for prefix in ('js', 'pyodide.http', 'cffi')): + raise ImportError(f"Import of {name} is restricted in this sandbox") + return _orig(name, *args, **kwargs) + return _guarded_import +builtins.__import__ = _make_guarded_import() + +# 9. Clear os.environ last (after all library imports that may set their own vars) +_keep_keys = {'HOME', 'PATH'} +for key in list(os.environ.keys()): + if key not in _keep_keys: + del os.environ[key] +`); + try { await pyodide.runPythonAsync(execFunc); } catch (error) { - console.log('Failed to run python script:', error); + console.error('Failed to run python script:', error); parentPort?.postMessage({ error: error.message, final: true }); } } diff --git a/policy-service/src/policy-engine/helpers/workers/pyodide-warmup.ts b/policy-service/src/policy-engine/helpers/workers/pyodide-warmup.ts new file mode 100644 index 0000000000..618af29d45 --- /dev/null +++ b/policy-service/src/policy-engine/helpers/workers/pyodide-warmup.ts @@ -0,0 +1,38 @@ +import { PYTHON_PACKAGES } from './python-packages.js'; + +/** + * Pre-cache Pyodide packages at startup so Worker Threads + * don't need to download them on first execution. 
/**
 * Pre-cache Pyodide packages at startup so Worker Threads
 * don't need to download them on first execution.
 *
 * When `PYTHON_SANDBOX_MODE` is `docker` the function only logs that it is
 * skipping and returns. Otherwise it loads Pyodide, loads `micropip`, and
 * installs every entry of `PYTHON_PACKAGES` one at a time. A package that
 * fails to install is logged and skipped; a failure while loading Pyodide or
 * `micropip` is logged by the outer handler instead of being rethrown.
 *
 * @returns A promise that resolves (with no value) once the warm-up attempt
 *          has finished or has been skipped.
 */
export async function warmupPyodideCache(): Promise<void> {
    if (process.env.PYTHON_SANDBOX_MODE === 'docker') {
        console.log('[pyodide-warmup] Skipping — Docker mode enabled');
        return;
    }

    console.log('[pyodide-warmup] Pre-caching Python packages...');
    const start = Date.now();

    try {
        // Loaded lazily: the module is only imported when the warm-up actually runs.
        const { loadPyodide } = await import('pyodide');
        const pyodide = await loadPyodide();

        await pyodide.loadPackage('micropip');
        const micropip = pyodide.pyimport('micropip');

        // Install sequentially; one failing package must not abort the others.
        for (const lib of PYTHON_PACKAGES) {
            try {
                await micropip.install(lib);
            } catch (e: unknown) {
                console.error(`[pyodide-warmup] Failed to cache ${lib}:`, errorDetail(e));
            }
        }

        const elapsed = ((Date.now() - start) / 1000).toFixed(1);
        console.log(`[pyodide-warmup] Done in ${elapsed}s — packages cached for Worker Threads`);
    } catch (e: unknown) {
        console.error('[pyodide-warmup] Failed:', errorDetail(e));
    }
}

/**
 * Choose what to log for a caught value: its `message` when that is present
 * and non-empty, otherwise the caught value itself.
 */
function errorDetail(e: unknown): unknown {
    return (e as Error | null | undefined)?.message || e;
}

// ---- policy-service/src/policy-engine/helpers/workers/python-packages.ts (added as a new file by this patch) ----

/**
 * Shared Python package list for custom logic block.
 * Used by: custom-logic-python-worker.ts, pyodide-warmup.ts
 * Docker: requirements.txt must be kept in sync (includes cpython-only packages)
 */
export const PYTHON_PACKAGES: string[] = [
    'numpy',
    'scipy',
    'sympy',
    'pandas',
    'pint',
    'cftime',
    'astropy',
    'statsmodels',
    'networkx',
    'scikit-learn',
    'xarray',
    'geopandas'
];