Skip to content

Commit 8594f08

Browse files
committed
Merge branch 'feature/monitoring-telemetry' into 'main'
feat: telemetry reporter for monitoring instances See merge request postgres-ai/postgresai!251
2 parents 5c3f7a1 + e6ba73a commit 8594f08

23 files changed

Lines changed: 1308 additions & 0 deletions

.gitlab-ci.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,24 @@ cli:node:tests:
331331
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
332332
- if: '$CI_COMMIT_BRANCH == "main"'
333333

334+
# CI job for the telemetry package: installs dependencies from the frozen
# lockfile, audits them, then runs the test suite and the typecheck.
telemetry:tests:
335+
stage: test
336+
image: oven/bun:1
337+
variables:
338+
GIT_STRATEGY: fetch
339+
before_script:
340+
- cd "$CI_PROJECT_DIR/telemetry" && bun install --frozen-lockfile
341+
script:
342+
# Audit first so a high-severity advisory fails the job before tests run.
- cd "$CI_PROJECT_DIR/telemetry" && bun audit --audit-level=high
343+
- cd "$CI_PROJECT_DIR/telemetry" && bun test && bun run typecheck
344+
# Runs on merge requests and on main, but only when files under telemetry/ change.
rules:
345+
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
346+
changes:
347+
- telemetry/**/*
348+
- if: '$CI_COMMIT_BRANCH == "main"'
349+
changes:
350+
- telemetry/**/*
351+
334352
# Validate helm chart on merge requests and main branch
335353
validate-helm-chart:
336354
stage: validate

telemetry/Dockerfile

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Stage 1 (deps): install dependencies exactly as pinned in bun.lock.
FROM oven/bun:1 AS deps
2+
WORKDIR /app
3+
COPY package.json bun.lock ./
4+
RUN bun install --frozen-lockfile
5+
6+
# Stage 2 (runtime): reuse node_modules from the deps stage and add the sources.
FROM oven/bun:1
7+
WORKDIR /app
8+
COPY --from=deps /app/node_modules ./node_modules
9+
COPY package.json tsconfig.json ./
10+
COPY bin ./bin
11+
COPY lib ./lib
12+
13+
# /proc and /var/lib/docker (or /) need to be visible from inside the container.
14+
# Compose / k8s manifest must mount:
15+
# - /proc -> /host/proc (readonly)
16+
# - / (or data vol) -> /host/disk (readonly)
17+
# - /var/run/docker.sock -> /var/run/docker.sock (so `docker ps` works)
18+
#
19+
# Mounting the docker socket grants root-equivalent access to the host. Where
20+
# possible, prefer a docker-socket proxy restricted to GET /containers/json.
21+
# See telemetry/README.md for the deployment threat model.
22+
# Default the meminfo and disk paths to the container-side mount points listed above.
ENV PGAI_TELEMETRY_MEMINFO_PATH=/host/proc/meminfo \
23+
PGAI_TELEMETRY_DISK_PATH=/host/disk
24+
25+
# Run the TypeScript entry point directly with bun.
CMD ["bun", "run", "./bin/telemetry.ts"]

telemetry/README.md

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# @postgresai/telemetry
2+
3+
Telemetry reporter for PostgresAI monitoring instances. A small TS+Bun
4+
service that runs on each monitoring host and posts an hourly
5+
mini-healthcheck to the platform.
6+
7+
## What it collects
8+
9+
Each tick gathers four signals and POSTs them to the platform:
10+
11+
| Signal | Source |
12+
|---|---|
13+
| OOM events in the lookback window | `journalctl -k --since "<lookback>"` |
14+
| Faulty containers (exited / dead / restarting / unhealthy) | `docker ps -a --format '{{json .}}'` |
15+
| Free RAM | `MemAvailable` from `/proc/meminfo` (falls back to `MemFree`) |
16+
| Free disk | `fs.statfs` on the configured mount |
17+
18+
The companion platform-side hypertable, RPC, alert evaluator, and
19+
dispatcher live in `postgres-ai/platform-all!365`.
20+
21+
## Configuration
22+
23+
| Var | Required | Default |
24+
|---|---|---|
25+
| `PGAI_PLATFORM_API_URL` | yes ||
26+
| `PGAI_API_TOKEN` | yes ||
27+
| `PGAI_MONITORING_INSTANCE_ID` | yes ||
28+
| `PGAI_TELEMETRY_DISK_PATH` | no | `/` |
29+
| `PGAI_TELEMETRY_MEMINFO_PATH` | no | `/proc/meminfo` |
30+
| `PGAI_TELEMETRY_OOM_LOOKBACK` | no | `24 hours ago` |
31+
| `PGAI_TELEMETRY_INTERVAL_SEC` | no | `3600` (min `60`) |
32+
33+
`PGAI_API_TOKEN` is the same double-base64-encoded token already used for PostgresAI checkups.
34+
35+
## Build and run locally
36+
37+
```sh
38+
cd telemetry
39+
bun install --frozen-lockfile
40+
bun test
41+
bun run typecheck
42+
bun run start # actually starts reporting
43+
```
44+
45+
## Run in a container
46+
47+
```sh
48+
docker build -t postgresai-telemetry telemetry
49+
docker run --rm \
50+
-e PGAI_PLATFORM_API_URL=https://postgres.ai/api/v1 \
51+
-e PGAI_API_TOKEN=... \
52+
-e PGAI_MONITORING_INSTANCE_ID=... \
53+
--read-only \
54+
-v /proc:/host/proc:ro \
55+
-v /:/host/disk:ro \
56+
-v /var/run/docker.sock:/var/run/docker.sock \
57+
postgresai-telemetry
58+
```
59+
60+
## Deployment requirements
61+
62+
The agent must read host kernel logs, host memory, the host filesystem,
63+
and ask the local Docker daemon for its container list. Mount these:
64+
65+
| Host path | Container path | Mode |
66+
|---|---|---|
67+
| `/proc` | `/host/proc` | read-only |
68+
| `/` (or data volume) | `/host/disk` | read-only |
69+
| `/var/run/docker.sock` | `/var/run/docker.sock` | read-write |
70+
71+
## Threat model
72+
73+
Mounting `/var/run/docker.sock` is **root-equivalent on the host**.
74+
Anyone who execs into this container — or compromises any of its
75+
dependencies — can launch privileged containers and take over the
76+
monitoring host.
77+
78+
Recommended mitigations:
79+
80+
- Prefer a docker-socket proxy (e.g.
81+
[`tecnativa/docker-socket-proxy`](https://github.com/Tecnativa/docker-socket-proxy))
82+
restricted to `CONTAINERS=1` so only `GET /containers/json` is exposed.
83+
- Drop all Linux capabilities the agent doesn't need
84+
(`--cap-drop ALL --cap-add ...`).
85+
- Run as a non-root UID inside the container; the `oven/bun` base image
86+
ships a `bun` user.
87+
- `PGAI_TELEMETRY_MEMINFO_PATH` and `PGAI_TELEMETRY_DISK_PATH` are
88+
security-sensitive: an actor who can flip them can turn the heartbeat
89+
into an arbitrary-file-read primitive. Keep them under config-management
90+
control.
91+
92+
## API contract
93+
94+
`POST /rpc/monitoring_instance_telemetry_report` with a JSON body:
95+
96+
```json
97+
{
98+
"api_token": "<double-base64 token>",
99+
"instance_id": "<uuid>",
100+
"oom_count_24h": 0,
101+
"faulty_containers": ["cadvisor"],
102+
"free_ram_bytes": 8589934592,
103+
"free_disk_bytes": 100000000000,
104+
"metadata": { "collected_at": "2026-04-28T09:00:00.000Z" }
105+
}
106+
```
107+
108+
All `*_bytes` fields are byte counts. `metadata` is an open-ended
109+
JSON object that today carries `collected_at` (ISO 8601 UTC).
110+
111+
## Operational notes
112+
113+
- **Startup tick**: the agent reports once on startup, then sleeps
114+
`PGAI_TELEMETRY_INTERVAL_SEC` seconds after each tick completes before reporting again.
115+
- **Graceful shutdown**: SIGTERM / SIGINT cancel the in-flight sleep
116+
immediately. Shutdown latency is bounded by the current tick (not the
117+
interval).
118+
- **Per-collector failure isolation**: each of the four collectors logs
119+
a warning on failure and reports a safe default (`0` / `[]`) so a
120+
single broken signal doesn't silence the heartbeat.

telemetry/bin/telemetry.ts

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env bun
2+
/**
3+
* Entry point for the PostgresAI monitoring instance telemetry agent.
4+
*
5+
* Runs forever, reporting once on startup and then every intervalSec.
6+
* SIGTERM / SIGINT cancel the in-flight sleep and exit cleanly, so
7+
* shutdown latency is bounded by the current tick (not by intervalSec).
8+
*/
9+
10+
import { loadConfigFromEnv, ConfigError } from "../lib/config";
11+
import { collectSnapshot } from "../lib/collect";
12+
import { postReport } from "../lib/reporter";
13+
import type { ReporterConfig } from "../lib/types";
14+
15+
const RESPONSE_LOG_CAP = 512;
16+
17+
/**
 * Runs one reporting cycle: collects a snapshot and posts it to the platform.
 *
 * A failed post is logged to stderr (status, error, and a capped copy of the
 * response body) and the function returns normally; it does not throw for a
 * non-ok result. A successful post logs a one-line summary to stdout.
 *
 * The API token is scrubbed from the body *before* the body is truncated, so
 * a token that straddles the cap cannot survive as an unredacted fragment.
 * The error text is scrubbed the same way.
 *
 * @param config Reporter configuration; `apiToken` drives the redaction.
 */
export async function tick(config: ReporterConfig): Promise<void> {
  const snapshot = await collectSnapshot(config);
  const result = await postReport(config, snapshot);
  if (!result.ok) {
    // Guard the empty token: "abc".split("") would yield single characters
    // and the join would interleave "[REDACTED]" through the whole message.
    const redact = (text: string): string =>
      config.apiToken ? text.split(config.apiToken).join("[REDACTED]") : text;

    // Redact first, cap second — the reverse order can cut the token in half
    // and leave its prefix in the log.
    const redacted = redact(JSON.stringify(result.body ?? null));
    const safe =
      redacted.length > RESPONSE_LOG_CAP ? `${redacted.slice(0, RESPONSE_LOG_CAP)}…` : redacted;
    const err = redact(String(result.error ?? ""));

    console.error(`[telemetry] report failed status=${result.status} err=${err} body=${safe}`);
    return;
  }
  console.log(
    `[telemetry] reported snapshot oom=${snapshot.oomCount24h} faulty=${snapshot.faultyContainers.length} ram=${snapshot.freeRamBytes} disk=${snapshot.freeDiskBytes}`
  );
}
33+
34+
/**
 * Injectable collaborators for {@link runLoop}. Every member is optional;
 * `runLoop` substitutes its built-in default for any that is omitted.
 */
export interface RunLoopDeps {
  /** Performs one reporting cycle. */
  tickFn?: (config: ReporterConfig) => Promise<void>;
  /** Waits `ms` milliseconds; receives the loop's abort signal. */
  sleep?: (ms: number, signal: AbortSignal) => Promise<void>;
  /** Registers `handler` to be called when the loop should stop. */
  onSignal?: (handler: () => void) => void;
}
39+
40+
/**
 * Resolves after `ms` milliseconds, or as soon as `signal` aborts, whichever
 * happens first. Never rejects. If `signal` is already aborted the promise
 * resolves immediately and no timer is scheduled.
 *
 * When the timer wins, the abort listener is detached again. The same signal
 * is reused for every sleep of the run loop, so leaving the listener attached
 * would accumulate one dead closure per interval for the life of the process.
 *
 * @param ms Delay in milliseconds.
 * @param signal Abort signal that cuts the delay short.
 */
const defaultSleep = (ms: number, signal: AbortSignal): Promise<void> =>
  new Promise<void>((resolve) => {
    if (signal.aborted) return resolve();

    const onAbort = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Timer fired normally: detach so the listener does not outlive this sleep.
      signal.removeEventListener("abort", onAbort);
      resolve();
    }, ms);

    signal.addEventListener("abort", onAbort, { once: true });
  });
53+
54+
/**
 * Registers `handler` as a one-shot listener for SIGTERM and for SIGINT on
 * the current process.
 *
 * @param handler Callback invoked when either signal is delivered.
 */
const defaultOnSignal = (handler: () => void): void => {
  const shutdownSignals = ["SIGTERM", "SIGINT"] as const;
  for (const name of shutdownSignals) {
    process.once(name, handler);
  }
};
58+
59+
/**
 * Drives the agent: one tick immediately, then `sleep(intervalSec)` followed
 * by a tick, repeated until a stop is requested through `onSignal`.
 *
 * A stop request sets a flag and aborts the signal handed to `sleep`. The
 * flag is checked before each sleep and again after it, so no new tick is
 * started once a stop has been requested. A tick that is already running is
 * awaited to completion. Rejections from a tick are logged and do not end
 * the loop.
 *
 * @param config Reporter configuration; `intervalSec` is read on every iteration.
 * @param deps Optional replacements for the tick, sleep and signal hooks.
 */
export async function runLoop(config: ReporterConfig, deps: RunLoopDeps = {}): Promise<void> {
  const tickFn = deps.tickFn ?? tick;
  const sleep = deps.sleep ?? defaultSleep;
  const onSignal = deps.onSignal ?? defaultOnSignal;

  const controller = new AbortController();
  let stopRequested = false;
  onSignal(() => {
    stopRequested = true;
    controller.abort();
  });

  const logTickError = (err: unknown): void => {
    console.error(`[telemetry] tick error: ${err instanceof Error ? err.message : String(err)}`);
  };

  // Startup tick.
  await tickFn(config).catch(logTickError);

  for (;;) {
    if (stopRequested) return;
    await sleep(config.intervalSec * 1000, controller.signal);
    // The sleep may have ended because of the abort rather than the timeout.
    if (stopRequested) return;
    await tickFn(config).catch(logTickError);
  }
}
84+
85+
/**
 * Loads the configuration from the environment and runs the report loop.
 *
 * A `ConfigError` is reported on stderr and ends the process with exit code 2;
 * any other error raised while loading the configuration is rethrown.
 */
async function main(): Promise<void> {
  let config: ReporterConfig;
  try {
    config = loadConfigFromEnv();
  } catch (err) {
    // Anything that is not a configuration problem is left to the caller.
    if (!(err instanceof ConfigError)) {
      throw err;
    }
    console.error(`[telemetry] config error: ${err.message}`);
    process.exit(2);
  }

  await runLoop(config);
}
99+
100+
// Start the agent only when this file is the program entry point, not when
// it is imported. Any error escaping main() is logged (stack preferred over
// message) and the process exits with code 1.
if (import.meta.main) {
  main().catch((err: unknown) => {
    const detail = err instanceof Error ? (err.stack ?? err.message) : String(err);
    console.error(`[telemetry] fatal: ${detail}`);
    process.exit(1);
  });
}

telemetry/bun.lock

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

telemetry/lib/collect.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/**
2+
* High-level "run all collectors" helper. The entry script calls this on
3+
* each tick. Splitting this out keeps bin/telemetry.ts tiny and lets us
4+
* unit-test the orchestration with stubbed collectors.
5+
*/
6+
7+
import { collectOomCount } from "./collectors/oom";
8+
import { collectFaultyContainers } from "./collectors/containers";
9+
import { collectFreeRamBytes } from "./collectors/memory";
10+
import { collectFreeDiskBytes } from "./collectors/disk";
11+
import type { ReporterConfig, TelemetrySnapshot } from "./types";
12+
13+
/**
 * Optional stand-ins for the individual collectors used by
 * `collectSnapshot`. Any member left out falls back to the real collector.
 */
export interface CollectorOverrides {
  /** Supplies the value reported as `oomCount24h`. */
  oom?: () => Promise<number>;
  /** Supplies the value reported as `faultyContainers`. */
  containers?: () => Promise<string[]>;
  /** Supplies the value reported as `freeRamBytes`. */
  memory?: () => Promise<number>;
  /** Supplies the value reported as `freeDiskBytes`. */
  disk?: () => Promise<number>;
}
19+
20+
/**
 * Runs the four collectors concurrently and assembles their results into a
 * single snapshot.
 *
 * Each collector is taken from `overrides` when present, otherwise the real
 * collector is called with the matching setting from `config`. The results
 * are awaited with `Promise.all`, so a rejection from any collector rejects
 * the returned promise. `metadata.collected_at` is stamped after all
 * collectors have finished.
 *
 * @param config Supplies `oomLookback`, `meminfoPath` and `diskPath`.
 * @param overrides Optional replacement collectors.
 * @returns The assembled telemetry snapshot.
 */
export async function collectSnapshot(
  config: ReporterConfig,
  overrides: CollectorOverrides = {}
): Promise<TelemetrySnapshot> {
  const runOom = overrides.oom ?? (() => collectOomCount({ lookback: config.oomLookback }));
  const runContainers = overrides.containers ?? (() => collectFaultyContainers());
  const runMemory =
    overrides.memory ?? (() => collectFreeRamBytes({ meminfoPath: config.meminfoPath }));
  const runDisk = overrides.disk ?? (() => collectFreeDiskBytes({ path: config.diskPath }));

  // Start every collector before awaiting any of them.
  const oomPending = runOom();
  const containersPending = runContainers();
  const memoryPending = runMemory();
  const diskPending = runDisk();

  const [oomEvents, faulty, ramBytes, diskBytes] = await Promise.all([
    oomPending,
    containersPending,
    memoryPending,
    diskPending,
  ]);

  const snapshot: TelemetrySnapshot = {
    oomCount24h: oomEvents,
    faultyContainers: faulty,
    freeRamBytes: ramBytes,
    freeDiskBytes: diskBytes,
    metadata: {
      collected_at: new Date().toISOString(),
    },
  };
  return snapshot;
}

0 commit comments

Comments
 (0)