diff --git a/.dockerignore b/.dockerignore index 9a73daf1..bc928902 100644 --- a/.dockerignore +++ b/.dockerignore @@ -8,6 +8,7 @@ config/db-config.conf certs/ # Runtime output dirs +docker/volumes/ logs/ stats/ diff --git a/.gitignore b/.gitignore index ad6ec5a5..64c9bcad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Compiled binaries bin/ -jetmon2 +/jetmon2 # Editor and OS files .DS_Store @@ -24,6 +24,7 @@ veriflier2/config/veriflier.json *.pb.go # Runtime output dirs +docker/volumes/ logs/*.log stats/* !logs/.gitkeep diff --git a/AGENTS.md b/AGENTS.md index 385a698d..2be63a6c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,39 +6,69 @@ You are an expert Go developer with extensive knowledge about WordPress, enterpr Jetmon is a parallel HTTP uptime monitoring service that checks Jetpack websites at scale. Jetmon 2 is a complete rewrite of the original Node.js + C++ native addon service into a single Go binary. It retains full drop-in compatibility with all external interfaces — MySQL schema, WPCOM API payload, StatsD metric names, and log file format — while dramatically increasing concurrency, reducing memory usage, and eliminating the native addon compilation dependency. -The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency. The protocol between Monitor and Verifliers is upgraded from custom HTTPS to gRPC. +The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency. JSON-over-HTTP on the configured Veriflier port is the v2 production Monitor-to-Veriflier transport; the proto contract is retained only as a schema reference for a possible future transport. See `PROJECT.md` for the full project description, feature list, and performance benefit estimates. ## Architecture ``` -┌───────────────────────────────────────────────────────┐ -│ jetmon2 (single binary) │ -│ │ -│ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ │ -│ │ Orchestrator│ │ Check Pool │ │ gRPC Server │ │ -│ │ goroutine │ │ (goroutines)│ │ (Veriflier) │ │ -│ └──────┬──────┘ └──────┬──────┘ └──────┬───────┘ │ -│ │ │ │ │ -│ ┌──────┴────────────────┴────────────────┴───────┐ │ -│ │ Internal channels │ │ -│ └────────────────────────────────────────────────┘ │ -└────────────┬──────────────────────────┬───────────────┘ +┌──────────────────────────────────────────────────────────────────────┐ +│ jetmon2 (single binary) │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ │ +│ │ Orchestrator│ │ Check Pool │ │ Veriflier │ │ +│ │ goroutine │ │ (goroutines)│ │ transport │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬───────┘ │ +│ │ │ │ │ +│ ┌──────┴────────────────┴────────────────┴───────┐ │ +│ │ Internal channels │ │ +│ └─────────────────────┬──────────────────────────┘ │ +│ │ │ +│ ┌────────────────────┴────────────────────┐ │ +│ │ eventstore (jetmon_events + │ │ +│ │ jetmon_event_transitions writes) │ │ +│ └────────────────────┬────────────────────┘ │ +│ │ │ +│ ┌────────────┐ ┌────┴────────────┐ ┌──────────────────────┐ │ +│ │ REST API │ │ Webhook │ │ Alerting │ │ +│ │ /api/v1/ │ │ delivery │ │ delivery │ │ +│ │ + auth + │ │ worker │ │ worker │ │ +│ │ ratelimit │ │ (HMAC POST) │ │ (email/PD/Slack/Tm) │ │ +│ └─────┬──────┘ └────────┬────────┘ └──────────┬───────────┘ │ +│ │ │ │ │ +│ ┌─────┴──────┐ ┌──────┴──────────┐ ┌────────┴──────────────┐ │ +│ │ Operator │ │ Webhook │ │ Alert contact │ │ +│ │ dashboard │ │ receivers │ │ destinations │ │ +│ │ (SSE) │ │ (HTTPS) │ │ (HTTPS / SMTP / API) │ │ +│ └────────────┘ └─────────────────┘ └───────────────────────┘ │ 
+└────────────┬──────────────────────────┬──────────────────────────────┘ │ │ MySQL WPCOM API - StatsD (unchanged) - Log files - (all unchanged) + StatsD (legacy notification path, + Log files still active alongside + alert contacts) ``` -**Orchestrator goroutine** (`internal/orchestrator/`): Fetches site batches from MySQL, dispatches work to the check pool via channels, processes results, manages the local retry queue, coordinates Veriflier confirmation requests, and sends WPCOM status-change notifications. Owns all DB access and all outbound WPCOM calls. +**Orchestrator goroutine** (`internal/orchestrator/`): Fetches site batches from MySQL, dispatches work to the check pool via channels, processes results, manages the local retry queue, coordinates Veriflier confirmation requests, and emits WPCOM legacy notifications. Owns all DB access for site state and writes events through `eventstore`. -**Check Pool** (`internal/checker/`): A bounded goroutine pool that performs HTTP checks using Go's `net/http` and `net/http/httptrace`. Records DNS, TCP connect, TLS handshake, and TTFB timings for every check. Pool size auto-scales against queue depth within configured min/max bounds. No process spawning — adding a worker is a channel send. +**Check Pool** (`internal/checker/`): A bounded goroutine pool that performs HTTP checks using Go's `net/http` and `net/http/httptrace`. Records DNS, TCP connect, TLS handshake, and TTFB timings for every check. Pool size auto-scales against queue depth within configured min/max bounds. -**Veriflier transport** (`internal/veriflier/`): JSON-over-HTTP client/server for Monitor↔Veriflier communication. Replaces the previous SSL server and custom HTTPS protocol. Run `make generate` to swap in generated gRPC stubs once protoc is set up. +**Eventstore** (`internal/eventstore/`): The single writer for `jetmon_events` and `jetmon_event_transitions`. Every status / severity / state change is written transactionally so the event row's projection and the transition log can never disagree. Both downstream workers (webhooks, alerting) consume `jetmon_event_transitions` via a high-water mark. -**Veriflier** (`veriflier2/`): Standalone Go binary deployed at remote locations. Receives check batches from the Monitor via gRPC, performs HTTP checks, and returns results. Replaces the Qt C++ Veriflier. +**REST API** (`internal/api/`): The internal API surface (`/api/v1/...`) used by the gateway, alerting workers, dashboards, and CI tooling. Per-consumer Bearer-token auth (`internal/apikeys/`), per-key rate limiting, Stripe-style idempotency keys on POSTs. Sites CRUD, events list / single / transitions, SLA stats, webhooks CRUD, alert-contacts CRUD, manual delivery retry. + +**Webhook delivery worker** (`internal/webhooks/`): Polls `jetmon_event_transitions`, matches each new transition against active webhooks (event-type + site + state filters), and POSTs HMAC-signed payloads to consumer URLs. Retry ladder 1m / 5m / 30m / 1h / 6h then abandon. Per-webhook in-flight cap and shared dispatch pool. + +**Alerting delivery worker** (`internal/alerting/`): Same shape as the webhook worker but for managed channels — email (via `wpcom`/`smtp`/`stub` senders), PagerDuty Events API v2, Slack incoming webhooks, Microsoft Teams. Filter is simpler (`site_filter` + `min_severity`); per-contact `max_per_hour` rate cap absorbs pager storms. Send-test endpoint exercises the same dispatch path without requiring a real event. 
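+
+Both delivery workers tail `jetmon_event_transitions` the same way: read forward from the remembered high-water mark, decide per transition whether a delivery is due, advance the mark. The loop below is a minimal sketch of that pattern only — the package, type, and function names, the batch size, and the query shape are assumptions for illustration, not the actual `internal/webhooks/` / `internal/alerting/` API.
+
+```go
+// Illustrative high-water-mark poller. The real pollers live in
+// internal/webhooks/ and internal/alerting/; names here are assumed.
+package delivery
+
+import (
+	"context"
+	"database/sql"
+	"time"
+)
+
+type transition struct {
+	ID      int64
+	EventID int64
+	// severity/state before and after, reason, metadata ... (elided)
+}
+
+// pollTransitions tails jetmon_event_transitions past lastSeen and hands each
+// new row to match, which applies the webhook / alert-contact filters and
+// enqueues a delivery row when they pass.
+func pollTransitions(ctx context.Context, db *sql.DB, lastSeen int64,
+	match func(transition)) error {
+	tick := time.NewTicker(time.Second) // 1s poll interval, per the design
+	defer tick.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-tick.C:
+		}
+		rows, err := db.QueryContext(ctx,
+			`SELECT id, event_id FROM jetmon_event_transitions
+			 WHERE id > ? ORDER BY id ASC LIMIT 500`, lastSeen) // batch size assumed
+		if err != nil {
+			return err
+		}
+		for rows.Next() {
+			var t transition
+			if err := rows.Scan(&t.ID, &t.EventID); err != nil {
+				rows.Close()
+				return err
+			}
+			match(t)        // filter check + enqueue a delivery row
+			lastSeen = t.ID // advance the high-water mark
+		}
+		rows.Close()
+		if err := rows.Err(); err != nil {
+			return err
+		}
+	}
+}
+```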
+ +**Current delivery-owner constraint:** In the single-binary v2 deployment, `API_PORT > 0` starts the API server and makes webhook / alert-contact delivery workers eligible to run. Delivery rows are claimed transactionally, so multiple active delivery workers do not claim the same pending row. Use `DELIVERY_OWNER_HOST` as a rollout guard when intentionally keeping delivery single-owner during migration from embedded to standalone delivery. + +**Veriflier transport** (`internal/veriflier/`): JSON-over-HTTP client/server for Monitor↔Veriflier communication. Replaces the previous SSL server and custom HTTPS protocol. This is the v2 production transport. + +**Veriflier** (`veriflier2/`): Standalone Go binary deployed at remote locations. Receives check batches from the Monitor, performs HTTP checks, and returns results. Replaces the Qt C++ Veriflier. + +**Future shape:** the API server, webhook worker, and alerting worker are independently scalable concerns and the natural target for the multi-binary split tracked in `ROADMAP.md`. Today they coexist in `jetmon2` and the MySQL schema is the bus between them; tomorrow the deliverer becomes its own binary handling all outbound dispatch (webhooks + alerting + WPCOM legacy migrated behind it). ## Key Files @@ -52,9 +82,17 @@ See `PROJECT.md` for the full project description, feature list, and performance | `internal/config/` | Config loading, SIGHUP hot-reload | | `internal/metrics/` | StatsD client, stats file writer | | `internal/wpcom/` | WPCOM API client, circuit breaker | -| `internal/audit/` | Audit log writes to `jetmon_audit_log` | +| `internal/audit/` | Operational log writes to `jetmon_audit_log` (WPCOM, retries, verifier RPCs, config reloads) | +| `internal/eventstore/` | Event-sourced site state — manages `jetmon_events` + `jetmon_event_transitions` writes in single transactions | +| `internal/api/` | Internal REST API server (`/api/v1/...`) — auth, rate limiting, idempotency, sites/events/SLA/webhooks/alert-contacts handlers | +| `internal/apikeys/` | API key registry, sha256-hashed at rest; `./jetmon2 keys` CLI | +| `internal/webhooks/` | Webhook registry + delivery worker — outbound HMAC-signed POSTs of event transitions, retry ladder 1m/5m/30m/1h/6h | +| `internal/alerting/` | Alert contact registry + delivery worker — managed channels (email/PagerDuty/Slack/Teams) with site_filter + severity gate + per-hour rate cap | | `internal/dashboard/` | Operator dashboard, SSE handler | | `veriflier2/` | Go Veriflier binary | +| `API.md` | Internal REST API reference (auth, all endpoints, payload shapes) | +| `ROADMAP.md` | Deferred features and architectural roadmap (multi-binary split, public-API path) | +| `docs/adr/` | Architecture Decision Records — load-bearing decisions ("why is X like this") with context, decision, and consequences | | `PROJECT.md` | Full project description and feature specification | ## Build and Run @@ -66,12 +104,16 @@ docker compose up --build # Rebuild binary and start docker compose down # Stop services docker compose down -v # Stop and remove volumes (fresh start) -# Build binary directly -go build ./cmd/jetmon2/ +# Build binaries directly +make all + +# Use a non-default Go binary when needed +make GO=/path/to/go all # Run tests -go test ./... -go test -race ./... 
+make test +make test-race +make lint # Run with race detector go run -race ./cmd/jetmon2/ @@ -84,6 +126,10 @@ go run -race ./cmd/jetmon2/ ./jetmon2 migrate ./jetmon2 status ./jetmon2 audit --blog-id 12345 --since 2h +./jetmon2 rollout pinned-check +./jetmon2 rollout dynamic-check +./jetmon2 rollout projection-drift +./jetmon2 site-tenants import --file site-tenants.csv --dry-run ./jetmon2 drain ./jetmon2 reload ``` @@ -104,7 +150,9 @@ Copy `config/config-sample.json` to `config/config.json`. All keys from the orig - `BUCKET_TOTAL`: Total bucket range (e.g. 1000); replaces static `BUCKET_NO_MIN/MAX` - `BUCKET_TARGET`: Maximum buckets this host should own - `BUCKET_HEARTBEAT_GRACE_SEC`: Seconds before an unresponsive host's buckets are reclaimed (suggested: 2× round time) +- `PINNED_BUCKET_MIN/MAX`: Migration-only static bucket range for replacing one v1 host with one v2 host; disables `jetmon_hosts` dynamic ownership while set. Legacy `BUCKET_NO_MIN/MAX` are accepted as aliases for this mode. - `ALERT_COOLDOWN_MINUTES`: Default cooldown between repeated alerts for the same site +- `LEGACY_STATUS_PROJECTION_ENABLE`: Keep v1 `site_status` / `last_status_change` projection updated during shadow-v2-state migration - `LOG_FORMAT`: `text` (default, drop-in compatible) or `json` (structured logging) - `DASHBOARD_PORT`: Internal port for the operator dashboard (0 to disable) - `DEBUG_PORT`: localhost-only pprof port, default 6060 (0 to disable; never exposed remotely) @@ -153,10 +201,17 @@ Every HTTPS check inspects `tls.ConnectionState` for: - Cipher suite — recorded in audit log **Downtime Verification:** -1. Local check fails → enter local retry queue -2. After `NUM_OF_CHECKS` local failures → dispatch to Verifliers +1. Local check fails → open a `Seems Down` event (severity 3) and enter the local retry queue. The event opens on the **first** failure so `started_at` reflects the actual incident start. Subsequent failures during retry are no-ops on the events table (idempotent dedup). +2. After `NUM_OF_CHECKS` local failures → dispatch to Verifliers (event stays Seems Down) 3. `PEER_OFFLINE_LIMIT` Veriflier agreements required to confirm -4. Confirmed down → WPCOM notification via same payload as original +4. Verifier outcomes: + - **Confirms** → Promote event to `Down` (severity 4) with `reason = verifier_confirmed`. WPCOM notification via same payload as original. + - **Disagrees** → Close event with `resolution_reason = false_alarm`. +5. Recovery (any successful probe while an event is open): + - From `Seems Down` → close with `resolution_reason = probe_cleared`. + - From `Down` → close with `resolution_reason = verifier_cleared` and send recovery notification. + +Shadow-v2-state migration keeps incidents authoritative in `jetmon_events` + `jetmon_event_transitions` while `jetpack_monitor_sites` remains the legacy site/config table. When `LEGACY_STATUS_PROJECTION_ENABLE` is true, the `jetpack_monitor_sites.site_status` / `last_status_change` projection is updated in the same transaction as every event mutation (no drift). v1 mapping: open Seems Down → `site_status = SITE_DOWN (0)`; promoted to Down → `site_status = SITE_CONFIRMED_DOWN (2)`; closed → `site_status = SITE_RUNNING (1)`. After legacy readers move to the v2 API/event tables, this projection can be disabled. **Alert Deduplication:** After an alert fires, subsequent alerts for the same site are suppressed for `alert_cooldown_minutes`. Suppression is recorded in the audit log. 
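+
+The v1 projection mapping above is small enough to sketch. Constant and function names here are assumptions; the real update runs inside the same `eventstore` transaction as the event mutation, and only while `LEGACY_STATUS_PROJECTION_ENABLE` is true.
+
+```go
+// Illustrative sketch of the v1 status projection described above.
+// Names are assumptions; the real write happens in internal/eventstore/.
+package eventstore
+
+const (
+	siteDown          = 0 // v1 SITE_DOWN — open Seems Down event
+	siteRunning       = 1 // v1 SITE_RUNNING — no open Seems Down/Down event
+	siteConfirmedDown = 2 // v1 SITE_CONFIRMED_DOWN — event promoted to Down
+)
+
+// legacySiteStatus maps the currently open event's state (if any) to the
+// legacy jetpack_monitor_sites.site_status value written in the same
+// transaction as the event mutation.
+func legacySiteStatus(openEventState string) int {
+	switch openEventState {
+	case "Seems Down":
+		return siteDown
+	case "Down":
+		return siteConfirmedDown
+	default: // event closed, or no open downtime incident
+		return siteRunning
+	}
+}
+```
+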
@@ -190,13 +245,15 @@ New tables introduced by Jetmon 2: | Table | Purpose | |-------|---------| | `jetmon_hosts` | MySQL-coordinated bucket ownership and heartbeat | -| `jetmon_audit_log` | Full event history per site | +| `jetmon_events` | Current state of every incident — one row per `(blog_id, endpoint_id, check_type, discriminator)` while open; mutable until `ended_at` is set, then frozen | +| `jetmon_event_transitions` | Append-only history of every mutation to `jetmon_events` (open, severity change, state change, cause link, close) | +| `jetmon_audit_log` | Operational trail — WPCOM notifications, retry dispatch, verifier RPCs, alert/maintenance suppression, config reloads. Site-state changes do **not** flow through here | | `jetmon_check_history` | RTT and timing samples for trending | | `jetmon_false_positives` | Veriflier non-confirmation events | ## Multi-Host Bucket Coordination -Jetmon 2 replaces static `BUCKET_NO_MIN/MAX` config with runtime bucket ownership via the `jetmon_hosts` table. On startup, each instance claims unclaimed or expired bucket ranges using `SELECT ... FOR UPDATE` transactions. A heartbeat query runs each round; hosts with stale heartbeats (older than `BUCKET_HEARTBEAT_GRACE_SEC`) have their buckets absorbed by surviving peers. On SIGINT, the instance releases its buckets immediately. +Jetmon 2 normally replaces static `BUCKET_NO_MIN/MAX` config with runtime bucket ownership via the `jetmon_hosts` table. On startup, each instance claims unclaimed or expired bucket ranges using `SELECT ... FOR UPDATE` transactions. A heartbeat query runs each round; hosts with stale heartbeats (older than `BUCKET_HEARTBEAT_GRACE_SEC`) have their buckets absorbed by surviving peers. On SIGINT, the instance releases its buckets immediately. During the initial v1-to-v2 migration only, `PINNED_BUCKET_MIN/MAX` (or legacy `BUCKET_NO_MIN/MAX`) can pin one v2 host to its v1 predecessor's exact bucket range and disables `jetmon_hosts` ownership for that host. This enables zero-config horizontal scaling (spin up a host, it claims buckets) and self-healing coverage (a failed host's buckets are absorbed within one grace period) without a cluster orchestrator. @@ -224,7 +281,9 @@ Rolling updates require no simultaneous restart of all hosts and leave no sites These decisions govern how Jetmon models site state. They must be maintained consistently across all changes. Full design rationale is in [`TAXONOMY.md`](TAXONOMY.md) (Parts 2–3) and [`EVENTS.md`](EVENTS.md). -**Events are the source of truth.** Site status is event-sourced. The event log is canonical; the site row stores a denormalized projection for read performance. Update both in the same transaction — they must not drift. If the projection is ever suspect, rebuild it from the log. +**Events are the source of truth.** Site status is event-sourced across two tables: `jetmon_events` (one row per incident, holding the current severity/state/metadata) and `jetmon_event_transitions` (append-only history of every mutation). The site row stores a denormalized projection for read performance. Update events, transitions, and the projection in the same transaction — they must not drift. If the projection is ever suspect, rebuild it from the events tables. + +**Every event mutation writes a transition row in the same transaction.** Open, severity bump, state change, cause-link change, close — no carve-outs. The `eventstore` package is the only writer for `jetmon_events` and `jetmon_event_transitions`; external callers must go through it. 
This keeps the invariant testable with one integration test surface. **Severity and state are separate fields.** Severity is numeric — use it for ordering, thresholds, and rollup. State is a human-readable label — use it for display and lifecycle transitions. A live event's severity can be updated in place without changing its state (a worsening degradation is not a new kind of problem). @@ -249,12 +308,14 @@ Up → Seems Down → Down → Resolved **Retry Queue Persistence:** The local retry queue must persist between rounds. Do not flush it at round start — a site must accumulate `NUM_OF_CHECKS` failures before Veriflier escalation, and flushing resets that counter, preventing downtime confirmation. -**Bucket Claiming Races:** The `SELECT ... FOR UPDATE` transaction on `jetmon_hosts` is the only safe way to claim buckets. Do not claim buckets outside a transaction — two hosts starting simultaneously will both see the same unclaimed range and must not both write it. +**Bucket Claiming Races:** When dynamic ownership is active, the `SELECT ... FOR UPDATE` transaction on `jetmon_hosts` is the only safe way to claim buckets. Do not claim buckets outside a transaction — two hosts starting simultaneously will both see the same unclaimed range and must not both write it. Pinned v1-to-v2 migration hosts intentionally do not claim buckets in `jetmon_hosts`. **Circuit Breaker Floor:** The WPCOM API circuit breaker queue is bounded. If the queue fills, the oldest pending notifications are dropped with an error log. Monitor the circuit breaker state in the operator dashboard during any WPCOM API incident. **Veriflier Quorum Floor:** When Verifliers are marked unhealthy and excluded, `PEER_OFFLINE_LIMIT` adjusts dynamically, but there is a configured floor to prevent a single healthy Veriflier from confirming downtime alone. Ensure the floor is set appropriately for the number of deployed Verifliers. +**Delivery Ownership During Rollout:** Webhook and alert-contact workers claim delivery rows transactionally. Use `DELIVERY_OWNER_HOST` when you want to keep only one delivery owner active per database cluster during migration from embedded `jetmon2` delivery to standalone `jetmon-deliverer`. + **Maintenance Windows:** Checks continue during a maintenance window and data is recorded in the audit log, but no alerts fire. Verify that `maintenance_end` is correctly set — an open-ended maintenance window silently suppresses all alerts for that site indefinitely. **Memory Pressure Drain:** If RSS exceeds the configured threshold, the goroutine pool shrinks by 10% via graceful drain. This reduces throughput temporarily. If memory pressure is sustained, investigate for goroutine leaks using the pprof endpoint at `http://localhost:/debug/pprof/` (localhost only) before increasing `WORKER_MAX_MEM_MB`. diff --git a/API.md b/API.md new file mode 100644 index 00000000..076f1837 --- /dev/null +++ b/API.md @@ -0,0 +1,1077 @@ +# Jetmon Internal API — Reference and Design Notes + +This document is the reference for Jetmon 2's internal REST API and the design notes behind it. The API server, Bearer-token auth, site/event/SLA endpoints, webhooks, alert contacts, idempotency handling, and delivery retry surfaces are implemented in `internal/api/`, `internal/apikeys/`, `internal/webhooks/`, and `internal/alerting/`. Sections that describe future expansion or deferred behavior call that out explicitly. + +**Audience: internal systems only.** Jetmon does not expose this API to end customers directly. 
A separate gateway service handles all customer-facing access — authentication, tenant isolation, customer rate limiting, plan-based feature gating, public error vocabulary, etc. — and calls Jetmon over this internal interface. Other internal services (operator dashboard, alerting workers, batch reporting jobs, the gateway itself) are the only direct callers. The gateway/tenant boundary and remaining public-exposure prerequisites are documented in [`docs/public-api-gateway-tenant-contract.md`](docs/public-api-gateway-tenant-contract.md). + +**Gateway tenant context.** Requests from the internal consumer named `gateway` +may include `X-Jetmon-Tenant-ID`, `X-Jetmon-Public-Scopes`, and +`X-Jetmon-Gateway-Request-ID` (plus optional actor/plan headers). Jetmon +rejects those headers from any other consumer. When accepted, the context is +recorded in API audit metadata and used to owner-scope webhook and alert-contact +CRUD, delivery history, manual delivery retry, and alert-contact send-test +routes. Site, event, SLA/stat, and trigger-now routes are scoped through the +`jetmon_site_tenants` mapping table. Normal internal callers that omit these +headers keep the unscoped operator behavior described below. + +This shapes several design choices: authentication is per-consumer rather than per-customer, scopes are coarse rather than granular, error messages are verbose rather than guarded, and key management is an ops-only concern rather than a self-service feature. The trust boundary is "is this a known internal system?", not "is this user allowed to see this site?". + +The goal is to expose Jetmon's distinctive data model — the five-layer test taxonomy, the site → endpoint → event hierarchy, the multi-state vocabulary, and the event-sourced architecture (`TAXONOMY.md`, `EVENTS.md`) — over a shape that internal consumers can integrate against confidently. We took inspiration from Better Stack, UptimeRobot v3, Pingdom, and Atlassian Statuspage but did not copy any of their shapes wholesale; Jetmon's richer model (multi-state, layered tests, causal links, separate severity) wouldn't fit cleanly into a flat "monitors" API. + +## Principles + +1. **Read API is source-of-truth, not just a snapshot.** Consumers should be able to ask "what is the current state of this site?" and "how did this incident evolve from severity 3 to 4 to closed?" with separate, narrow endpoints — not by polling a coarse "monitor" record. That's what the events/transitions tables exist for. + +2. **Severity and state are both first-class.** Many competitor APIs collapse to a single "status" string (UptimeRobot returns `up`/`down`; Better Stack adds `paused`/`maintenance`/`validating`). Jetmon exposes both: numeric severity for ordering, thresholds, and SLA math; human-readable state for display. They never disagree because they're stored as separate columns updated in lockstep. + +3. **Cursor pagination, never offset.** Offset pagination breaks under concurrent writes (an event closing during traversal shifts page boundaries). Cursors keyed on stable timestamps (`started_at`, `changed_at`) survive that. + +4. **Versioned URLs, conservative additions.** All endpoints under `/api/v1/`. New fields on existing responses are additive (consumers ignore unknowns); shape-breaking changes get `/api/v2/` and a deprecation window. Severity values 0–4 today, room to add new values up to 255 without a version bump. + +5. 
**No shape-shifting based on permissions.** A read-scope token sees the same JSON shape for `GET /api/v1/sites/{id}` as an admin token — fields aren't hidden, they're empty/null where data isn't applicable. Easier to test, easier to document. + +6. **Errors carry a stable code, a human message, and (when relevant) a reference id.** Consumers branch on the `code` field, not on parsing the message. + +7. **Bulk operations must be explicit when added.** v1 currently exposes single-resource write endpoints only. If bulk updates are added later, they should have dedicated request and response shapes instead of encouraging "list 10,000 sites and then loop one update at a time" client behavior. + +## Authentication + +**Per-consumer Bearer tokens.** Each calling system gets one (or more) tokens identifying it. The tokens are not user-delegated — there's no concept of "an end user authenticated via this token." A token *is* a service identity. + +``` +Authorization: Bearer jm_a1b2c3d4e5f6... +``` + +Tokens are 32-byte high-entropy random strings, sha256-hashed at rest (sha256 not bcrypt — bcrypt is for human-chosen passwords; high-entropy tokens just need a fast cryptographic hash). Stored in `jetmon_api_keys`: + +``` +jetmon_api_keys: + id BIGINT PK + key_hash CHAR(64) -- sha256 hex + consumer_name VARCHAR(128) -- e.g. "gateway", "alerts-worker", "dashboard" + scope ENUM('read','write','admin') + rate_limit_per_minute INT + expires_at TIMESTAMP NULL -- NULL = never + revoked_at TIMESTAMP NULL -- revoke time; future value = rotation grace window + last_used_at TIMESTAMP NULL + created_at TIMESTAMP + created_by VARCHAR(128) -- ops user / automation that created the key +``` + +**Scopes — three coarse buckets:** + +- `read` — every GET endpoint. +- `write` — every POST/PATCH/DELETE on sites, events, webhooks, and alert contacts. +- `admin` — write + ability to force operations like "recompute SLA from event log" or "close all events in maintenance mode." Reserved for ops tooling, not regular consumers. + +We deliberately did not split into `sites:read` / `events:read` / `webhooks:read` etc. Internal consumers tend to need the whole read surface — the gateway needs to read everything to mediate it; an alerts worker reads sites, events, *and* webhooks. Granular scopes would create more configuration burden than they solve. + +**Per-consumer audit logging.** Every authenticated request is logged to `jetmon_audit_log` with the consumer name, endpoint, status code, and latency. This is the load-bearing accountability mechanism — if "alerts-worker is hammering the trigger-now endpoint," that's visible in the audit log without parsing access logs. The audit log already exists for operational events (`EVENTS.md`); API access becomes another `event_type` value (`api_access`). + +**Key management is ops-only.** No `/api/v1/keys` endpoints. Keys are created and revoked via the `./jetmon2` CLI: + +``` +./jetmon2 keys create --consumer gateway --scope read [--expires 90d] +./jetmon2 keys list +./jetmon2 keys revoke +./jetmon2 keys rotate # creates a new key for the same consumer; revokes old after grace +``` + +The CLI talks to the database directly (via `jetmon_api_keys`), prints the new token once, and never exposes hashes. There's no self-service surface because there are no end customers — keys are infrastructure config, not user-managed credentials. + +`revoked_at` and `expires_at` are both half-open cutoffs: a key is valid for times strictly before the cutoff and rejected at or after it. 
During key rotation, the CLI may set `revoked_at` in the future so the old key remains valid for the grace window while consumers deploy the replacement. Immediate revocation sets `revoked_at` to the current time. + +**Single key format.** No live/test split. The token format is `jm_`. The gateway is responsible for any environment separation (dev/staging/prod) at its own layer. + +**Why not mTLS / IP allowlists alone?** Either could replace Bearer tokens for service-to-service auth, but tokens make per-consumer identity trivial to log and revoke. mTLS rotation is heavier; IP allowlists don't survive containerized deployments cleanly. Bearer tokens are the lowest-friction option that gives us per-consumer accountability. + +**Why not OAuth?** Same reasoning as before, now stronger: there are no user delegations to model. Every caller is a server. + +## Common patterns + +### Base URL and versioning + +``` +https://api.jetmon.example.com/api/v1 +``` + +Hosted in the `jetmon2` binary on a dedicated port (`API_PORT`), separate from the operator dashboard (`DASHBOARD_PORT`) and the Veriflier transport port (`VERIFLIER_PORT`). + +### Content negotiation + +`Content-Type: application/json` for both request and response. UTF-8. No XML, no form-encoded, no JSON-API envelope (Better Stack uses JSON:API; we don't because it adds an `attributes` indirection that obscures field names without buying us anything Jetmon-specific). + +### Response envelope + +Every list response wraps the data in a small envelope: + +```json +{ + "data": [ ... ], + "page": { + "next": "eyJzdGFydGVkX2F0IjoiMjAyNi0wNC0yMVQxNjo...", + "limit": 50 + } +} +``` + +Every single-resource response is just the resource: + +```json +{ + "id": 487291, + "blog_id": 12345, + ... +} +``` + +Reasoning: keeping list and single-resource shapes distinct means consumers don't write `if (Array.isArray(response.data))` everywhere. The list envelope holds pagination; the resource envelope is the resource. + +### Resource IDs + +All resource `id` fields are raw `BIGINT UNSIGNED` integers serialized as JSON numbers (not strings). Sites use the existing `blog_id`; events, transitions, webhooks, deliveries, and contacts use their respective table's auto-increment primary key. There is no type prefix or ULID encoding. + +Type context comes from the **endpoint path** (`/api/v1/sites/12345` vs `/api/v1/events/12345`) and from explicit `type` fields where ambiguity would otherwise hurt — for example, error messages always name the resource type: + +```json +{ "error": { "code": "event_not_found", "message": "Event 12345 does not exist", "request_id": "..." } } +``` + +Webhook payloads include `"type": "event.opened"` so the consumer never has to infer from a bare numeric id which table the id refers to. Operational/trace identifiers (request IDs, webhook delivery IDs, idempotency keys) follow their own conventions described in the relevant sections. + +### Pagination + +Cursor-based, opaque tokens. Each list endpoint accepts `?cursor=...&limit=N`. Default limit 50, max 200. + +``` +GET /api/v1/sites?cursor=eyJzdGFydGVkX2F0IjoiMjAyNi0wNC0yMVQxNjo...&limit=100 +``` + +The cursor is an opaque base64-encoded JSON of `{started_at, id}` (or `{changed_at, id}` for transition lists). Consumers shouldn't decode it; we reserve the right to change the encoding inside it. + +`page.next` is null on the last page. 
`page.prev` is intentionally not provided — most consumers walk forward, and offering prev would force us to support reverse iteration in indexes we don't currently have. + +### Filtering and sorting + +Most list endpoints accept filter query params. The convention: + +- Equality filters: `?state=Down&check_type=http` +- Range filters: `?started_at__gte=2026-04-01T00:00:00Z&started_at__lt=2026-05-01T00:00:00Z` +- Set filters: `?state__in=Down,Seems%20Down` + +Sorting is fixed per endpoint to one of two sensible defaults (newest-first for incidents, alphabetical for sites). We do not expose `?order_by=...` — letting consumers pick arbitrary sort columns means we have to maintain indexes for all of them. + +### Error model + +```json +{ + "error": { + "code": "site_not_found", + "message": "Site with id 12345 does not exist or is not visible to this token", + "request_id": "req_018f9a2c..." + } +} +``` + +Error `code` values are documented per endpoint and stable across versions. The `message` is for humans and may improve over time. `request_id` matches a server-side log line for support tickets. + +HTTP status codes used: + +- `200` — success +- `201` — resource created (CRUD POST) +- `204` — success, no body (DELETE) +- `400` — malformed request (bad JSON, invalid filter syntax, unknown field) +- `401` — missing or invalid token +- `403` — token valid but lacks required scope +- `404` — resource genuinely doesn't exist +- `409` — idempotent re-attempt with different body (state already different) +- `422` — semantic validation failure (e.g. invalid URL format) +- `429` — rate limit exceeded +- `500` — server error +- `503` — temporarily unavailable (DB down, etc.) + +403 vs 404 are honest here: a `read`-scope token hitting a `write`-only endpoint gets a real 403, not a 404. Internal consumers benefit from accurate semantics over the "hide existence" pattern public APIs use to avoid information leakage — and the gateway in front of Jetmon handles any customer-facing 403↔404 collapsing it wants. + +Error messages are verbose by design — for an internal API, "table 'jetmon_events' is locked, retry in 30s" beats "internal server error" by a wide margin during incident response. The gateway can sanitize before forwarding to customers. + +### Rate limiting + +Per-key bucket, configurable per consumer at key-creation time. The current implementation uses one in-memory bucket per key, sized by that key's `rate_limit_per_minute`. Defaults are 60 req/min for `read` and `admin`, and 30 req/min for `write`. Internal consumers usually need higher limits than the default — the gateway and dashboard might be set to 600 req/min, while a daily batch job stays at 60. + +Standard headers on every response: + +``` +X-RateLimit-Limit: 60 +X-RateLimit-Remaining: 47 +X-RateLimit-Reset: 1714685400 +``` + +`429` responses include `Retry-After` in seconds. + +This is service-protection rate limiting, not customer-fairness rate limiting — the gateway handles the latter. If trigger-now traffic needs a separate bucket later, add it as a route-specific extension rather than overloading the base per-key limit. + +### Idempotency + +POST endpoints that create, trigger, test, retry, rotate, or manually close resources accept an `Idempotency-Key` header. PATCH and DELETE endpoints are already idempotent on this schema and do not use the idempotency cache. The server stores `(token_id, idempotency_key) → response` for 24 hours. 
Replays with the same body return the cached response; replays with a different body return `409 idempotency_conflict`. + +This is the same pattern Stripe uses; it's the right call for monitor management where retries are common. + +### Time + +All timestamps are ISO 8601 with millisecond precision and `Z` suffix: + +``` +"started_at": "2026-04-25T03:18:38.329Z" +``` + +The server is always UTC. Clients converting to local time is their problem. + +--- + +## Status and state vocabulary + +The API exposes the same vocabulary the orchestrator and event store use. From `TAXONOMY.md` Part 3 and `EVENTS.md`: + +**State** (string, human-readable): + +| Value | Meaning | +|-------|---------| +| `Up` | All checks passing. | +| `Warning` | Something needs attention but isn't user-facing yet (cert expiring, version behind). | +| `Degraded` | Some checks failing or thresholds exceeded; site is serving content. | +| `Seems Down` | First failure detected, awaiting verifier confirmation. Transient. | +| `Down` | Confirmed failures on critical checks. | +| `Paused` | Monitoring suspended by user. | +| `Maintenance` | Scheduled maintenance window active. | +| `Unknown` | Monitor couldn't determine state (probe crashed, region offline, agent silent). | +| `Resolved` | (Events only) The condition cleared; event is closed. | + +**Severity** (integer 0–255, ordered): + +| Value | Default state mapping | +|-------|----------------------| +| 0 | Up | +| 1 | Warning | +| 2 | Degraded | +| 3 | Seems Down | +| 4 | Down | + +Higher severity = worse. Severity climbs independently of state — a worsening Degraded event bumps severity without changing state. New severity values can be added (e.g. 5 for "data loss confirmed") without breaking ordering. Consumers should treat severity as a numeric comparison, not a switch on specific values. + +**Why expose both?** Severity is for thresholds (`severity >= 3 ? page on-call : email digest`); state is for human-readable rendering (`incident.state == "Seems Down" ? badge.color = yellow`). Competitors that collapse to one field force consumers to either parse a string for ordering or build their own numeric mapping. + +--- + +## Endpoints + +The full surface is grouped into five capability families, matching `ROADMAP.md`. The implemented route table lives in `internal/api/routes.go`; design-only additions and deferred behavior are called out where they appear. + +### Family 1: Sites and current state + +#### `GET /api/v1/sites` + +List sites visible to this token. + +**Scopes:** `read` + +Normal internal callers see the full site table. Gateway-routed requests only +see rows mapped to `X-Jetmon-Tenant-ID` in `jetmon_site_tenants`. + +**Query parameters:** + +| Param | Type | Description | +|-------|------|-------------| +| `cursor` | string | Pagination cursor | +| `limit` | int (1–200) | Default 50 | +| `state` | string | Filter by current state (e.g. 
`Down`) | +| `state__in` | csv | Multiple states | +| `severity__gte` | int | Minimum severity | +| `monitor_active` | bool | Filter active vs paused | +| `q` | string | URL substring search | + +**Response 200:** + +```json +{ + "data": [ + { + "id": 12345, + "blog_id": 12345, + "monitor_url": "https://example.com", + "monitor_active": true, + "bucket_no": 0, + "check_interval": 5, + "current_state": "Up", + "current_severity": 0, + "active_event_id": null, + "last_checked_at": "2026-04-25T03:24:11.123Z", + "last_status_change_at": "2026-04-21T09:14:00.000Z", + "ssl_expiry_date": "2026-08-12", + "check_keyword": null, + "redirect_policy": "follow", + "maintenance_start": null, + "maintenance_end": null, + "alert_cooldown_minutes": null + } + ], + "page": { "next": "eyJ...", "limit": 50 } +} +``` + +`id` and `blog_id` are the same value for now; `id` is the public field name (`blog_id` is the historical column name). Consumers should rely on `id`. + +`current_state`, `current_severity`, and `active_event_id` are derived from +open rows in `jetmon_events`. During shadow-v2-state migration the legacy +`site_status` column is only a fallback for sites with no active v2 event while +`LEGACY_STATUS_PROJECTION_ENABLE` is true; once the projection is disabled, a +site with no active v2 event is reported as `Up` regardless of stale legacy +status values. + +#### `GET /api/v1/sites/{id}` + +Single site, same shape as a list entry plus an `active_events` array for any open events: + +```json +{ + "id": 12345, + ... + "active_events": [ + { + "id": 487291, + "check_type": "http", + "severity": 4, + "state": "Down", + "started_at": "2026-04-25T03:18:38.329Z" + }, + { + "id": 487288, + "check_type": "tls_expiry", + "severity": 1, + "state": "Warning", + "started_at": "2026-04-23T00:00:00.000Z" + } + ] +} +``` + +`active_events` is the simplest answer to "tell me everything wrong with this site right now." Ordered by severity descending. + +Gateway-routed single-site, event/history, SLA/stat, and trigger-now routes all +derive visibility through `jetmon_site_tenants`. A site or event outside the +tenant mapping is returned as not found. + +#### `POST /api/v1/sites` + +Create a site. + +**Scopes:** `write` + +**Request body:** + +```json +{ + "blog_id": 12345, + "monitor_url": "https://example.com", + "monitor_active": true, + "bucket_no": 0, + "check_keyword": null, + "redirect_policy": "follow", + "timeout_seconds": null, + "custom_headers": {}, + "alert_cooldown_minutes": null, + "check_interval": 5 +} +``` + +**Response 201:** the site object. + +When the `gateway` consumer creates a site with tenant context, Jetmon inserts +the site row and the `(tenant_id, blog_id)` mapping in one transaction. Internal +creates without tenant context keep the existing unscoped behavior. + +**Errors:** + +| Code | Meaning | +|------|---------| +| `invalid_blog_id` | `blog_id` is missing or not a positive integer | +| `invalid_url` | `monitor_url` doesn't parse | +| `invalid_redirect_policy` | `redirect_policy` is not `follow`, `alert`, or `fail` | +| `invalid_custom_headers` | `custom_headers` is not a valid string map | +| `site_exists` | A site with this `blog_id` already exists | + +#### `PATCH /api/v1/sites/{id}` + +Partial update. Send only the fields you want to change. + +#### `DELETE /api/v1/sites/{id}` + +Soft-delete (sets `monitor_active = false` and tombstones). Closes any active events with `resolution_reason = manual_override`. + +Delete is intentionally idempotent and preserves the site row. 
Repeating +`DELETE /api/v1/sites/{id}` returns `204 No Content`, and a later +`GET /api/v1/sites/{id}` returns `200 OK` with the same site object and +`monitor_active: false`. Consumers should treat `monitor_active:false` as the +readable deleted/paused state rather than expecting a `404` after delete. + +#### `POST /api/v1/sites/{id}/pause`, `POST /api/v1/sites/{id}/resume` + +Convenience verbs for the common pause/resume flow. Pause closes any active events with `resolution_reason = manual_override` and sets `current_state = "Paused"`. Resume reverts. + +#### `POST /api/v1/sites/{id}/trigger-now` + +Force an immediate check, returning the result inline under the caller's normal per-key rate limit. Useful for "I just deployed a fix, is it back up?" + +```json +{ + "result": { + "http_code": 200, + "error_code": 0, + "success": true, + "rtt_ms": 412, + "dns_ms": 8, + "tcp_ms": 22, + "tls_ms": 35, + "ttfb_ms": 142, + "ssl_expires_at": "2026-08-12T00:00:00.000Z" + }, + "current_state": "Up", + "active_events_closed": [487291] +} +``` + +Trigger-now runs one synchronous check with a 30-second server-side timeout. +On success it closes any open events with `resolution_reason=probe_cleared`. +On failure it returns the failed check result but does not open a new event; +the orchestrator remains the single owner of failure detection and event +opening on its regular round. + +### Family 2: Events and history + +#### `GET /api/v1/sites/{id}/events` + +Incident history for a site. Default sort: most recent `started_at` first. + +**Query parameters:** + +| Param | Type | Description | +|-------|------|-------------| +| `cursor`, `limit` | | Standard | +| `state` / `state__in` | string | Filter by state | +| `check_type` / `check_type__in` | string | `http`, `tls_expiry`, etc. | +| `started_at__gte` / `started_at__lt` | ISO timestamp | Time range | +| `active` | bool | `true` → only open events; `false` → only closed | + +**Response:** + +```json +{ + "data": [ + { + "id": 487291, + "site_id": 12345, + "endpoint_id": null, + "check_type": "http", + "discriminator": null, + "severity": 4, + "state": "Down", + "started_at": "2026-04-25T03:18:38.329Z", + "ended_at": "2026-04-25T03:21:17.290Z", + "resolution_reason": "verifier_cleared", + "cause_event_id": null, + "metadata": { + "http_code": 503, + "error_code": 0, + "rtt_ms": 84, + "url": "https://example.com" + }, + "duration_ms": 158961, + "transition_count": 5 + } + ], + "page": { "next": "eyJ...", "limit": 50 } +} +``` + +`duration_ms` is a server-computed convenience: `(ended_at or now) - started_at`. `transition_count` lets the consumer decide whether to fetch the full transition log. + +#### `GET /api/v1/sites/{id}/events/{event_id}` + +Single event, same shape, plus a `transitions` array (full history, no pagination — events have bounded transition counts). + +```json +{ + "id": 487291, + ... 
+ "transitions": [ + { + "id": 1, + "severity_before": null, + "severity_after": 3, + "state_before": null, + "state_after": "Seems Down", + "reason": "opened", + "source": "host-us-west-1", + "metadata": { "http_code": 503, "rtt_ms": 84 }, + "changed_at": "2026-04-25T03:18:38.329Z" + }, + { + "id": 2, + "severity_before": 3, + "severity_after": 4, + "state_before": "Seems Down", + "state_after": "Down", + "reason": "verifier_confirmed", + "source": "host-us-west-1", + "metadata": { "verifier_results": [...], "verifier_confirmed": 2 }, + "changed_at": "2026-04-25T03:18:55.412Z" + } + ] +} +``` + +#### `GET /api/v1/sites/{id}/events/{event_id}/transitions` + +Same transition data, but as its own paginated list when an event has accumulated many transitions (long-running degradation events with hundreds of severity bumps). + +#### `GET /api/v1/events/{event_id}` + +Direct event lookup without site context. Useful for webhook payloads that link directly to an incident page. + +#### `POST /api/v1/sites/{id}/events/{event_id}/close` + +Manually close an open event (for the operator dashboard or for handling false alarms the verifier missed). + +**Scopes:** `write` + +**Request body:** + +```json +{ + "reason": "manual_override", + "note": "Confirmed maintenance was running, alert fired before window started" +} +``` + +`note` ends up in the closing transition's metadata. + +### Family 3: SLA and statistics + +#### `GET /api/v1/sites/{id}/uptime` + +Uptime and downtime stats over a rolling window. + +**Query parameters:** + +| Param | Type | Description | +|-------|------|-------------| +| `window` | enum | `1h`, `24h` / `1d`, `7d`, `30d`, `90d` | +| `from` / `to` | ISO timestamp | Custom range; overrides `window` | + +**Response:** + +```json +{ + "window": { "from": "2026-03-26T00:00:00Z", "to": "2026-04-25T00:00:00Z" }, + "uptime_percent": 99.847, + "total_seconds": 2592000, + "down_seconds": 3960, + "degraded_seconds": 600, + "warning_seconds": 86400, + "maintenance_seconds": 0, + "unknown_seconds": 0, + "incident_count": 4, + "mttr_seconds": 990, + "mtbf_seconds": 647760 +} +``` + +**How uptime is computed:** sum of `(ended_at or now) - started_at` for events with `state in (Down, Seems Down)` within the window, divided by total window duration. Degraded, Warning, Maintenance, and Unknown durations are returned separately but are not subtracted from the denominator in the current implementation. The math is event-driven, not check-driven, which means SLA reports stay accurate even if check frequency changes. + +#### `GET /api/v1/sites/{id}/response-time` + +Response time percentiles over a window, sourced from `jetmon_check_history`. + +**Response:** + +```json +{ + "window": { "from": "2026-04-24T00:00:00Z", "to": "2026-04-25T00:00:00Z" }, + "samples": 17280, + "p50_ms": 187, + "p95_ms": 412, + "p99_ms": 891, + "max_ms": 4200, + "mean_ms": 215, + "truncated": false +} +``` + +Percentiles are computed from raw `jetmon_check_history` samples in the window. The handler caps the in-memory sample set at 100,000 rows; `truncated: true` means the response used the most recent capped subset. + +#### `GET /api/v1/sites/{id}/timing-breakdown` + +DNS / TCP / TLS / TTFB breakdown — one of Jetmon's distinctive features (most competitors only return total response time). 
+ +**Response:** + +```json +{ + "window": { "from": "2026-04-24T00:00:00Z", "to": "2026-04-25T00:00:00Z" }, + "samples": 17280, + "truncated": false, + "dns": { "p50_ms": 8, "p95_ms": 45, "p99_ms": 80, "max_ms": 120 }, + "tcp": { "p50_ms": 22, "p95_ms": 78, "p99_ms": 140, "max_ms": 220 }, + "tls": { "p50_ms": 35, "p95_ms": 110, "p99_ms": 180, "max_ms": 260 }, + "ttfb": { "p50_ms": 142, "p95_ms": 391, "p99_ms": 760, "max_ms": 1200 } +} +``` + +### Family 4: Alert contacts and webhooks + +#### Webhook management endpoints + +Implemented routes: + +- `GET /api/v1/webhooks` +- `POST /api/v1/webhooks` +- `GET /api/v1/webhooks/{id}` +- `PATCH /api/v1/webhooks/{id}` +- `DELETE /api/v1/webhooks/{id}` +- `POST /api/v1/webhooks/{id}/rotate-secret` +- `GET /api/v1/webhooks/{id}/deliveries` +- `POST /api/v1/webhooks/{id}/deliveries/{delivery_id}/retry` + +Standard CRUD. A webhook is: + +```json +{ + "id": 42, + "url": "https://hooks.slack.com/...", + "active": true, + "events": ["event.opened", "event.severity_changed", "event.closed"], + "site_filter": { "site_ids": [12345, 67890] }, + "state_filter": { "states": ["Down", "Seems Down"] }, + "secret": "whsec_a1b2c3...", + "created_at": "2026-04-01T00:00:00Z" +} +``` + +`secret` is the only string-prefixed identifier in the API surface — it's a shared secret, not a resource id, and the `whsec_` prefix is a Stripe-style hint to anyone scanning logs/leaks ("this is a webhook signing secret, treat as sensitive"). It is shown only on creation; afterward only `secret_preview` is returned (last 4 chars). + +#### Filter semantics + +Filters compose **AND across dimensions, whitelist within each, empty = match all**. A delivery fires when: + +``` +event_type ∈ events (or events == []) +AND site_id ∈ site_filter.site_ids (or site_filter == {}) +AND state ∈ state_filter.states (or state_filter == {}) +``` + +Empty fields mean "no restriction on this dimension," matching the everyday English meaning of an empty filter. Same convention as Stripe, GitHub, and Slack webhooks — consumers can omit dimensions they don't care about and progressively narrow as needed. Blacklist/exclude fields are not supported in v1. + +#### Webhook delivery format + +When an event fires, Jetmon POSTs to the webhook URL: + +```json +{ + "type": "event.opened", + "delivered_at": "2026-04-25T03:18:38.500Z", + "delivery_id": 9182734, + "event": { ... full event object ... }, + "site": { ... full site object ... } +} +``` + +Headers: + +``` +Content-Type: application/json +X-Jetmon-Event: event.opened +X-Jetmon-Delivery: 9182734 +X-Jetmon-Signature: t=1714685400,v1=5257a869e7ec... +``` + +The signature is HMAC-SHA256 of `{timestamp}.{body}` with the webhook's `secret`, formatted Stripe-style (timestamp + scheme version + signature). The timestamp prevents replay; consumers should reject deliveries older than 5 minutes. + +#### Webhook event types + +- `event.opened` — new event row inserted +- `event.severity_changed` — severity escalated or de-escalated +- `event.state_changed` — state changed (e.g. Seems Down → Down) +- `event.cause_linked` / `event.cause_unlinked` +- `event.closed` — event resolved (any reason) + +`event.*` types fire once per transition row written to `jetmon_event_transitions` — i.e., once per actual mutation. The 1:1 invariant the eventstore maintains is what makes detection reliable. + +**Deferred:** `site.state_changed` (rollup from events to the site-row projection) is **not** in v1. 
Rolling up cleanly without races requires changes to the orchestrator, and event-level webhooks already give consumers everything they need. Tracked in ROADMAP.md. + +#### Detection mechanism + +Webhook delivery uses **pull-based detection**: a worker polls `jetmon_event_transitions WHERE id > last_seen` on a 1s interval and creates one delivery row per matching transition. This is the long-term answer for Jetmon's architecture — the orchestrator's flap suppression already adds 10s+ between detection and confirmed events, so 1s poll latency is invisible in the practical budget. + +Current v2 deployment constraint: in the single-binary shape, `API_PORT` makes webhook and alert-contact workers eligible to run. Delivery rows are claimed transactionally, so multiple active delivery workers do not claim the same pending row. `DELIVERY_OWNER_HOST` can still restrict actual delivery to one named host when operators want a single-owner rollout while moving from embedded `jetmon2` delivery to standalone `jetmon-deliverer`. + +Push-based or hybrid detection is not on the roadmap. If a future consumer demands sub-second webhook latency, that's the trigger to introduce a pub/sub layer — not before. + +#### Retry policy + +Each `jetmon_webhook_deliveries` row is one webhook firing. Each delivery has up to 6 attempts on this exponential schedule: + +| Attempt | Delay from previous | +|---------|---------------------| +| 1 | immediate | +| 2 | 1m | +| 3 | 5m | +| 4 | 30m | +| 5 | 1h | +| 6 | 6h | + +A delivery succeeds when any attempt returns 2xx. After 6 failed attempts, the row is marked `status = 'abandoned'`. Abandoned rows stay in the table — `GET /api/v1/webhooks/{id}/deliveries?status=abandoned` lists them, and `POST /api/v1/webhooks/{id}/deliveries/{delivery_id}/retry` lets a consumer re-fire after fixing their endpoint. + +`GET /api/v1/webhooks/{id}/deliveries` returns the full delivery history with `status` (`pending` / `delivered` / `failed` / `abandoned`), `attempt`, `last_status_code`, and a truncated `last_response` body for debugging. + +#### Signing and secret rotation + +Signature: HMAC-SHA256 of `{timestamp}.{body}` with the webhook's secret, sent as `X-Jetmon-Signature: t=,v1=`. The timestamp prevents replay; consumers should reject deliveries older than 5 minutes. + +Format chosen for: wide library support across consumer languages, explicit version (`v1=`) to allow future algorithm rotation without breaking consumers, replay protection via timestamp baked into the signature input, and the ability to coexist with multiple `v1=` values during a grace-period rotation (deferred). Alternatives considered and not chosen: GitHub-style (no replay protection), Slack-style (functionally equivalent, two-header form), JWT-based (wrong abstraction for "POST JSON + signature header"), HTTP Message Signatures / RFC 9421 (over-engineered for our scope), asymmetric / Ed25519 (compelling for public APIs without a gateway in front; not warranted while a gateway re-signs for end customers). + +When to revisit: a public-API-without-gateway requirement (then asymmetric becomes attractive — no per-consumer secret distribution), or a standards-driven third-party integration that requires RFC 9421. Migration path in either case is "add a `v2=` signature alongside `v1=` for a transition window, switch consumers, deprecate `v1=`" — same shape as algorithm rotation we already designed for. + +Secret rotation in v1: **immediate revocation only**. 
`POST /api/v1/webhooks/{id}/rotate-secret` returns a new secret once, replaces the stored hash, and the old secret stops working immediately. Failed deliveries during the consumer's deploy window go into the retry queue. + +**Deferred:** grace-period rotation (server signs with both old and new secrets for a configurable window so consumers can roll over without coordinated downtime) is in ROADMAP.md. The signature header format already supports multiple `v1=...,v1=...` values per Stripe convention, so adding grace-period rotation later is non-breaking. + +#### Backpressure + +Delivery uses a **shared worker pool** (default 50 goroutines, configurable) with a **per-webhook in-flight cap** (default 3 concurrent). The shared pool bounds total goroutine count; the per-webhook cap prevents a slow or hung webhook URL from monopolizing the pool and starving other webhooks' deliveries. + +Implementation: at dispatch time, the worker checks a `map[webhook_id]int` counter under a mutex. If a webhook is already at its cap, the row stays `pending` and is picked up on the next poll tick. The counter decrements when a delivery attempt completes (success or failure). + +#### Schema + +``` +jetmon_webhooks: + id, url, active, owner_tenant_id VARCHAR(128) NULL, + events JSON, site_filter JSON, state_filter JSON, + secret VARCHAR(80), secret_preview VARCHAR(8), + created_by VARCHAR(128), created_at, updated_at + +jetmon_webhook_deliveries: + id, webhook_id, transition_id, event_id, event_type, + payload JSON, -- frozen at fire time, never updated + status ENUM('pending','delivered','failed','abandoned'), + attempt INT, + next_attempt_at TIMESTAMP NULL, -- when the worker should pick up + last_status_code INT NULL, + last_response VARCHAR(2048) NULL, -- truncated body, debugging aid + last_attempt_at TIMESTAMP NULL, + delivered_at TIMESTAMP NULL, + created_at +``` + +Indexes: +- `(status, next_attempt_at)` on deliveries — the worker's "what's ready?" query +- `(webhook_id, created_at)` on deliveries — the deliveries-list endpoint +- `(active)` on webhooks — the dispatcher's filter for live webhooks +- `(owner_tenant_id)` on webhooks — scopes gateway-routed CRUD and delivery visibility while normal internal callers remain unscoped + +`payload` is **frozen at delivery creation**: the consumer sees the event as it was when the webhook fired, not as it is now. A closed-and-amended event would not change a delivery's payload — that's the contract consumers expect ("this is what I was told happened, not whatever it became"). + +#### Webhook ownership and scope + +Webhooks are managed by any `write`-scope token. `created_by` records the consumer name from the API key for audit purposes only — there is no per-consumer ownership boundary, and any `write`-scope token can read/edit/delete any webhook. + +This is appropriate **only** because Jetmon is internal-only with all consumers trusted. Per-consumer ownership doesn't add value at this scale; the gateway in front of Jetmon handles tenant isolation for any customer-facing webhooks. + +The table includes nullable `owner_tenant_id`. Normal internal handlers remain +unscoped when no gateway context is present, so existing internal behavior is +unchanged. Gateway-routed creates set `owner_tenant_id`, and gateway-routed +list/get/update/delete/rotate-secret paths filter by it. Delivery history and +manual retry visibility are derived by first verifying ownership of the parent +webhook. 
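+
+A minimal sketch of that scoping decision, under assumed names (the real handlers live in `internal/api/` and `internal/webhooks/`): when gateway tenant context is present, the lookup gains an `owner_tenant_id` predicate; normal internal callers keep the unscoped query.
+
+```go
+// Illustrative only — handler shape and names are assumptions; the real
+// tenant scoping lives in internal/api/ and internal/webhooks/.
+package api
+
+import (
+	"context"
+	"database/sql"
+)
+
+// lookupWebhook scopes the row to the gateway-supplied tenant when one is
+// present; normal internal callers ("" tenant) keep the unscoped behavior.
+func lookupWebhook(ctx context.Context, db *sql.DB, id int64, tenant string) *sql.Row {
+	q := `SELECT id, url, active, events, site_filter, state_filter
+	        FROM jetmon_webhooks WHERE id = ?`
+	args := []any{id}
+	if tenant != "" {
+		q += ` AND owner_tenant_id = ?` // gateway-routed: owner-scoped
+		args = append(args, tenant)
+	}
+	return db.QueryRowContext(ctx, q, args...)
+}
+```
+
+Delivery-history and manual-retry handlers would apply the same check by resolving the parent webhook first, per the ownership rule above.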
+ +**Ramifications if Jetmon ever becomes a public API:** + +- This model would need to change. Customer-facing consumers cannot be allowed to read or modify each other's webhooks. +- Migration path: continue requiring `owner_tenant_id` on gateway-routed + creates; add granular public `webhooks` scopes or a formal account/tenant + boundary before any direct customer exposure. +- The `created_by` field is forward-compatible — it's already capturing the consumer identity, just not enforcing it. +- Existing webhooks would need a backfill migration before being exposed publicly. +- Webhook secrets would need stronger isolation (currently any write-scope can rotate any secret; in a public API this would be a privilege escalation). + +The decision to defer ownership today should be reread before any public-API conversation actually starts. + +### Family 5: Alert contacts + +Managed notification channels for human destinations: email, PagerDuty, Slack, Microsoft Teams. Where webhooks (Family 4) deliver a raw signed event stream that the consumer renders, alert contacts deliver a Jetmon-rendered notification through a transport Jetmon owns end-to-end (subject lines, message formatting, transport-specific quirks). + +#### When to use which + +- **Alert contact** — you want a person notified through a managed channel (their email, your team's PagerDuty service, your team's Slack channel). You don't want to operate a receiver, you want Jetmon to handle rendering and retries. +- **Webhook** — you want a *system* notified, you control the receiver, and you want the raw signed event payload to render or route however you want. Use this for custom Slack bots that aren't a vanilla incoming-webhook URL, internal SIEM ingestion, custom alerting middleware, or anything that wants the structured event rather than a pre-formatted message. + +The two surfaces share the same event source (`jetmon_event_transitions`); a customer can use both simultaneously without dedup concerns at the source. + +#### Alert contact management endpoints + +Implemented routes: + +- `GET /api/v1/alert-contacts` +- `POST /api/v1/alert-contacts` +- `GET /api/v1/alert-contacts/{id}` +- `PATCH /api/v1/alert-contacts/{id}` +- `DELETE /api/v1/alert-contacts/{id}` +- `POST /api/v1/alert-contacts/{id}/test` +- `GET /api/v1/alert-contacts/{id}/deliveries` +- `POST /api/v1/alert-contacts/{id}/deliveries/{delivery_id}/retry` + +Standard CRUD. An alert contact is: + +```json +{ + "id": 17, + "label": "platform-oncall", + "active": true, + "transport": "pagerduty", + "destination": { "integration_key": "***" }, + "site_filter": { "site_ids": [12345, 67890] }, + "min_severity": "Down", + "max_per_hour": 60, + "destination_preview": "abcd", + "created_by": "alerts-admin", + "created_at": "2026-04-25T00:00:00Z" +} +``` + +`destination` shape varies by transport (see below); credential fields are write-only and only `destination_preview` (last 4 chars of the credential) is returned on subsequent reads. + +#### Transports + +| Transport | `destination` shape | Notes | +|-----------|---------------------|-------| +| `email` | `{ "address": "ops@example.com" }` | Rendered as a plain-text + HTML email. Sent via the configured email transport (see "Email delivery" below). | +| `pagerduty` | `{ "integration_key": "" }` | Posts to PagerDuty Events API v2. Jetmon severity maps to PagerDuty severity: `Down`/`SeemsDown` → `critical`, `Degraded` → `warning`, `Warning` → `info`, `Up` → resolves the alert. 
| +| `slack` | `{ "webhook_url": "https://hooks.slack.com/..." }` | Posts to a Slack incoming-webhook URL. Renders a Block Kit message with site, state, severity, and an event link. | +| `teams` | `{ "webhook_url": "https://outlook.office.com/webhook/..." }` | Posts to a Microsoft Teams incoming-webhook URL. Renders an Adaptive Card with the same fields as Slack. | + +Custom transports (Slack via OAuth bot, OpsGenie, internal SIEM, etc.) go through the webhooks API instead — register a webhook, render however you want. + +#### Filter semantics + +Alert contacts use a simpler filter model than webhooks: **site list + severity gate**. A contact fires when: + +``` +site_id ∈ site_filter.site_ids (or site_filter == {} → all sites) +AND new_severity >= min_severity (Up=0 < Warning=1 < Degraded=2 < SeemsDown=3 < Down=4) +``` + +Empty `site_filter` means "all sites." `min_severity` is required and defaults to `Down` on create — this is the most common case (page me only on real outages) and avoids accidental noise from new contacts. + +The severity values match `internal/eventstore.Severity*` constants directly; the API exposes them by string name in JSON (`"Down"`, `"SeemsDown"`, etc.) and stores them as the underlying `uint8` in the database. + +The simpler filter model is intentional. Most alert contact configs are "this person, these sites, only when something serious happens"; event-type and state filters (which webhooks support) are rarely useful for human pagers — if you got the open page you almost always want the close page too. Customers who need finer-grained filtering register a webhook instead. + +#### Severity gate + +Severity ordering: `Up < Warning < Degraded < SeemsDown < Down`. The gate matches `new_severity >= min_severity` on each transition; events that *increase* into the gated band send a page, events that *resolve back to `Up`* send a recovery notification, events that move between two severities both below the gate are silently dropped. + +This lets agencies and VIPs configure low-severity contacts (e.g. `min_severity: "Warning"`) that catch every flicker while still letting normal users configure `Down`-only contacts that only fire on real outages — both from the same plumbing. + +#### Per-contact rate cap + +`max_per_hour` (default 60, set to `0` for unlimited) caps how many notifications a single contact can receive per rolling hour. Designed against the pager-storm scenario where a regional outage flips 200 sites at once; without a cap, on-call gets paged 200 times in 30 seconds. When the cap is hit, further transitions for that contact are marked `abandoned` with a rate-limit note and are not dispatched. Digest notifications are deferred. + +This is a per-contact field, not global — different contacts have different tolerance (a Slack channel can take far more than a PagerDuty oncall can). + +#### Send-test + +``` +POST /api/v1/alert-contacts/{id}/test +``` + +Sends a synthetic notification through the contact's transport — same rendering, same dispatch path, but with payload `{"test": true, "message": "Jetmon test notification", ...}`. Used by operators to verify a newly-created contact actually reaches its destination. Test sends are exempt from `max_per_hour`, are logged in `jetmon_audit_log` under `event_type=alert_test`, and bypass the severity gate (always delivered). 
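+
+As an illustration of exercising that endpoint, the sketch below shows how an operator script might fire a test send. The base URL, port, token, and contact id are placeholders, and the `Idempotency-Key` header anticipates the behaviour described in the next paragraph:
+
+```go
+// sendtest.go — hedged sketch of calling the send-test endpoint; the URL,
+// token, and contact id below are placeholders, not real deployment values.
+package main
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+)
+
+func main() {
+	req, err := http.NewRequest(http.MethodPost,
+		"http://jetmon.internal:8080/api/v1/alert-contacts/17/test", nil)
+	if err != nil {
+		panic(err)
+	}
+	req.Header.Set("Authorization", "Bearer <write-scope-token>")
+	// Reuse the same key on retry so a network blip cannot double-page the destination.
+	req.Header.Set("Idempotency-Key", "send-test-contact-17-2026-04-25")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+	body, _ := io.ReadAll(resp.Body)
+	fmt.Println(resp.Status) // expect 200 OK with the test delivery row
+	fmt.Println(string(body))
+}
+```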
+ +Honors `Idempotency-Key` like the other write POSTs — a retried request with the same key returns the original response without re-firing the test, so a network blip during the operator's "click to test" doesn't double-page the destination. + +Returns `200 OK` with the test delivery row, or surfaces the transport error (e.g. invalid Slack webhook URL) directly so operators can debug without spelunking through worker logs. + +#### Email delivery + +Email is unique among the transports in that there is no equivalent of "post to this URL" — it requires a sender. Three implementations selectable at startup via `EMAIL_TRANSPORT` config: + +| `EMAIL_TRANSPORT` | Use case | Behavior | +|-------------------|----------|----------| +| `wpcom` | Production | Calls existing WPCOM email infrastructure. Default in production deploys. | +| `smtp` | Local dev / staging | Connects to an SMTP server (e.g. Mailpit in the Docker Compose stack). Configurable host/port/auth. | +| `stub` | Local dev / unit testing / disabled email | Logs the rendered email; no actual send. | + +The `Sender` interface is internal to the alerting package, so swapping transports is a config change — no code path differences. SMTP support specifically exists so docker-based integration tests can verify rendering and addressing end-to-end without depending on WPCOM infrastructure. + +`stub` is the default and the empty-string compatibility alias. Startup and `jetmon2 validate-config` both warn when the resolved transport is `stub` so operators know any alert contact with `transport="email"` will be logged but not delivered. + +#### Subscription assignment + +Site assignment is via `site_filter.site_ids` on the contact row itself, not a separate join table. Mirrors the webhooks API. Empty list = all sites. Setting `site_filter: {"site_ids": []}` or `{}` is "subscribe to all sites." On create, omitting `site_filter` also produces the empty match-all filter; on PATCH, omitting `site_filter` leaves the existing filter unchanged. + +#### Detection mechanism + +Same as webhooks — pull-only, polling `jetmon_event_transitions` on a high-water mark. Different worker (`internal/alerting/`) with the same dispatch shape: claim → match contacts → enqueue per-contact deliveries in `jetmon_alert_deliveries` → dispatch with retry. Worker placement is intentionally parallel to webhooks rather than unified; see ROADMAP for the rationale and the future revisit point. + +#### Retry policy + +Same schedule as webhooks: 1m, 5m, 30m, 1h, 6h, then abandon. Different transports have different idempotency stories — PagerDuty Events API is idempotent on `dedup_key`, Slack webhooks are not — so each transport implementation owns its retry-safety guarantee. Worker-level retry is conservative; if the transport library returns success, we never re-send. + +#### Relationship to legacy WPCOM notifications + +The existing WPCOM notification flow (orchestrator-side, hard-coded recipients) **continues to operate independently** in v1. Alert contacts are a parallel programmable path; they don't replace WPCOM notifications, they coexist. + +This means: +- An incident may notify the same human twice if they're configured in both paths. Document this on the operator side and avoid duplicate configuration. +- The two paths have separate retry state, separate metrics, separate audit trails. +- Migrating WPCOM notifications behind alert contacts is a future cleanup tracked in the roadmap, gated on alert contacts proving out in production. 
+ +The boundary is: WPCOM = built-in path for existing internal Jetpack notifications; alert contacts = customer-managed destinations through the API. Anything new should go through alert contacts. + +#### Schema + +```sql +jetmon_alert_contacts ( + id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, + label VARCHAR(80) NOT NULL, + active TINYINT(1) NOT NULL DEFAULT 1, + owner_tenant_id VARCHAR(128) NULL, + transport ENUM('email','pagerduty','slack','teams') NOT NULL, + destination JSON NOT NULL, -- transport-specific, secret in plaintext (outbound dispatch needs raw value) + destination_preview VARCHAR(8) NOT NULL, + site_filter JSON NOT NULL, -- {"site_ids":[...]} or {} for all + min_severity TINYINT UNSIGNED NOT NULL DEFAULT 4, -- matches eventstore.Severity* (0=Up..4=Down); default 4=Down + max_per_hour INT NOT NULL DEFAULT 60, + created_by VARCHAR(80) NOT NULL, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) + +jetmon_alert_deliveries ( + -- mirrors jetmon_webhook_deliveries; dedup on (alert_contact_id, transition_id) +) + +jetmon_alert_dispatch_progress ( + -- mirrors jetmon_webhook_dispatch_progress; high-water mark for the worker +) +``` + +`destination` stores the credential in plaintext. Same rationale as `jetmon_webhooks.secret`: outbound dispatch needs the raw value (PagerDuty integration key, Slack webhook URL, SMTP password) at every send — a hash is useless because we'd have to recover the original to call the transport. The threat model is the database itself; encryption-at-rest on the storage layer is the correct mitigation, not application-level hashing. + +#### Alert contact ownership + +Same internal model as webhooks: any `write`-scope token can manage any alert +contact when no gateway context is present, and `created_by` is audit-only. +Gateway-routed creates set `owner_tenant_id`; gateway-routed +list/get/update/delete/test paths filter by it. Delivery history and manual +retry visibility are derived by first verifying ownership of the parent alert +contact. + +### Family 6: Identity and utility + +#### `GET /api/v1/me` + +Returns the identity associated with the current token: consumer name, scope, rate limit. Useful for a service to confirm at startup that its token is valid and has the expected permission level. + +```json +{ + "consumer_name": "alerts-worker", + "scope": "read", + "rate_limit_per_minute": 600, + "expires_at": null +} +``` + +This is the only API surface for keys. **Creation, listing, and revocation are CLI-only** (`./jetmon2 keys ...`); see Authentication above. There is no `/api/v1/keys` endpoint. + +#### `GET /api/v1/health` + +Unauthenticated. Returns `{ "status": "ok" }` if the API can talk to the database. For load balancers and external uptime monitors (yes, including external monitors monitoring the monitor). + +#### `GET /api/v1/openapi.json` + +Returns the route-driven OpenAPI 3.1 contract for the internal API. Requires `read` scope like other internal introspection routes. The spec is generated from the same route table used to build the running server mux, so new routes must be added to that table before they can be served or documented. + +The current contract publishes paths, methods, auth scope, idempotency headers, path parameters, request/response component schemas derived from the handler structs, and the standard error envelope. 
`internal/api` tests resolve every component `$ref` and type-check a generated Go client smoke source from the published operation IDs and component names. Stricter public compatibility checks are tracked in `ROADMAP.md`. + +--- + +## What we deliberately did not include + +- **No Statuspage-style public status pages.** That's a separate product; Jetmon focuses on monitoring. If you want a public status page, the API gives you what you need to build one. +- **No "monitor groups" / "tags" in v1.** Most consumers organize by `owner_blog_id`; tagging is a complexity multiplier we'd rather defer until requested. +- **No GraphQL.** REST + cursor pagination + filters covers everything the v1 use cases need. If a future consumer needs nested-fetch optimization (sites + active events + recent transitions in one round-trip), we'd add a single `/api/v1/sites/{id}/full` endpoint before reaching for GraphQL. +- **No per-region SLA breakdown.** All sites are checked from the orchestrator's bucket assignment, not a multi-region fleet (yet — see `TAXONOMY.md` v2/v3 vantage-point work). When that ships, the SLA endpoint gains a `?vantage_point=us-west-1` filter. +- **No streaming.** Webhooks cover event-driven needs; long-poll/SSE/WebSocket support is overkill for the current consumer set. Could be added on `/api/v1/sites/{id}/events/stream` if a consumer asks. + +## Implementation Phase Map + +Phase 1 (read-only foundation, implemented): +- `jetmon_api_keys` migration + sha256 hashing helpers +- `./jetmon2 keys create/list/revoke/rotate` CLI +- Auth middleware (Bearer token validation, scope enforcement, audit logging via `jetmon_audit_log`) +- Health check + `GET /api/v1/me` +- Family 1 read endpoints (sites list, single site) +- Family 2 (events list, single event with transitions, transitions list) +- Family 3 (uptime, response-time, timing-breakdown) +- Per-key rate limiting + standard headers + +Phase 2 (write surface, implemented): +- Family 1 write endpoints (POST/PATCH/DELETE sites, pause/resume, trigger-now) +- Family 2 manual close +- Idempotency keys on POST routes +- Route-driven OpenAPI 3.1 contract at `GET /api/v1/openapi.json` + +Phase 3 (webhook delivery, implemented): +- Family 4 webhooks (CRUD + delivery infrastructure with HMAC signing + retry backoff) + +Phase 3.x (alert contacts, implemented): +- Family 5 alert contacts: managed channels (email, PagerDuty, Slack, Teams) +- `internal/alerting/` package — parallel to `internal/webhooks/`, same dispatch shape +- Email transport interface with `wpcom` / `smtp` / `stub` implementations +- Per-contact severity gate + per-hour rate cap +- `POST /alert-contacts/{id}/test` send-test endpoint +- Legacy WPCOM notification flow continues to operate in parallel; future migration tracked in ROADMAP + +Phase 4 (polish, future): +- Consumer-specific OpenAPI generator validation if API consumers standardize on a tool +- Bulk endpoints if real consumers need them +- Per-region filters when vantage-point work ships + +--- + +## Resolved design questions + +These were the open questions from the original draft. All resolved during review; recorded here so the rationale doesn't get lost when the doc evolves. + +1. 
**Resource ID format → raw numeric integers across all resources.** Initially proposed type-prefixed ids (`evt_12345`, `whk_42`) for self-documenting log lines, but on review the costs outweighed the benefits: dual representation between logs/DB/API, JSON type inconsistency (sites as numbers, others as strings), a real silent-coercion bug class under default MySQL `SQL_MODE`, and forward-sharding friction not actually solved by prefixes. Resolution: every resource `id` is a raw `BIGINT UNSIGNED` serialized as a JSON number. Type context is provided by endpoint paths and explicit `type` fields in error messages and webhook payloads, not embedded in the id. (Webhook signing secrets keep the `whsec_` prefix because they're shared secrets, not resource ids — the prefix is a leak-detection hint.) + +2. **Bulk site list cap → 200/page, no `include_inactive` opt-in flag.** The existing `monitor_active` filter does the same job; a separate flag would duplicate it. The 200-page cap alone is sufficient guardrail for full-table walks (100k sites at 200/page = 500 round trips, adequate for daily SLA batch jobs). If a consumer ever needs higher per-page volume, we add a `?limit_max=1000` opt-in tied to a special scope at that point — not now. + +3. **Webhook signing → Stripe-style versioned HMAC, single algorithm at a time.** Header format `t=,v1=`. The `v1=` prefix reserves space for a v2 algorithm rotation (e.g. ed25519) without breaking consumer parsers. Don't build multi-algorithm signing upfront — when rotation is actually triggered, transition period emits both `v1=...,v2=...` so consumers verify whichever they support. + +4. **`trigger-now` semantics → synchronous with a 30s server-side timeout, no async path in v1.** Matches operator and gateway expectations ("I just deployed, is it up?"), keeps the API surface narrow (one request → one response), and the existing trigger-now rate limit (1/min default per consumer) bounds connection-pool exposure. If a batch-verification consumer ever shows up, we add `?async=true` returning a 202 with a job id — but not before there's a real consumer for it. + +5. **Event metadata sanitization → single `metadata` field, no public/private split.** With this being an internal API and a gateway in front of any customer-facing surface, the `metadata` JSON can carry full operational detail (verifier hostnames, internal RPC ids, full HTTP response excerpts). The gateway is responsible for any redaction before forwarding to customers. + +--- + +## Sources / inspiration + +The patterns above were informed by reviewing the documented APIs of: + +- [Better Stack Uptime API](https://betterstack.com/docs/uptime/api/) — JSON:API envelope (we rejected), incident status enum (we extended), Bearer token auth (we adopted). +- [UptimeRobot v3 API](https://uptimerobot.com/api/v3/) — Bearer JWT, REST verbs, cursor pagination (we adopted), JSON-only (we adopted). +- [Pingdom API 3.1](https://docs.pingdom.com/api/) — OpenAPI 3.0 spec (we adopted), `summary.average` SLA endpoint shape (informed our `/uptime` design). +- [Atlassian Statuspage API](https://developer.statuspage.io/) — incident updates timeline (we extended into transitions table), component status enum `operational/degraded/partial_outage/major_outage` (we rejected — too coarse for our taxonomy). +- [Stripe API](https://stripe.com/docs/api) — error model with stable codes (we adopted), idempotency keys (we adopted), webhook signing scheme (we adopted). 
+ +None of these were copied; each pattern was evaluated against Jetmon's data model and either adopted, modified, or rejected with rationale. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 872ffa27..08bde826 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -8,12 +8,15 @@ call flow used to determine and report site status. System Overview --------------- -Jetmon 2 is a single Go binary. Multiple instances can run on different hosts, -each owning a non-overlapping range of site buckets claimed from MySQL. +Jetmon 2 runs as a Go monitor binary (`jetmon2`). Multiple monitor instances can +run on different hosts, each owning a non-overlapping range of site buckets +claimed from MySQL. Outbound webhooks and alert contacts can still run embedded +inside one API-enabled `jetmon2` process, or through the standalone +`jetmon-deliverer` binary as the first step toward the post-v2 process split. ``` ┌─────────────────────────────────────────┐ - │ jetmon2 (single binary) │ + │ jetmon2 │ │ │ ┌──────────┐ sites │ ┌─────────────┐ ┌─────────────────┐ │ │ MySQL │──────────► │ │ Orchestrator│───►│ Checker Pool │ │ @@ -40,6 +43,18 @@ Multiple jetmon2 instances coordinate through MySQL bucket leases: Host C ────── (takes over Host B's range if B goes offline) ``` +Shadow-v2-state migration model: + +- `jetmon_events` and `jetmon_event_transitions` are the authoritative incident + state for Jetmon v2. +- `jetpack_monitor_sites` remains the legacy site/config table during migration. +- While `LEGACY_STATUS_PROJECTION_ENABLE` is true, every v2 incident mutation + also projects the v1-compatible `site_status` / `last_status_change` fields + back to `jetpack_monitor_sites` in the same transaction. +- Once legacy readers have moved to the v2 API/event tables, disable + `LEGACY_STATUS_PROJECTION_ENABLE`; v2 incident state continues to be written + to the event tables. + Package Map ----------- @@ -47,6 +62,7 @@ Package Map ``` jetmon/ ├── cmd/jetmon2/ Entry point, CLI subcommands, signal handling +├── cmd/jetmon-deliverer/ Standalone outbound delivery worker ├── internal/ │ ├── orchestrator/ Round loop, bucket coordination, retry queue, │ │ failure escalation, status notifications @@ -58,6 +74,11 @@ jetmon/ │ ├── veriflier/ Veriflier client (JSON-over-HTTP) and server │ ├── wpcom/ WPCOM notification client with circuit breaker │ ├── audit/ Structured audit log (read + write) +│ ├── eventstore/ Authoritative incident event + transition writer +│ ├── api/ Internal REST API, auth, rate limits, idempotency +│ ├── deliverer/ Shared webhook + alert-contact worker wiring +│ ├── webhooks/ Webhook registry + HMAC-signed delivery worker +│ ├── alerting/ Managed alert-contact registry + delivery worker │ ├── metrics/ StatsD UDP client, stats file writer │ └── dashboard/ HTTP + SSE operator dashboard └── veriflier2/cmd/ Standalone veriflier binary @@ -129,8 +150,8 @@ This is the end-to-end path from database query to WPCOM notification. └─────────────┘ │ │ │ Stage 3 — Confirm down │ │ confirmDown(site, entry, vResults) │ - │ if DB_UPDATES_ENABLE: │ - │ dbUpdateSiteStatus(→ confirmed_down) │ + │ if LEGACY_STATUS_PROJECTION_ENABLE: │ + │ project site_status(→ confirmed_down) │ │ if inMaintenance(): suppress + audit │ │ else if !isAlertSuppressed(): Notify() │ │ retries.clear(blogID) │ @@ -315,7 +336,9 @@ Veriflier Transport ◄── {"status":"OK","version":"1.2.3"} ``` -The transport is JSON-over-HTTP (a placeholder for gRPC; swap after `make generate`). +The transport is JSON-over-HTTP for v2 production. 
`proto/veriflier.proto` +remains as a schema reference for a possible future transport, but generated +gRPC stubs are not required to build or deploy v2. Bucket Distribution — Multi-Host Scaling @@ -366,11 +389,12 @@ Database Tables ---------------- ``` - jetpack_monitor_sites Core site list (pre-existing, extended by Jetmon 2) + jetpack_monitor_sites Legacy site/config table plus compatibility projection blog_id WordPress site identifier bucket_no Determines which monitor instance owns this site monitor_url URL to check - site_status 1=running, 2=confirmed_down + site_status Legacy v1 projection; derived from v2 events + last_status_change Legacy v1 projection; derived from v2 transitions last_checked_at Used to order fetch by least-recently-checked ssl_expiry_date Updated after each TLS handshake check_keyword Optional body text to require @@ -387,13 +411,26 @@ Database Tables last_heartbeat Updated every round; expiry triggers rebalance status active / draining - jetmon_audit_log Immutable event record for compliance/debugging - event_type check | status_transition | wpcom_sent | - wpcom_retry | retry_dispatched | veriflier_sent | + jetmon_events Authoritative v2 incident current state + id Incident identifier + blog_id Site identifier + check_type Probe family (http, tls_expiry, ...) + severity/state Current incident projection + started_at/ended_at Incident window + resolution_reason Required close reason + + jetmon_event_transitions Append-only mutation history for jetmon_events + event_id Incident row being mutated + severity/state before/after + reason/source Why and who caused the mutation + changed_at Transition time + + jetmon_audit_log Operational trail for compliance/debugging + event_type check | wpcom_sent | wpcom_retry | + retry_dispatched | veriflier_sent | veriflier_result | maintenance_active | - alert_suppressed + alert_suppressed | api_access | config_reload blog_id, source, http_code, error_code, rtt_ms - old_status, new_status (for transition events) jetmon_check_history Per-check timing samples rtt_ms, dns_ms, tcp_ms, tls_ms, ttfb_ms @@ -401,6 +438,20 @@ Database Tables jetmon_false_positives Checks local failed but verifliers passed blog_id, http_code, error_code, rtt_ms + jetmon_api_keys Internal API Bearer-token registry + key_hash, consumer_name, scope, rate_limit_per_minute + + jetmon_webhooks Registered webhook receivers and filters + jetmon_webhook_deliveries + Per-transition webhook delivery attempts + jetmon_webhook_dispatch_progress + Webhook worker transition high-water marks + + jetmon_alert_contacts Managed notification destinations + jetmon_alert_deliveries Per-transition alert delivery attempts + jetmon_alert_dispatch_progress + Alert worker transition high-water marks + jetmon_schema_migrations Idempotent migration tracking ``` diff --git a/CHANGELOG.md b/CHANGELOG.md index de0802ed..ce0d3185 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,136 @@ Breaking changes are marked **BREAKING**. ## Unreleased +### v2 branch — site health platform + +The v2 branch builds on the Go rewrite to turn Jetmon from a status-flipper +into a full event-sourced health platform with an internal REST API, +HMAC-signed webhooks, and managed alert contacts. Kept on a parallel branch +because it is intentionally **not** drop-in with the Jetmon 1 wire format +(see PR #61 — DO NOT MERGE). 
+ +**New — event sourcing:** +- `jetmon_events` (current authoritative state per incident) and + `jetmon_event_transitions` (every status/severity change, append-only) + tables; `internal/eventstore` writes both in a single transaction +- Shadow-v2-state migration: while `LEGACY_STATUS_PROJECTION_ENABLE` is + true, event mutations also maintain the v1 `site_status` / + `last_status_change` projection for legacy consumers +- Five-layer severity ladder: `Up < Warning < Degraded < SeemsDown < Down` + matching `internal/eventstore.Severity*` constants + +**New — internal REST API (`/api/v1/`, internal-only behind a gateway):** +- Per-consumer Bearer token auth with three scopes (`read` / `write` / + `admin`); `./jetmon2 keys create/list/revoke/rotate` CLI +- Per-key token-bucket rate limiter with `X-RateLimit-*` headers +- Stripe-style idempotency keys on POST endpoints +- Sites CRUD + pause/resume/trigger-now +- Events list + single + transitions list + manual close +- SLA endpoints: uptime, response-time, timing-breakdown +- Audit logging via `jetmon_audit_log` with `event_type=api_access` +- See API.md for full surface and design rationale + +**New — webhooks (Phase 3):** +- `jetmon_webhooks` registry + `jetmon_webhook_deliveries` per-fire records +- Stripe-style HMAC-SHA256 signatures (`t=,v1=` over + `{ts}.{body}`); plaintext secret storage with documented threat model +- Filter dimensions: `events` + `site_filter` + `state_filter` (AND across, + whitelist within, empty=match all) +- Delivery worker with per-webhook in-flight cap (default 3) and shared + pool (default 50), retry ladder 1m / 5m / 30m / 1h / 6h then abandon +- Frozen-at-fire-time payload contract — consumer sees the event as it was + when the webhook fired, not as it is now +- POST `/webhooks/{id}/rotate-secret` (immediate revocation; grace-period + rotation deferred — see ROADMAP.md) +- POST `/webhooks/{id}/deliveries/{delivery_id}/retry` for operator manual + retry of abandoned rows + +**New — alert contacts (Phase 3.x):** +- Managed channels for human destinations: `email`, `pagerduty`, `slack`, + `teams`. Boundary with webhooks: alert contacts deliver Jetmon-rendered + notifications through Jetmon-owned transports; webhooks deliver the raw + signed event stream for custom rendering +- Filter shape: `site_filter` + `min_severity` (default `Down`); per-contact + `max_per_hour` rate cap (default 60) as pager-storm insurance +- POST `/alert-contacts/{id}/test` for synthetic send-tests through the + same dispatch path +- Email transport pluggable via `EMAIL_TRANSPORT` config: `wpcom` + (production), `smtp` (dev / staging with MailHog), `stub` (default + log-only / tests, with startup and validate-config warnings) +- PagerDuty Events API v2 with severity mapping and event_action + trigger/resolve based on the recovery flag +- Slack Block Kit + Microsoft Teams Adaptive Card rendering +- Plaintext credential storage in `destination` JSON; same outbound-dispatch + rationale as webhook secrets, threat model documented inline +- Legacy WPCOM notification flow continues alongside; migration tracked + in ROADMAP.md + +**Verifier hardening:** +- Body size cap and empty-token guard on the JSON-over-HTTP transport +- Verifier config validation: required `host` and `grpc_port` per entry, + PID file location now respects `JETMON_PID_FILE` env var + +**Worker fixes:** +- Soft-lock fix for both webhooks and alerting deliver loops: `ClaimReady` + pushes `next_attempt_at` out by 60s so the 1s tick doesn't re-claim a + still-in-flight row. 
Without this, the per-contact in-flight cap (3) + was producing concurrent dispatches that inflated the attempt counter + and effectively skipped retry-schedule steps; the documented 7h36m + retry window was being collapsed to ~1h. +- `ClaimReady` now repeats the readiness predicate during the soft-lock + update and returns only rows whose update affected a row, so overlapping + claim attempts skip stale SELECT results instead of doing duplicate + dispatch work. Multi-instance row-claim caveat (SELECT ... FOR UPDATE + SKIP LOCKED) still tracked alongside the deliverer-binary extraction in + ROADMAP.md. + +**Docs / tooling:** +- `make all` now builds the currently implemented `jetmon2` and + `veriflier2` binaries without requiring `protoc`; generated Veriflier + gRPC stubs remain an explicit `make generate` step for the future + transport swap. +- Makefile targets now share a configurable `GO` command and fall back to + `/usr/local/go/bin/go` when `go` is not on `PATH`; they also use an + overrideable `/tmp` Go build cache so checks do not depend on a + writable home-directory cache. +- Developer docs now point at the Makefile build path and document why + code generation is separate from the default build. +- Added a top-level docs index and a post-v2 probe-agent architecture + options document for revisiting the v3 direction after v2 is stable in + production. +- Clarified that the current Veriflier transport is JSON-over-HTTP and + that the public API roadmap is about a future customer-facing contract, + not the already-implemented internal `/api/v1`. + +**Polish:** +- `alerting.Update` now validates `label` (must be non-empty) and + `max_per_hour` (must be ≥ 0) at input time, surfacing 422 + `invalid_alert_contact` instead of letting an empty label silently + persist or a negative `max_per_hour` surface as a generic 500 from + MySQL's `INT UNSIGNED` constraint. Validations that don't depend on + the existing row run before the DB lookup so obviously bad PATCH + bodies don't pay for a round-trip. +- Email transport strips CR and LF from MIME header values + (`From` / `To` / `Subject`) as defense-in-depth against header + injection via untrusted strings (`monitor_url` is operator-controlled + but the column doesn't enforce CRLF-free). Body content with newlines + is unaffected. +- `POST /api/v1/alert-contacts/{id}/test` now honors `Idempotency-Key` + like the other write POSTs, so a retried "click to test" during a + network blip doesn't double-page the destination. +- API list-site rollup of the worst open event no longer relies on + `ROW_NUMBER()` window functions, so the query is compatible with + MySQL 5.7. Pagination caps the IN list and a site rarely has more + than one open event, so reducing in Go is cheap. +- API key cutoffs (`revoked_at` and `expires_at`) now share half-open + semantics: a key is valid for times strictly before the cutoff and + rejected at or after it. Future `revoked_at` continues to act as a + rotation grace window. See API.md. +- `LEGACY_STATUS_PROJECTION_ENABLE` is announced at startup + (`config: legacy_status_projection=enabled|disabled`) and surfaced by + `./jetmon2 validate-config`, so operators can confirm projection + state without reading the running config file. + ### Jetmon 2 — initial Go rewrite Complete rewrite of the Node.js + C++ uptime monitor as a single static Go binary. 
@@ -22,7 +152,9 @@ Drop-in replacement for Jetmon 1; all existing MySQL schema columns are preserve - `jetmon2 audit` — query per-site audit log from CLI - Operator dashboard on configurable port with SSE state stream - pprof debug server on localhost-only `DEBUG_PORT` (default 6060) -- `DB_UPDATES_ENABLE` double-gate: requires both config flag and `JETMON_UNSAFE_DB_UPDATES=1` env var +- `LEGACY_STATUS_PROJECTION_ENABLE` controls v1 `site_status` / + `last_status_change` compatibility writes; `DB_UPDATES_ENABLE` remains + as a deprecated alias - Graceful shutdown with 30-second hard-exit backstop - Non-root Docker images (`jetmon` / `veriflier` system users) - Healthcheck-gated MySQL dependency in docker-compose diff --git a/EVENTS.md b/EVENTS.md index 9bedcedf..0033f52e 100644 --- a/EVENTS.md +++ b/EVENTS.md @@ -6,25 +6,64 @@ This document describes the event-sourced architecture that underlies site state Early designs used a mutable `state` column on the site row as the primary record of truth. That approach loses history, makes retries ambiguous, and couples severity changes to state changes in ways that don't reflect reality (a worsening degradation isn't a new outage). Moving to an event log fixes this: -- Full history is preserved for free. +- Full history is preserved across both event boundaries (open/close) and intra-event mutations (severity bumps, state transitions, cause links). - Severity can evolve within a single event without inventing artificial state transitions. - Retries and duplicate probe results become idempotent rather than destructive. - Derived/denormalized fields on the site row can be rebuilt from the log if they ever drift. -## The event +## The two-table split -An event represents a condition affecting a site over a time range. +The model splits the event into two tables: -| Field | Type | Notes | -|----------------------|-----------------|------------------------------------------------------------| -| `id` | identifier | Idempotent — see "Identity" below. | -| `site_id` | FK | The site this event is about. | -| `start_timestamp` | timestamp | When the condition began. | -| `end_timestamp` | timestamp, null | When the condition resolved. Null while active. | -| `severity` | numeric | Ordered, suitable for thresholds and escalation. | -| `state` | enum/string | Human-readable lifecycle label. | -| `resolution_reason` | enum, null | Why the event ended. Null while active. | -| `probe_type` | enum | Which probe observed this (HTTP, DNS, TCP, etc.). | +- **`jetmon_events`** — one row per incident, holding the *current* (or final) severity, state, and metadata. Mutable while the incident is open; frozen on close. +- **`jetmon_event_transitions`** — append-only history of every mutation made to a `jetmon_events` row. One row per change, never updated, never deleted. + +The events row is the authoritative current-state projection. The transitions table is the full audit trail of how it got there. Together they give you: + +- Cheap "what's the current state of incident X" reads (single row in `jetmon_events`). +- Complete "how did incident X evolve over time" reads (`SELECT * FROM jetmon_event_transitions WHERE event_id = ? ORDER BY changed_at`). +- Independent retention policies — incidents can be pruned aggressively for the live table while transitions are kept long enough for SLA reports. + +**Operational logging stays in `jetmon_audit_log`.** That table records what the *monitor* did (WPCOM retries, verifier RPCs, config reloads, alert suppressions). 
Site-state changes do not flow through it — those go to the events tables. See "Relationship to `jetmon_audit_log`" below. + +## The event row + +`jetmon_events` represents a condition affecting a site over a time range. There is at most one *open* row per `(blog_id, endpoint_id, check_type, discriminator)` tuple at any given time (see "Identity and idempotency"). + +| Field | Type | Notes | +|----------------------|------------------|--------------------------------------------------------------------------| +| `id` | BIGINT UNSIGNED | Primary key. | +| `blog_id` | BIGINT UNSIGNED | The site this event is about. (`site_id` in TAXONOMY.md terms.) | +| `endpoint_id` | BIGINT UNSIGNED, null | The endpoint, when applicable. Null for site-level events. | +| `check_type` | VARCHAR(64) | Which probe observed this — `http`, `dns`, `tls_expiry`, etc. | +| `discriminator` | VARCHAR(128), null | Optional tiebreaker for tuples that can have multiple concurrent failures (e.g. multiple keyword checks on the same endpoint). | +| `severity` | TINYINT UNSIGNED | Ordered, suitable for thresholds and escalation. | +| `state` | VARCHAR(32) | Human-readable lifecycle label. | +| `started_at` | TIMESTAMP(3) | When the condition began. Frozen across severity/state changes. | +| `ended_at` | TIMESTAMP(3), null | When the condition resolved. Null while active. | +| `resolution_reason` | VARCHAR(64), null | Why the event ended. Null while active. | +| `cause_event_id` | BIGINT UNSIGNED, null | Causal link to a root-cause event (separate from rollup). | +| `metadata` | JSON, null | Check-type-specific payload (HTTP code, RTT, days-to-expiry, etc.). | +| `updated_at` | TIMESTAMP(3) | ON UPDATE CURRENT_TIMESTAMP — convenience for the dedup path. | +| `dedup_key` | VARCHAR generated | Stored generated column carrying the identity tuple while the event is open, NULL once closed. Backed by a unique index — see "Identity and idempotency". | + +## The transition row + +`jetmon_event_transitions` is the append-only history. Every mutation to a `jetmon_events` row writes exactly one transition row, in the same database transaction. + +| Field | Type | Notes | +|--------------------|------------------|--------------------------------------------------------------------------------| +| `id` | BIGINT UNSIGNED | Primary key. | +| `event_id` | BIGINT UNSIGNED | The event this transition applies to. | +| `blog_id` | BIGINT UNSIGNED | Denormalized from `jetmon_events.blog_id` — avoids a join for SLA queries. | +| `severity_before` | TINYINT UNSIGNED, null | Severity before the change. Null on `opened`. | +| `severity_after` | TINYINT UNSIGNED, null | Severity after the change. Null on `closed`. | +| `state_before` | VARCHAR(32), null | State before the change. Null on `opened`. | +| `state_after` | VARCHAR(32), null | State after the change. Null on `closed` (or set to `Resolved`). | +| `reason` | VARCHAR(64) | Why the transition occurred. See "Transition reasons" below. | +| `source` | VARCHAR(255) | Who caused it: `local`, `veriflier:us-west`, `operator:user@host`, `system:timeout`. | +| `metadata` | JSON, null | Transition-specific context (HTTP code on escalation, cause id on link, etc.). | +| `changed_at` | TIMESTAMP(3) | Millisecond precision; SLA report ordering needs sub-second tiebreakers. | ### Severity vs. 
state @@ -36,7 +75,26 @@ Keeping these separate avoids conflating "this got worse" with "this is a differ ### Identity and idempotency -Event `id` is derived from a stable set of inputs — typically `(site_id, probe_type, start_timestamp_bucket)` or equivalent — so that repeated probe results for the same underlying condition resolve to the same event row. This makes writes idempotent: a retried probe result updates the existing event rather than creating a new one. +Event identity is the tuple `(blog_id, endpoint_id, check_type, discriminator)`. Repeated probe results for the same underlying condition must resolve to the same `jetmon_events` row — a retried result updates the existing row rather than creating a new one. + +MySQL has no partial unique indexes, so the schema enforces "at most one *open* event per tuple" with a generated column trick: + +- `dedup_key` is a `VARCHAR GENERATED ALWAYS AS (... ) STORED` column. +- It evaluates to a `CONCAT_WS` of the tuple while `ended_at IS NULL`, and to `NULL` once the event is closed. +- A `UNIQUE KEY` on `dedup_key` rejects two open rows with the same tuple. Multiple `NULL`s are allowed by MySQL's unique-index semantics, so closed events never conflict. + +The probe runner's insert path collapses to a single statement: + +```sql +INSERT INTO jetmon_events (blog_id, endpoint_id, check_type, discriminator, severity, state, ...) +VALUES (?, ?, ?, ?, ?, ?, ...) +ON DUPLICATE KEY UPDATE + severity = VALUES(severity), + state = VALUES(state), + metadata = VALUES(metadata); +``` + +No `SELECT … FOR UPDATE` dance, no optimistic-concurrency loop. The dedup logic is enforced by the schema and the `eventstore` package wraps it so external callers never touch the table directly. ## Lifecycle @@ -58,29 +116,69 @@ No active event. Probes are succeeding. A probe has failed but the verifier has not yet confirmed. This is a **real state**, not an implementation detail — dashboards show it, alert rules can key off it, and it has its own severity range. -The verifier path has two outcomes: -- **Confirmed** → transition to `Down`. -- **Disagreed** → event ends with `resolution_reason = false_alarm`, site returns to `Up`. +**The event opens on the first local failure**, not when the local retry queue eventually escalates to verifiers. This is non-negotiable: `started_at` must equal "first time we saw something wrong" so incident duration is honest. Subsequent local-retry failures are no-ops on the events table — the schema's idempotent `dedup_key` collapses them into the same row, and the `eventstore` writer skips a transition row when severity and state are unchanged. + +The first failure writes both an event row (`state = Seems Down`, `severity = 3`, `started_at = now`) and an `opened` transition row in one transaction. + +Three outcomes from Seems Down: + +- **Local probe recovers** before reaching verifier escalation → event closes with `resolution_reason = probe_cleared`. No verifier was involved; this is the "transient blip the local retry caught" path. The count of these is itself a useful signal — a baseline rate of probe-cleared closes tells you how noisy your detection is. +- **Verifier confirms** → state changes to `Down` in place, severity bumps to 4; one transition row records `state_before = Seems Down`, `state_after = Down`, `severity_before = 3`, `severity_after = 4`, `reason = verifier_confirmed`. `started_at` does not change. 
+- **Verifier disagrees** → event closes with `resolution_reason = false_alarm`; one transition row records `state_after = Resolved`, `reason = false_alarm`. ### Down -Outage confirmed. Severity may continue to evolve in place as additional probes report. +Outage confirmed. Severity may continue to evolve in place as additional probes report. **Each severity bump writes a transition row** (`severity_before`, `severity_after`, `reason = severity_escalation` or `severity_deescalation`). The `jetmon_events` row stores only the latest severity; the history lives in `jetmon_event_transitions`. + +Recovery from Down — the next successful local probe — closes the event with `resolution_reason = verifier_cleared`. (V1 of the integration trusts the local probe on the recovery path; a future "verifier-on-recovery" check would distinguish probe-cleared from verifier-cleared on this path too.) ### Resolved -Condition has cleared. `end_timestamp` is set, `resolution_reason` is recorded. The event row is now historical — it is not deleted or mutated further. +Condition has cleared. `ended_at` is set, `resolution_reason` is recorded, and a transition row with `reason = ` is appended. The event row is now historical — it is not deleted or mutated further. ## The site row projection -For read performance (dashboards, API queries, bulk lists), the current derived state is denormalized onto the site row: +During the v2 migration, `jetpack_monitor_sites` remains the legacy site/config +table and compatibility projection. The authoritative incident state is the +v2 event model: + +- `jetmon_events` stores the current incident row. +- `jetmon_event_transitions` stores every mutation. +- `jetpack_monitor_sites.site_status` and `last_status_change` are derived + compatibility fields for v1 readers. + +While `LEGACY_STATUS_PROJECTION_ENABLE` is true, the legacy projection is updated +in the same transaction as the event write. There is no eventual consistency in +migration mode: event mutation, transition row, and v1 projection commit or roll +back together. -- `current_state` -- `current_severity` -- `active_event_id` (null when Up) +Once all downstream readers have moved to the v2 API/event tables, +`LEGACY_STATUS_PROJECTION_ENABLE` can be set to false. At that point the legacy +status fields stop being maintained and must not be treated as source of truth. -**This projection is updated in the same transaction as the event write.** Always. There is no eventual consistency here — if they drift, we have a bug. +The compatibility projection is rebuildable from `jetmon_events` (current state) +plus `jetmon_event_transitions` (full history). If the projection is ever +suspected to be wrong during migration, rebuild it; don't patch it by hand. -The projection is rebuildable from the event log. If it's ever suspected to be wrong, rebuild it; don't patch it. +## Relationship to `jetmon_audit_log` + +`jetmon_audit_log` is the **operational** log — it records what the monitor did, not what happened to a site: + +- WPCOM notification sends and retries +- Verifier RPC dispatch +- Retry-queue dispatch +- Alert suppression and maintenance-window swallowing decisions +- Config reloads + +Site-state changes do **not** go through the audit log. Those flow through `jetmon_events` (current state) and `jetmon_event_transitions` (history). The audit log links to events through a nullable `event_id` so an operator can pivot from "this WPCOM retry" to "the incident it was for" with one query. 
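+
+For illustration, that pivot might look like the sketch below, a hedged example using `database/sql` that assumes the documented nullable `event_id` column on `jetmon_audit_log` plus an `id` primary key on the audit row; the DSN and surrounding wiring are placeholders:
+
+```go
+// auditpivot.go — illustrative "audit row → incident" pivot. Table and column
+// names follow this document; the audit-log `id` column and DSN are assumptions.
+package main
+
+import (
+	"database/sql"
+	"fmt"
+	"log"
+
+	_ "github.com/go-sql-driver/mysql"
+)
+
+func main() {
+	db, err := sql.Open("mysql", "jetmon:password@tcp(127.0.0.1:3306)/jetmon")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer db.Close()
+
+	const pivot = `
+SELECT e.id, e.blog_id, e.check_type, e.severity, e.state
+FROM jetmon_audit_log a
+JOIN jetmon_events e ON e.id = a.event_id
+WHERE a.id = ?`
+
+	auditRowID := 123456 // the audit entry the operator is looking at
+	var (
+		eventID, blogID  int64
+		checkType, state string
+		severity         int
+	)
+	if err := db.QueryRow(pivot, auditRowID).Scan(&eventID, &blogID, &checkType, &severity, &state); err != nil {
+		log.Fatal(err)
+	}
+	fmt.Printf("audit row %d → incident %d: blog %d, %s, severity %d, state %s\n",
+		auditRowID, eventID, blogID, checkType, severity, state)
+}
+```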
+ +The split exists because the two trails have different consumers and different retention needs: + +| Trail | Consumer | Retention shape | +|-------|----------|-----------------| +| `jetmon_events` + `jetmon_event_transitions` | Public API incident timelines, SLA reports | Long — 30/90 days at full fidelity, then rolled up | +| `jetmon_audit_log` | Operators investigating "why did the alert fire" | Short — aggressive pruning is fine once the incident is closed | +| `jetmon_check_history` | Response-time trending, baseline learning | Medium — granular timing is high volume | ## Causal links @@ -102,27 +200,40 @@ All probe types share a single runner. The runner is responsible for: New probe types plug into this runner. They do not implement their own dedup. -## Resolution reasons +## Transition reasons -Every event close records why. Current reasons: +Every transition row records *why* the change happened. The seeded vocabulary, in approximate order of frequency: -- `verifier_cleared` — verifier confirms the site is back up. -- `false_alarm` — verifier disagreed with the initial failure signal. -- `manual_override` — an operator closed the event. +- `opened` — first transition for a new event. +- `severity_escalation` — severity went up on the same state (e.g. degradation worsening). +- `severity_deescalation` — severity went down on the same state. +- `verifier_confirmed` — Seems Down → Down. +- `verifier_cleared` — site returns to Up after a verifier-confirmed Down; closes the event. +- `probe_cleared` — site returns to Up while still in Seems Down (verifier was never invoked or never confirmed); closes the event. Count of these per site over time is the false-positive rate of local detection. +- `false_alarm` — verifier disagreed with the initial failure signal; closes the event. +- `manual_override` — an operator changed state or closed the event. +- `maintenance_swallowed` — event closed because a maintenance window started. +- `superseded` — closed because a broader event subsumed it. - `auto_timeout` — event aged out per retention/timeout policy. +- `cause_linked` / `cause_unlinked` — `cause_event_id` was set or cleared on an open event. + +The "closed" reasons (`verifier_cleared`, `probe_cleared`, `false_alarm`, `manual_override`, `maintenance_swallowed`, `superseded`, `auto_timeout`) are also written to `jetmon_events.resolution_reason` on close, so the live row carries the immediate "why is this closed" answer without needing a join. -New reasons should be added as explicit enum values, not free-text. +New reasons should be added as explicit enum values in code, not free-text. The column is `VARCHAR(64)` (not MySQL `ENUM`) so adding a value doesn't require a schema migration. ## Open questions - **Retention**: how long do we keep closed events at full fidelity before rolling them up? - **Causal graph consumers**: who reads the causal links and what query shapes do they need? That dictates indexing. -- **Cross-probe severity**: when multiple probe types fire on the same site, does the site-row `current_severity` take the max, a weighted sum, or something else? +- **Cross-probe severity**: when multiple probe types fire on the same site, should the API rollup use max severity, a weighted sum, or something else? ## Invariants worth testing -1. Event write and site-row projection update are atomic. -2. Replaying the same probe result twice produces the same single event. -3. `Seems Down → Up` (false alarm) correctly closes the event with `resolution_reason = false_alarm`. -4. 
Severity updates on a live event do not create a new event row. -5. Closed events are never mutated (except possibly by a backfill/migration, which should be audited). +1. Event write and legacy status projection update are atomic while `LEGACY_STATUS_PROJECTION_ENABLE` is true. +2. **Every** mutation of a `jetmon_events` row writes exactly one row into `jetmon_event_transitions` in the same transaction. Open, severity change, state change, cause-link change, close — no carve-outs. +3. Replaying the same probe result twice produces the same single event and a single `opened` transition row (idempotent insert path). +4. `Seems Down → Up` (false alarm) correctly closes the event with `resolution_reason = false_alarm` and writes a transition row with `reason = false_alarm`. +5. Severity updates on a live event do not create a new event row, but **do** create a transition row. +6. Closed events are never mutated (except possibly by a backfill/migration, which should be audited). +7. After closing an event for tuple T, a new failure for tuple T can immediately open a new event without conflicting on `dedup_key`. +8. Replaying every transition row for an event in `changed_at` order reconstructs the event's current `severity` and `state`. diff --git a/Makefile b/Makefile index f1e97210..872bb167 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,28 @@ BINARY := bin/jetmon2 +DELIVERER := bin/jetmon-deliverer VERIFLIER := bin/veriflier2 +GO ?= $(shell if command -v go >/dev/null 2>&1; then command -v go; elif [ -x /usr/local/go/bin/go ]; then printf /usr/local/go/bin/go; else printf go; fi) +GOCACHE ?= /tmp/jetmon-go-cache +GO_ENV := GOCACHE=$(GOCACHE) BUILD_FLAGS := -ldflags "-X main.version=$(shell git describe --tags --always --dirty) \ -X main.buildDate=$(shell date -u +%Y-%m-%dT%H:%M:%SZ) \ - -X main.goVersion=$(shell go version | awk '{print $$3}')" + -X main.goVersion=$(shell $(GO) version | awk '{print $$3}')" -.PHONY: all build build-veriflier generate test test-race lint clean +.PHONY: all build build-deliverer build-veriflier generate test test-race lint clean -all: generate build build-veriflier +all: build build-deliverer build-veriflier build: mkdir -p bin - CGO_ENABLED=0 go build $(BUILD_FLAGS) -o $(BINARY) ./cmd/jetmon2/ + $(GO_ENV) CGO_ENABLED=0 $(GO) build $(BUILD_FLAGS) -o $(BINARY) ./cmd/jetmon2/ + +build-deliverer: + mkdir -p bin + $(GO_ENV) CGO_ENABLED=0 $(GO) build $(BUILD_FLAGS) -o $(DELIVERER) ./cmd/jetmon-deliverer/ build-veriflier: mkdir -p bin - CGO_ENABLED=0 go build $(BUILD_FLAGS) -o $(VERIFLIER) ./veriflier2/cmd/ + $(GO_ENV) CGO_ENABLED=0 $(GO) build $(BUILD_FLAGS) -o $(VERIFLIER) ./veriflier2/cmd/ generate: @@ -23,13 +31,13 @@ generate: proto/veriflier.proto test: - go test ./... + $(GO_ENV) $(GO) test ./... test-race: - go test -race ./... + $(GO_ENV) $(GO) test -race ./... lint: - go vet ./... + $(GO_ENV) $(GO) vet ./... clean: - rm -f $(BINARY) $(VERIFLIER) + rm -f $(BINARY) $(DELIVERER) $(VERIFLIER) diff --git a/PROJECT.md b/PROJECT.md index dd983978..f83c14c2 100644 --- a/PROJECT.md +++ b/PROJECT.md @@ -17,7 +17,7 @@ The current architecture uses forked Node.js processes (8–16MB RSS each at sta - **Built-in profiling** via `pprof`, race detector via `go test -race`, and a mature testing ecosystem - **Graceful goroutine lifecycle management** replaces the fragile worker spawn/recycle/evaporate lifecycle -The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency with a lightweight Go HTTP service. 
The protocol between Monitor and Verifliers moves from custom HTTPS to gRPC, providing type-safe contracts, built-in retries, and bidirectional streaming for future use. +The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency with a lightweight Go HTTP service. The v2 production Monitor-to-Veriflier transport is JSON-over-HTTP on the configured Veriflier port. The proto contract is kept in `proto/` as a schema reference for a possible future transport, not as the v2 deployment path. --- @@ -25,11 +25,11 @@ The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency with a ``` ┌──────────────────────────────────────────────────────┐ -│ jetmon2 (single binary) │ +│ jetmon2 │ │ │ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ │ -│ │ Orchestrator│ │ Check Pool │ │ gRPC Server │ │ -│ │ goroutine │ │ (goroutines)│ │ (Veriflier) │ │ +│ │ Orchestrator│ │ Check Pool │ │ Veriflier │ │ +│ │ goroutine │ │ (goroutines)│ │ transport │ │ │ └──────┬──────┘ └──────┬──────┘ └──────┬───────┘ │ │ │ │ │ │ │ ┌──────┴────────────────┴────────────────┴───────┐ │ @@ -43,7 +43,7 @@ The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency with a (all unchanged) ``` -The monolithic process replaces the master/worker/SSL-cluster process tree. Concurrency is managed through Go channels and a bounded goroutine worker pool. The orchestrator goroutine owns DB access and WPCOM notifications. The check pool goroutines own HTTP connections. The gRPC server goroutines receive Veriflier results. All three communicate via typed channels with no shared mutable state. +The monitor process replaces the master/worker/SSL-cluster process tree. Concurrency is managed through Go channels and a bounded goroutine worker pool. The orchestrator goroutine owns DB access and WPCOM notifications. The check pool goroutines own HTTP connections. The Veriflier client/server code handles remote confirmation batches over JSON-over-HTTP and is isolated behind `internal/veriflier/`. Outbound webhook and alert-contact delivery can run embedded in one API-enabled `jetmon2` process today, or through the standalone `jetmon-deliverer` entry point as that responsibility moves toward its own deployable process. --- @@ -77,7 +77,7 @@ Go's `time.Ticker` fires with OS-level timer precision. RTT measurements from `n Current deployment requires `npm install`, a `node-gyp` rebuild of the native C++ addon (which must match the installed Node.js version), and a coordinated process restart. A failed addon compilation blocks deployment entirely. -Jetmon 2 deploys as a single static binary with no runtime dependencies. Deployment is: copy binary, `systemctl restart jetmon2`. Total deployment time drops from several minutes to under 30 seconds. There is no compilation step on the target host and no dependency on a matching Node.js version. +Jetmon 2 deploys as static Go binaries with no runtime language dependencies. The conservative v2 monitor deployment is: copy `jetmon2`, run migrations, and `systemctl restart jetmon2`. Total deployment time drops from several minutes to under 30 seconds. There is no compilation step on the target host and no dependency on a matching Node.js version. ### Mean Time to Recovery @@ -155,10 +155,10 @@ Add a `redirect_policy` column to `jetpack_monitor_sites` with three options: `f ## Tooling and Developer Experience **Docker Compose Environment** -The existing Docker Compose setup is updated for the Go binary. 
A single `docker compose up` starts MySQL, the Jetmon 2 binary, one or more Veriflier instances, the simulated site server, StatsD + Graphite, and the operator dashboard. No npm, no node-gyp, no manual build steps. `docker compose up --build` rebuilds the Go binary in a reproducible multi-stage Docker build. +The existing Docker Compose setup is updated for the Go binary. A single `docker compose up` starts MySQL, the Jetmon 2 binary, one or more Veriflier instances, Mailpit for local email capture, StatsD + Graphite, and the operator dashboard. No npm, no node-gyp, no manual build steps. `docker compose up --build` rebuilds the Go binary in a reproducible multi-stage Docker build. A simulated site server remains a planned addition for deterministic local failure scenarios. -**Simulated Site Server** -A dedicated HTTP service included in the Docker Compose environment that simulates configurable site states without requiring real external sites: +**Planned Simulated Site Server** +A dedicated HTTP service should be added to the Docker Compose environment to simulate configurable site states without requiring real external sites: - Static response codes (200, 404, 500, 503) - Configurable response delay (simulates slow sites and timeouts) @@ -168,7 +168,7 @@ A dedicated HTTP service included in the Docker Compose environment that simulat - Redirect chains (tests the redirect-following logic) - Abrupt TCP close (tests connection reset handling) -States are toggled via a simple HTTP API so integration tests can script site behaviour programmatically. +States should be toggled via a simple HTTP API so integration tests can script site behaviour programmatically. **Structured Logging** All log output is available in two formats: the existing plain-text line format (for drop-in compatibility with current log consumers) and an optional structured JSON format enabled via `config.json`. The JSON format emits the same fields — level, timestamp, message, blog_id, http_code, error_code, RTT — as a machine-readable object, making log ingestion into Elasticsearch, Loki, or any log aggregation platform straightforward without a custom parser. Both formats write to the same log file paths. 
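As an illustration of the structured format, here is a minimal sketch of emitting one check result with Go's standard `log/slog` JSON handler. The field names mirror the list above, but the exact keys and the logger wiring used by the real logging code are assumptions for illustration, not the project's implementation:

```go
// Hypothetical sketch: one check result emitted as a JSON log object.
// Field names follow the documented list; real wiring may differ.
package main

import (
	"log/slog"
	"os"
)

func main() {
	logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))

	logger.Info("check complete",
		slog.Int64("blog_id", 12345),
		slog.Int("http_code", 200),
		slog.Int("error_code", 0),
		slog.Int64("rtt_ms", 187),
	)
}
```

The JSON handler already emits level, timestamp, and message keys, so the plain-text and JSON formats can carry the same information without a custom parser on the ingestion side.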
@@ -180,7 +180,7 @@ Given a site `blog_id` and a time range, the replay tool reconstructs the full d End-to-end integration tests that run against the Docker Compose environment: - Unit tests for the check logic (status classification, retry transitions, COMPARE mode comparison) -- Integration tests that insert sites into the test database, configure the simulated site server to return specific states, and assert that the correct WPCOM notification is sent within a defined time window +- Integration tests that insert sites into the test database, configure deterministic local test endpoints to return specific states, and assert that the correct WPCOM notification is sent within a defined time window - Timeout and TLS failure scenarios - Maintenance window suppression - SSL expiry detection @@ -190,7 +190,7 @@ End-to-end integration tests that run against the Docker Compose environment: - MySQL-coordinated bucket claiming: two hosts starting simultaneously claim non-overlapping ranges - MySQL-coordinated bucket failover: a host's heartbeat is artificially expired and surviving hosts absorb its buckets within one grace period - Alert cooldown suppression: a flapping site does not fire repeated alerts within the cooldown window -- Redirect policy: `follow`, `alert`, and `fail` modes behave correctly against the simulated site server +- Redirect policy: `follow`, `alert`, and `fail` modes behave correctly against deterministic local test endpoints All tests run with `go test ./...` and are included in CI. @@ -198,10 +198,13 @@ All tests run with `go test ./...` and are included in CI. A standalone binary (`jetmon2 validate-config`) that: - Parses `config.json` and checks all required keys are present -- Validates value ranges (e.g., `PEER_OFFLINE_LIMIT` must be <= number of configured Verifliers) -- Attempts a test connection to MySQL and verifies the expected tables exist -- Attempts a test connection to each configured Veriflier -- Verifies the WPCOM API certificate is valid and not near expiry +- Validates value ranges and required per-mode settings +- Attempts a test connection to MySQL +- Reports legacy projection and email transport modes +- Prints the matching rollout preflight and projection-drift investigation + commands for the configured bucket ownership mode +- Warns when the email transport resolves to the log-only `stub` sender +- Lists configured Verifliers as best-effort operator context - Outputs a pass/fail summary with specific error messages Intended to run as a pre-deployment check in CI and as an operator tool when diagnosing connectivity issues. @@ -209,39 +212,42 @@ Intended to run as a pre-deployment check in CI and as an operator tool when dia **Operator Dashboard** A lightweight web UI served by the binary itself (no separate process) on a configurable internal port. 
Displays in real time: -- Worker goroutine count, active checks, idle goroutines -- Per-worker memory allocation and GC pressure +- Worker goroutine count and active checks - Check queue depth and drain rate -- Sites per second (current and 5-minute rolling average) -- Round completion time and time to next round +- Sites per second +- Round completion time - Local retry queue depth -- Veriflier queue depth and per-Veriflier response times -- DB connection pool utilisation -- WPCOM API success/failure rate (last 100 calls) -- Top 20 slowest sites by RTT (rolling 5-minute window) -- Top 20 most frequently down sites (rolling 24-hour window) +- Owned bucket range +- Bucket ownership mode, legacy projection mode, delivery-worker ownership, and + rollout preflight / projection-drift commands +- RSS memory usage +- WPCOM circuit-breaker state and queued notification depth +- Live dependency health for MySQL, configured Verifliers, WPCOM, StatsD, and + log/stats directory writes -Updates via server-sent events — no WebSocket library needed, no JavaScript framework. A plain HTML page with `` is sufficient and has no build toolchain dependency. +Updates via server-sent events and lightweight JSON polling — no WebSocket library needed, no JavaScript framework. A plain HTML page with `` and `fetch` is sufficient and has no build toolchain dependency. **System Health Map** -A separate view on the operator dashboard that shows all external dependencies as a live status grid: +The operator dashboard health grid publishes: -- MySQL (primary + replicas): connection state, query latency, last successful batch -- Each configured Veriflier: reachability, last response time, last batch sent/received -- WPCOM API: last successful notification, current error rate -- StatsD: last successful flush -- Disk (log and stats files): free space, last write time +- MySQL: connection state and ping latency +- Each configured Veriflier: reachability and status latency +- WPCOM API: circuit-breaker state and queued notification depth +- StatsD: local client initialization state +- Disk: writable `logs/` and `stats/` directories -Each cell is green/amber/red with a hover tooltip showing the last error message if applicable. Intended to give an operator an instant answer to "is everything healthy?" without reading logs. +Future refinements can add primary/replica breakdowns, last successful +orchestrator batch, WPCOM request error-rate windows, and disk free-space +thresholds once production operating data shows which signals are worth paging +on. **False Positive Tracker** Every time the system escalates a site to Veriflier confirmation and the Verifliers do NOT confirm it as down (i.e., the queue entry times out or all Verifliers report the site as up), the event is recorded in a `jetmon_false_positives` table with timestamp, site, HTTP code, error code, and RTT from the local check. A view in the operator dashboard surfaces sites with high false positive rates, helping operators tune per-site `NUM_OF_CHECKS` or `TIME_BETWEEN_CHECKS_SEC` settings. 
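For orientation, a minimal sketch of how a false-positive record of this shape could be written with `database/sql`. The column names below are illustrative assumptions, not the actual `jetmon_false_positives` schema:

```go
package tracker

import (
	"context"
	"database/sql"
)

// recordFalsePositive writes one Veriflier non-confirmation row.
// Column names are assumptions for illustration only.
func recordFalsePositive(ctx context.Context, db *sql.DB,
	blogID int64, httpCode, errorCode int, rttMS int64) error {

	_, err := db.ExecContext(ctx,
		`INSERT INTO jetmon_false_positives
		   (blog_id, http_code, error_code, rtt_ms, created_at)
		 VALUES (?, ?, ?, ?, NOW())`,
		blogID, httpCode, errorCode, rttMS)
	return err
}
```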
**Internal Audit Log** -Every state-relevant event for every site is written to a `jetmon_audit_log` table: +Operational activity for every site is written to a `jetmon_audit_log` table: - Check performed: timestamp, source (local/veriflier name), result (HTTP code, error code, RTT) -- Status transition: old status, new status, reason - WPCOM notification sent: timestamp, payload hash, response code - WPCOM notification retry: timestamp, reason - Local retry dispatched: timestamp, retry count @@ -250,12 +256,17 @@ Every state-relevant event for every site is written to a `jetmon_audit_log` tab - Maintenance window active: timestamp, window end - Config change: timestamp, which keys changed +Authoritative incident state transitions live in `jetmon_event_transitions`, written by the `eventstore` package in the same transaction as the matching `jetmon_events` mutation. The audit log is intentionally operational context, not the source of truth for site state. + Queryable by `blog_id` and time range via a CLI tool (`jetmon2 audit --blog-id 12345 --since 2h`) and via the operator dashboard. Designed specifically for Happiness Engineers investigating customer-reported alert issues. **Deployment Tooling** - `jetmon2 version` — prints binary version, build date, Go version, and git commit hash - `jetmon2 migrate` — applies pending DB schema migrations idempotently - `jetmon2 status` — connects to a running instance's internal API and prints a one-line health summary (equivalent to reading `stats/totals` but richer) +- `jetmon2 rollout pinned-check` — validates a pinned v1-to-v2 cutover host before or during host replacement +- `jetmon2 rollout dynamic-check` — validates full `jetmon_hosts` coverage after the fleet transitions from pinned to dynamic ownership +- `jetmon2 rollout projection-drift` — lists active sites whose legacy `site_status` projection disagrees with the authoritative event state - `jetmon2 drain --worker N` — gracefully removes one worker pool slot, waiting for in-flight checks to complete before reducing concurrency - `jetmon2 reload` — sends SIGHUP to the running process (convenience wrapper) @@ -275,7 +286,7 @@ The worker pool monitors queue depth against a configurable high-water mark. Whe The binary ships with a systemd unit file. `Restart=on-failure` with a short `RestartSec` ensures the process is automatically restarted if it crashes or exits unexpectedly. `StartLimitIntervalSec` and `StartLimitBurst` prevent restart loops from hammering a broken dependency. The unit file also enforces resource limits (`MemoryMax`, `LimitNOFILE`) to keep the process within safe bounds on shared hosts. A watchdog integration via `sd_notify` lets systemd detect and restart a process that has stopped making progress without actually crashing. **MySQL-Coordinated Bucket Ownership** -A `jetmon_hosts` table replaces the static `BUCKET_NO_MIN`/`BUCKET_NO_MAX` config values with runtime-negotiated bucket ownership. Hosts claim, hold, and release bucket ranges autonomously using MySQL transactions as the coordination mechanism — no cluster orchestrator required. +A `jetmon_hosts` table replaces the static `BUCKET_NO_MIN`/`BUCKET_NO_MAX` config values with runtime-negotiated bucket ownership. Hosts claim, hold, and release bucket ranges autonomously using MySQL transactions as the coordination mechanism — no cluster orchestrator required. 
For the initial v1-to-v2 production migration, `PINNED_BUCKET_MIN`/`PINNED_BUCKET_MAX` (with `BUCKET_NO_MIN`/`BUCKET_NO_MAX` accepted as aliases) temporarily pins a v2 host to the exact static range of the v1 host it replaces; remove those keys after the fleet is on v2 to enable dynamic ownership. Table structure: ```sql @@ -288,9 +299,9 @@ CREATE TABLE jetmon_hosts ( ); ``` -On startup, the instance upserts its own row, then scans for rows whose `last_heartbeat` is older than the grace period (suggested: 2× normal round time). Expired rows are presumed dead. The instance claims their uncovered bucket ranges by deleting the dead rows and inserting its own covering range inside a `SELECT ... FOR UPDATE` transaction, preventing two hosts from racing to claim the same range simultaneously. The instance derives its active range from what it successfully claimed — `BUCKET_NO_MIN`/`BUCKET_NO_MAX` are no longer needed in `config.json`. +In dynamic ownership mode, on startup the instance upserts its own row, then scans for rows whose `last_heartbeat` is older than the grace period (suggested: 2× normal round time). Expired rows are presumed dead. The instance claims their uncovered bucket ranges by deleting the dead rows and inserting its own covering range inside a `SELECT ... FOR UPDATE` transaction, preventing two hosts from racing to claim the same range simultaneously. The instance derives its active range from what it successfully claimed — `BUCKET_NO_MIN`/`BUCKET_NO_MAX` are only needed as aliases for the temporary pinned migration mode. -Each round, the orchestrator issues a single `UPDATE jetmon_hosts SET last_heartbeat = NOW() WHERE host_id = ?`. If a host stalls, is OOM-killed, or loses network, its heartbeat stops updating. Surviving hosts detect the stale row at the start of their next round and absorb its buckets up to their configured `BUCKET_TARGET` maximum. +In dynamic ownership mode, each round the orchestrator issues a single `UPDATE jetmon_hosts SET last_heartbeat = NOW() WHERE host_id = ?`. If a host stalls, is OOM-killed, or loses network, its heartbeat stops updating. Surviving hosts detect the stale row at the start of their next round and absorb its buckets up to their configured `BUCKET_TARGET` maximum. In pinned migration mode, the host skips `jetmon_hosts` entirely and checks only its configured static range. On SIGINT, the instance sets `status = 'draining'`, completes in-flight checks, then deletes its own row. Surviving hosts can reclaim those buckets at the start of their next round without waiting for heartbeat expiry. A hard-killed host leaves its row in place; the grace period determines how long before its buckets are reclaimed. @@ -322,7 +333,7 @@ Check that a domain resolves to expected IPs on a schedule, using Go's `net.Look Attempt a TCP connection to an arbitrary host:port on a schedule. No HTTP layer — a successful connection is "up". Useful for database ports, SMTP, and custom application services. A small extension of the existing connection logic. **Heartbeat / Cron Monitoring** -New inbound endpoint on the gRPC server (or a separate lightweight HTTPS endpoint) where monitored jobs ping Monitor on completion. If the expected ping doesn't arrive within the configured interval plus grace period, an alert fires. Deep integration with the Jetpack heartbeat for zero-configuration WP-Cron health detection. +New inbound endpoint on the Monitor's HTTP/API surface where monitored jobs ping on completion. 
If the expected ping doesn't arrive within the configured interval plus grace period, an alert fires. Deep integration with the Jetpack heartbeat for zero-configuration WP-Cron health detection. **Response Time Anomaly Detection** Using the granular timing breakdown (DNS/TCP/TLS/TTFB) collected in the rewrite, build a per-site baseline over a rolling window and alert when response time exceeds N standard deviations from baseline — even if the site is technically returning 200. Detects slow-but-not-down conditions that users notice but current monitoring misses. @@ -344,4 +355,3 @@ Within-Jetpack on-call scheduling: route alerts to different contacts at differe **Distributed Tracing** Instrument the full check pipeline with OpenTelemetry spans: DB fetch → work dispatch → HTTP check (with DNS/TCP/TLS sub-spans) → Veriflier request → WPCOM notification. Export to Jaeger or any OTLP-compatible backend. Makes debugging latency anomalies and check delays straightforward without relying on log correlation. - diff --git a/README.md b/README.md index 177cb324..f93fc059 100644 --- a/README.md +++ b/README.md @@ -4,51 +4,95 @@ jetmon2 Overview -------- -Jetmon is a parallel HTTP uptime monitoring service that checks Jetpack websites at scale. Jetmon 2 is a complete rewrite of the original Node.js + C++ service as a single Go binary, delivering a large reduction in memory usage, a significant increase in concurrent checks per host, and a simpler deployment model with no native addon compilation. +Jetmon is the parallel HTTP health monitoring service for Jetpack-connected sites at scale. Jetmon 2 turns it from a binary up/down status flipper into a full event-sourced health platform — the same low-false-positive Veriflier-confirmed detection core, now with a five-layer severity model, an internal REST API, HMAC-signed webhooks, managed alert contacts (email, PagerDuty, Slack, Teams), and a complete operational audit trail. -Jetmon periodically loops over a list of Jetpack sites and performs HTTP checks. When a site appears down, local retries are attempted before geographically distributed Veriflier services are asked to confirm the outage. WPCOM is notified only after confirmation, keeping false positive rates low. +The whole thing ships as a single static Go binary with embedded migrations. No `node_modules`, no native addons, no worker process tree. Every check, retry, Veriflier confirmation, and notification lands in `jetmon_audit_log`; every status transition lands in `jetmon_event_transitions`. An operator can replay any incident, end-to-end, from the database alone. -Jetmon 2 is a drop-in replacement: the MySQL schema, WPCOM notification payload, StatsD metric names, log file format, and config file keys are all backwards-compatible. See `PROJECT.md` for the full feature specification and performance estimates. +The Jetmon 1 detection pipeline is preserved verbatim — periodic check rounds, local retries before escalation, geo-distributed Veriflier confirmation before WPCOM is notified. v2 keeps WPCOM compatibility through a shadow-state migration: the v2 event tables are authoritative, and `jetpack_monitor_sites.site_status` / `last_status_change` continue to be projected transactionally for legacy consumers until they cut over (`LEGACY_STATUS_PROJECTION_ENABLE`). + + +What's new in v2 +---------------- + +v2 keeps the Jetmon 1 detection pipeline (local retries → geo-distributed Veriflier confirmation → notify) and rebuilds everything around it. 
+ +| Capability | Jetmon 1 | Jetmon 2 | +|---|---|---| +| Status model | Binary `up` / `down` (`confirmed_down` for re-detections) | Five-layer severity ladder: `Up < Warning < Degraded < SeemsDown < Down`, paired with separate state vocabulary | +| State storage | Single mutable `site_status` column | Event-sourced — `jetmon_events` (current authoritative state) + append-only `jetmon_event_transitions` (every mutation) | +| Failure classifications | `down` | `server`, `client`, `blocked`, `https`, `intermittent`, `redirect`, `ssl_expiry`, `tls_deprecated`, `keyword_missing`, `success` | +| Notification channels | WPCOM only | WPCOM + HMAC-signed webhooks + managed alert contacts (email, PagerDuty, Slack, Teams) | +| API surface | None | Internal REST API at `/api/v1`: Bearer auth, three coarse scopes, per-key rate limit, Stripe-style idempotency, cursor pagination, full audit logging | +| Per-site config | Bucket + check interval | + custom headers, timeout override, redirect policy, alert cooldown, maintenance windows, keyword content check, SSL-expiry alerts at 30 / 14 / 7 days | +| Operational audit | Basic logging | Full audit trail (`jetmon_audit_log`) over every check, retry, Veriflier dispatch, alert suppression, API call, and config reload | +| Process model | Node master + Node workers + C++ native addon + Qt C++ Veriflier | Go monitor (`jetmon2`) + optional outbound deliverer (`jetmon-deliverer`) + Go Veriflier (`veriflier2`) | +| Worker scaling | Spawn / kill child processes | In-process goroutine pool that auto-scales by queue depth | +| Deployment friction | `npm` + `node-gyp` + Qt | Static binary + `./jetmon2 migrate` + `./jetmon2 validate-config` | +| Multi-host coordination | Manual `bucket_min` / `bucket_max` per host | MySQL-coordinated `jetmon_hosts` table with heartbeat-and-reclaim | +| Observability | StatsD | StatsD + structured logs + audit trail + operator dashboard (SSE) + localhost pprof | +| Hot reload | Restart | `SIGHUP` for config; `SIGINT` for graceful drain | + +A few specifics worth bragging about: + +- **Webhooks with Stripe-style HMAC signatures.** `t=,v1=` over `{ts}.{body}`, per-webhook in-flight cap, retry ladder 1m → 5m → 30m → 1h → 6h before abandon. Frozen-at-fire-time payload contract — consumers see the event as it was when the webhook fired, not as it is now. +- **Idempotent write endpoints.** POSTs accept `Idempotency-Key`; replays return the original response, so a retried "click to test" through a network blip won't double-page the destination. +- **Rotation grace windows on API keys.** `revoked_at` and `expires_at` are half-open cutoffs; setting `revoked_at` in the future keeps the old key valid until consumers deploy the replacement. +- **Migrations embedded in the binary.** `./jetmon2 migrate` walks the schema forward; `./jetmon2 validate-config` checks config + DB connectivity + email transport mode + verifier list before deploy, prints the matching rollout preflight command, and warns loudly when alert-contact email is set to the log-only stub. +- **MySQL 5.7+ compatible.** No window functions, no JSON-path expressions in SELECT — the v2 schema and queries land cleanly on the legacy production database. 
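To make the signature contract concrete, here is a consumer-side verification sketch in Go. It assumes HMAC-SHA256 (the usual Stripe-style `v1` construction) and treats the header name, timestamp tolerance, and secret format as placeholders; the real values belong to the webhook documentation, not this sketch:

```go
package main

import (
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strings"
)

// verifySignature checks a Stripe-style "t=<ts>,v1=<sig>" header value against
// the raw request body, assuming HMAC-SHA256 over "{ts}.{body}". Header name
// and timestamp-tolerance policy are intentionally left out of this sketch.
func verifySignature(header, body, secret string) bool {
	var ts string
	var candidates []string
	for _, part := range strings.Split(header, ",") {
		switch {
		case strings.HasPrefix(part, "t="):
			ts = strings.TrimPrefix(part, "t=")
		case strings.HasPrefix(part, "v1="):
			candidates = append(candidates, strings.TrimPrefix(part, "v1="))
		}
	}
	mac := hmac.New(sha256.New, []byte(secret))
	mac.Write([]byte(ts + "." + body))
	expected := hex.EncodeToString(mac.Sum(nil))
	for _, sig := range candidates {
		if hmac.Equal([]byte(expected), []byte(sig)) {
			return true // any matching v1 value accepts
		}
	}
	return false
}

func main() {
	fmt.Println(verifySignature("t=1700000000,v1=deadbeef",
		`{"event_type":"site.down"}`, "whsec_example"))
}
```

Accepting any matching `v1=` value is also what keeps a future grace-period secret rotation backward compatible for consumers.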
Architecture ------------ ``` -┌──────────────────────────────────────────────────────┐ -│ jetmon2 (single binary) │ -│ │ -│ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ │ -│ │ Orchestrator│ │ Check Pool │ │ gRPC Server │ │ -│ │ goroutine │ │ (goroutines)│ │ (Veriflier) │ │ -│ └──────┬──────┘ └──────┬──────┘ └──────┬───────┘ │ -│ │ │ │ │ -│ ┌──────┴────────────────┴────────────────┴───────┐ │ -│ │ Internal channels │ │ -│ └────────────────────────────────────────────────┘ │ -└────────────┬──────────────────────────┬──────────────┘ - │ │ - MySQL WPCOM API - StatsD (unchanged) - Log files - (all unchanged) +┌──────────────────────────────────────────────────────────────┐ +│ jetmon2 │ +│ │ +│ ┌────────────┐ ┌────────────┐ ┌────────────────────┐ │ +│ │Orchestrator│ │ Check pool │ │ Veriflier │ │ +│ │ goroutine │ │(goroutines)│ │ transport │ │ +│ └─────┬──────┘ └─────┬──────┘ └────────┬───────────┘ │ +│ │ │ │ │ +│ ┌─────┴───────────────┴──────────────────┴────────────┐ │ +│ │ Eventstore + Audit log │ │ +│ └─────┬─────────────────┬──────────────────┬──────────┘ │ +│ │ │ │ │ +│ ┌─────┴──────┐ ┌───────┴────────┐ ┌──────┴──────────┐ │ +│ │ REST API │ │ Webhook worker │ │ Alert-contact │ │ +│ │ /api/v1/ │ │ embedded or │ │ worker embedded │ │ +│ │ │ │ deliverer │ │ or deliverer │ │ +│ └────────────┘ └────────────────┘ └─────────────────┘ │ +└────────┬─────────────────────────────────────────┬───────────┘ + │ │ + MySQL WPCOM · custom webhooks + StatsD · email · PagerDuty + Log files · Slack · Teams ``` The **Orchestrator goroutine** fetches site batches from MySQL, dispatches work to the check pool, manages the local retry queue, coordinates Veriflier confirmation, and sends WPCOM notifications. It owns all database access and all outbound WPCOM calls. The **Check Pool** is a bounded goroutine pool that performs HTTP checks using Go's `net/http` and `net/http/httptrace`. It records DNS, TCP, TLS, and TTFB timings on every check and auto-scales against queue depth without spawning new processes. -The **gRPC Server** receives confirmation results from remote Veriflier instances, replacing the previous custom HTTPS protocol. +The **Veriflier transport** sends confirmation batches to remote Veriflier instances. JSON-over-HTTP on the configured Veriflier port is the v2 production transport; the proto definition in `proto/` is retained only as a schema reference for a possible future transport. + +The **Veriflier** is a standalone Go binary deployed at remote locations. It replaces the Qt C++ Veriflier and uses the same JSON-over-HTTP transport as the Monitor-side client. -The **Veriflier** is a standalone Go binary deployed at remote locations. It replaces the Qt C++ Veriflier, communicating with the Monitor via gRPC. +The v2 platform layer sits below the detection pipeline: -Status change flows: +- **Eventstore** is the sole writer for `jetmon_events` and `jetmon_event_transitions`. Every state change — open, escalate, close, recover, manual override — is an atomic transition with full history. Audit log writes share the same MySQL handle. +- **REST API** exposes the v2 surface at `/api/v1/` (enable with `API_PORT`). Bearer-token auth, three coarse scopes (`read` / `write` / `admin`), per-key token-bucket rate limiting, Stripe-style idempotency keys on POSTs. Every authenticated request lands in `jetmon_audit_log` with the consumer name, status, latency, and request id. +- **Webhook worker** delivers HMAC-signed `event.*` posts to registered consumers. 
Per-webhook in-flight cap, retry ladder 1m → 5m → 30m → 1h → 6h, frozen-at-fire-time payload. +- **Alert-contact worker** delivers Jetmon-rendered notifications through Jetmon-owned transports (email, PagerDuty Events API v2, Slack Block Kit, Teams Adaptive Cards). Per-contact `max_per_hour` rate cap as pager-storm insurance. -| Previous Status | Current Status | Action | -|-----------------|-------------------|---------------------------------------------------| +WPCOM notification flow (preserved from Jetmon 1, used during shadow-state migration): + +| Previous Status | Current Status | Action | +|-----------------|-------------------|-------------------------------------------------------| | UP | DOWN | Local retries → Veriflier confirmation → notify WPCOM | -| DOWN | UP | Notify WPCOM site recovered | -| DOWN | DOWN (confirmed) | Notify WPCOM confirmed down | +| DOWN | UP | Notify WPCOM site recovered | +| DOWN | DOWN (confirmed) | Notify WPCOM confirmed down | + +v2 emits richer events to webhook and alert-contact subscribers (full event lifecycle including escalations and severity transitions) — the WPCOM table above describes only the legacy notification path. Installation @@ -62,7 +106,15 @@ Installation cd docker && cp .env-sample .env -4) Edit `docker/.env` for your local environment +4) Edit `docker/.env` for your local environment. The file is only for local + host-side bind address / `*_HOST_PORT` overrides, credentials, and user ids. + `BIND_ADDR` keeps non-API services local by default; `API_BIND_ADDR` controls + whether the REST API is reachable by other systems. Container-side service + ports are hardcoded in `docker-compose.yml`. + `MYSQL_ROOT_PASSWORD` is used only for local container setup; Jetmon connects + with the non-root `MYSQL_USER` / `MYSQL_PASSWORD` credentials. + New Docker-generated Jetmon configs use `EMAIL_TRANSPORT=smtp` through + Mailpit so alert-contact emails can be inspected locally. 5) Build and start all services: @@ -89,10 +141,15 @@ Key settings: | `BUCKET_TOTAL` | 1000 | Total bucket range across all hosts | | `BUCKET_TARGET` | 500 | Maximum buckets this host should own | | `BUCKET_HEARTBEAT_GRACE_SEC` | 600 | Seconds before a silent host's buckets are reclaimed | +| `PINNED_BUCKET_MIN` / `PINNED_BUCKET_MAX` | unset | Migration-only static bucket range; disables `jetmon_hosts` ownership for v1-compatible host-by-host cutover | | `ALERT_COOLDOWN_MINUTES` | 30 | Default cooldown between repeated alerts per site | +| `LEGACY_STATUS_PROJECTION_ENABLE` | true | Keep `jetpack_monitor_sites.site_status` / `last_status_change` updated for v1 consumers during migration | | `LOG_FORMAT` | `text` | `text` for plain-text logs or `json` for structured logs | | `DASHBOARD_PORT` | 8080 | Internal port for the operator dashboard (0 to disable) | +| `API_PORT` | 0 | Internal REST API port (0 to disable). Also makes webhook and alert-contact delivery workers eligible to run. | +| `DELIVERY_OWNER_HOST` | empty | Optional hostname allowed to run delivery workers when `API_PORT` is enabled; set this on shared production configs so only one API-enabled host dispatches outbound deliveries. | | `DEBUG_PORT` | 6060 | localhost-only pprof port (`127.0.0.1:PORT`); 0 to disable | +| `EMAIL_TRANSPORT` | `stub` | Alert-contact email sender: `stub` (log only), `smtp`, or `wpcom` | See `config/config.readme` for the full option reference. 
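The interaction between `API_PORT` and `DELIVERY_OWNER_HOST` reduces to a small eligibility rule. The sketch below mirrors the documented behaviour; the function and variable names are illustrative, not the actual config package API:

```go
package main

import (
	"fmt"
	"os"
)

// deliveryWorkersEligible mirrors the documented rule: delivery workers may
// run only on API-enabled hosts, and, when DELIVERY_OWNER_HOST is set, only
// on the host whose name matches it. Names here are illustrative.
func deliveryWorkersEligible(apiPort int, deliveryOwnerHost, hostname string) bool {
	if apiPort <= 0 {
		return false // API disabled: no embedded delivery workers
	}
	if deliveryOwnerHost == "" {
		return true // legacy fallback: every API-enabled host delivers
	}
	return deliveryOwnerHost == hostname
}

func main() {
	host, _ := os.Hostname()
	fmt.Println(deliveryWorkersEligible(8081, "jetmon-api-1", host))
}
```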
@@ -116,6 +173,11 @@ To stop: docker compose down +After pulling Docker service or volume changes, clear stale stopped containers +before restarting: + + docker compose down --remove-orphans + Database -------- @@ -148,14 +210,30 @@ New columns added by Jetmon 2 (applied via `jetmon2 migrate`): | `redirect_policy` | ENUM NULL | `follow`, `alert`, or `fail` | | `alert_cooldown_minutes` | SMALLINT NULL | Per-site cooldown override | +Jetmon 2 uses a shadow-v2-state migration model. Incident state is authoritative +in the v2 event tables, while `jetpack_monitor_sites` remains the legacy site +configuration table and compatibility projection during migration. With +`LEGACY_STATUS_PROJECTION_ENABLE: true`, every v2 incident mutation also updates +the v1 `site_status` / `last_status_change` fields in the same transaction. Once +legacy readers have moved to the v2 API/event tables, disable that projection. + New tables added by Jetmon 2: | Table | Purpose | |-------|---------| | `jetmon_hosts` | MySQL-coordinated bucket ownership and heartbeat | -| `jetmon_audit_log` | Full event history per site | +| `jetmon_events` | Authoritative current state of each v2 incident | +| `jetmon_event_transitions` | Append-only history of every mutation to `jetmon_events` | +| `jetmon_audit_log` | Operational trail for checks, retries, WPCOM calls, suppression, API access, and config reloads | | `jetmon_check_history` | RTT and timing samples for trending | | `jetmon_false_positives` | Veriflier non-confirmation events | +| `jetmon_api_keys` | Internal REST API Bearer-token registry | +| `jetmon_webhooks` | Webhook registrations and HMAC signing secrets | +| `jetmon_webhook_deliveries` | Outbound webhook delivery attempts and retry state | +| `jetmon_webhook_dispatch_progress` | Webhook worker high-water marks over event transitions | +| `jetmon_alert_contacts` | Managed alert destinations such as email, PagerDuty, Slack, and Teams | +| `jetmon_alert_deliveries` | Outbound alert-contact delivery attempts and retry state | +| `jetmon_alert_dispatch_progress` | Alert worker high-water marks over event transitions | Apply migrations before starting for the first time: @@ -173,15 +251,28 @@ For Developers ### Building - go build ./cmd/jetmon2/ - go build ./veriflier2/ + make all # Build bin/jetmon2, bin/jetmon-deliverer, and bin/veriflier2 + make build # Build only bin/jetmon2 + make build-deliverer # Build only bin/jetmon-deliverer + make build-veriflier # Build only bin/veriflier2 + +If `go` is not on `PATH`, the Makefile falls back to +`/usr/local/go/bin/go` when present. Override with `make GO=/path/to/go ...` +for other local layouts. Make targets use `GOCACHE=/tmp/jetmon-go-cache` by +default so builds do not depend on a writable home-directory cache; override +with `make GOCACHE=/path/to/cache ...` when needed. + +`make generate` is intentionally separate from `make all`. It requires +`protoc` and the Go protobuf plugins, and is reserved for experimental proto +stub generation; generated stubs are not part of the v2 production transport. ### Running Tests - go test ./... - go test -race ./... + make test + make test-race + make lint -Tests require the Docker Compose environment to be running for integration tests. Unit tests run standalone. +The current `go test ./...` suite runs standalone. Use the Docker Compose environment for manual end-to-end checks against MySQL, StatsD, and Veriflier services. 
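As a shape reference for new unit tests, here is a self-contained table-driven sketch in the style used for check-logic tests. The classifier is defined inline purely for illustration and is not the project's actual classification function:

```go
package checker

import "testing"

// classify is a stand-in defined only for this sketch; the real failure
// classification lives in the checker package and covers more classes.
func classify(httpCode int) string {
	switch {
	case httpCode >= 500:
		return "server"
	case httpCode >= 400:
		return "client"
	default:
		return "success"
	}
}

func TestClassify(t *testing.T) {
	cases := []struct {
		name     string
		httpCode int
		want     string
	}{
		{"ok", 200, "success"},
		{"not found", 404, "client"},
		{"internal error", 500, "server"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := classify(tc.httpCode); got != tc.want {
				t.Fatalf("classify(%d) = %q, want %q", tc.httpCode, got, tc.want)
			}
		})
	}
}
```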
### Docker Development Loop @@ -190,11 +281,18 @@ Tests require the Docker Compose environment to be running for integration tests docker compose logs -f jetmon # Follow logs docker compose exec jetmon bash # Shell into the container +Mailpit captures Docker-local alert-contact emails. Open the web UI at +`http://localhost:8025` by default, or at the `BIND_ADDR` / +`MAILPIT_HOST_PORT` values from `docker/.env`. Jetmon sends SMTP to the +internal `mailpit:1025` address; that SMTP port is not published to the host. +Existing `config/config.json` files are not rewritten automatically, so remove +or update a stale local config if you want it to use Mailpit. + ### Adding Test Sites Connect to the test database: - docker compose exec mysqldb mysql -u root -p123456 jetmon_db + docker compose exec mysqldb mysql -u jetmon -pjetmon_dev_password jetmon_db Insert sites to check: @@ -205,37 +303,46 @@ Insert sites to check: (3, 0, 'https://httpstat.us/500', 1, 1), (4, 0, 'https://httpstat.us/200?sleep=15000', 1, 1); -### Enabling Database Updates +### Legacy Status Projection -Edit `config/config.json`: +During migration, keep the legacy v1 status fields updated: - { "DB_UPDATES_ENABLE": true } + { "LEGACY_STATUS_PROJECTION_ENABLE": true } -Then set the guard environment variable in `docker/.env`: +This does not make the legacy row the source of truth. Jetmon v2 writes +`jetmon_events` and `jetmon_event_transitions` first, then projects +`site_status` and `last_status_change` back to `jetpack_monitor_sites` for +legacy consumers. After all consumers read from the v2 API/event tables, set +`LEGACY_STATUS_PROJECTION_ENABLE` to `false`. - JETMON_UNSAFE_DB_UPDATES=1 +### Simulated Site Server -Both must be set together. The binary refuses to start with `DB_UPDATES_ENABLE: true` unless `JETMON_UNSAFE_DB_UPDATES=1` is also present in the environment. +The Docker Compose environment does not yet include the planned simulated site +server. Use external test endpoints or local ad-hoc services for response-code, +timeout, redirect, keyword, and TLS scenarios until that service is added. -**WARNING:** Never enable in production. +### Config Validation -### Simulated Site Server + ./jetmon2 validate-config -The Docker Compose environment includes a simulated site server. Toggle site states via its HTTP API to test specific scenarios without depending on external services: +Checks all required keys, validates value ranges, tests MySQL connectivity, +reports legacy projection and email transport modes, warns when alert-contact +email uses the log-only `stub` sender, and lists configured Verifliers. +Veriflier reachability is informational here rather than a validation failure. -- Static response codes (200, 404, 500, 503) -- Configurable response delay for timeout testing -- Flapping mode (alternates up/down on a schedule) -- SSL with a self-signed certificate -- Keyword presence and absence for content check testing -- Redirect chains -- Abrupt TCP close +### Tenant Mapping Backfill -### Config Validation +Gateway-routed site reads and writes are scoped through +`jetmon_site_tenants`. Before customer traffic depends on Jetmon-side tenant +enforcement, import the gateway/customer source of truth as CSV: - ./jetmon2 validate-config + ./jetmon2 site-tenants import --file site-tenants.csv --dry-run + ./jetmon2 site-tenants import --file site-tenants.csv --source gateway -Checks all required keys, validates value ranges, tests MySQL connectivity, tests Veriflier connectivity, and verifies the WPCOM API certificate. 
+The CSV format is `tenant_id,blog_id` with an optional header row. The import +upserts mappings and skips duplicate rows in the input; it does not delete +missing mappings, because pruning requires a source-specific reconciliation +policy. ### Debugging @@ -258,14 +365,20 @@ The debug port is configurable via `DEBUG_PORT` (default 6060). Set to 0 to disa | Path | Purpose | |------|---------| | `cmd/jetmon2/` | Binary entry point | +| `cmd/jetmon-deliverer/` | Standalone outbound delivery worker entry point | | `internal/orchestrator/` | Round scheduling, DB fetch, WPCOM notifications | | `internal/checker/` | HTTP check goroutine pool | -| `internal/veriflier/` | JSON-over-HTTP Veriflier transport (proto3 service defined in `proto/`) | +| `internal/veriflier/` | JSON-over-HTTP Veriflier transport | | `internal/db/` | MySQL access, bucket heartbeat | | `internal/config/` | Config loading and hot-reload | | `internal/metrics/` | StatsD client, stats file writer | | `internal/wpcom/` | WPCOM API client and circuit breaker | | `internal/audit/` | Audit log | +| `internal/eventstore/` | Authoritative event and transition writer | +| `internal/api/` | Internal REST API server | +| `internal/deliverer/` | Shared outbound delivery worker wiring | +| `internal/webhooks/` | HMAC-signed webhook registry and delivery worker | +| `internal/alerting/` | Managed alert-contact registry and delivery worker | | `internal/dashboard/` | Operator dashboard and SSE handler | | `veriflier2/` | Go Veriflier binary | @@ -287,13 +400,14 @@ Check that sites are being processed: docker compose exec jetmon cat stats/sitesqueue docker compose exec jetmon ps aux -Check the StatsD dashboard at http://localhost:8088 under: +Check the StatsD dashboard at `http://localhost:8088` by default, or at the +`BIND_ADDR` / `GRAPHITE_HOST_PORT` values from `docker/.env`, under: `Metrics > stats > com > jetpack > jetmon > docker > jetmon` ### Key Test Scenarios **Downtime detection and confirmation:** -Insert a site pointing to `https://httpstat.us/500`. With `DB_UPDATES_ENABLE: true`, Jetmon should detect the failure, retry locally, escalate to the Veriflier, confirm down, and update `site_status` to `2`. +Insert a site pointing to `https://httpstat.us/500`. With `LEGACY_STATUS_PROJECTION_ENABLE: true`, Jetmon should detect the failure, retry locally, escalate to the Veriflier, confirm down, write the v2 event transition, and project `site_status` to `2`. **SSL certificate expiry:** Insert an HTTPS site. After a check round, verify `ssl_expiry_date` is populated in the database. @@ -331,7 +445,19 @@ Simulate a host failure by manually expiring a row in `jetmon_hosts`. Verify the ### Operator Dashboard -The dashboard is available at http://localhost:8080 (configurable via `DASHBOARD_PORT`). It shows goroutine counts, check queue depth, sites per second, Veriflier status, WPCOM API health, slowest sites, and most frequently down sites. +The dashboard is available at http://localhost:8080 (configurable via +`DASHBOARD_PORT`). It shows worker count, active checks, queue depth, retry +queue depth, sites per second, round time, owned buckets, rollout guard state, +RSS, WPCOM circuit-breaker state, and live dependency health for MySQL, +configured Verifliers, WPCOM, StatsD, and log/stats directory writes. + +### Internal API and Delivery Workers + +The internal API is disabled by default. Set `API_PORT` to a non-zero port to enable `/api/v1/...`. 
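A minimal client sketch against the enabled API, assuming a local `API_PORT` of 8081 and a valid Bearer token; the route is the `openapi.json` endpoint named later in this document, while the host, port, token, and timeout are placeholders:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"time"
)

func main() {
	// Placeholder values: adjust host, port, and token for your setup.
	const baseURL = "http://127.0.0.1:8081"
	const token = "example-api-key"

	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest(http.MethodGet, baseURL+"/api/v1/openapi.json", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer "+token)
	// Write endpoints also accept an Idempotency-Key header on POSTs,
	// so a retried request replays the original response.

	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, len(body), "bytes")
}
```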
+ +In the embedded v2 deployment, `API_PORT` also makes the webhook and alert-contact delivery workers eligible to run inside `jetmon2`. Set `DELIVERY_OWNER_HOST` to exactly one hostname per database cluster when you want additional API-enabled hosts to serve API traffic without owning delivery during a staged rollout. If `DELIVERY_OWNER_HOST` is empty, the host keeps the legacy behavior and starts delivery workers whenever `API_PORT` is enabled; startup and `validate-config` warn about that fallback. + +`bin/jetmon-deliverer` is the first standalone process boundary for outbound delivery. It starts the same webhook and alert-contact workers without starting the monitor, API, dashboard, or bucket ownership loop. Delivery rows are claimed transactionally, so multiple active delivery workers do not claim the same pending row; use `DELIVERY_OWNER_HOST` when you want an explicit single-owner rollout during the transition from embedded to standalone delivery. ### Cleanup @@ -352,17 +478,107 @@ Jetmon runs on multiple production hosts managed by the Systems team. Each host 1) Install the `jetmon2` binary to `/opt/jetmon2/` 2) Install `systemd/jetmon2.service` to `/etc/systemd/system/` and run `systemctl daemon-reload` 3) Install `systemd/jetmon2-logrotate` to `/etc/logrotate.d/jetmon2` -4) Create `/opt/jetmon2/config/jetmon2.env` with the database credentials and auth tokens (see `config/db-config-sample.conf` for the required keys) -5) Copy `config/config.json` from an existing host (or generate from `config-sample.json`) -6) Set `BUCKET_TARGET` to the desired maximum bucket count for this host -7) Run `./jetmon2 migrate` to apply any pending schema migrations -8) Start the service: `systemctl enable --now jetmon2` +4) Create `/opt/jetmon2/logs` and `/opt/jetmon2/stats`, owned by the `jetmon` service user +5) Create `/opt/jetmon2/config/jetmon2.env` with the database credentials and auth tokens (see `config/db-config-sample.conf` for the required keys) +6) Copy `config/config.json` from an existing host (or generate from `config-sample.json`) +7) Set `BUCKET_TARGET` to the desired maximum bucket count for this host +8) Run `./jetmon2 migrate` to apply any pending schema migrations +9) Start the service: `systemctl enable --now jetmon2` The new host will claim unclaimed buckets from the pool on first startup. No existing hosts need reconfiguration. -### Rolling Updates (Zero Downtime) +Manual CLI commands such as `migrate`, `validate-config`, and `rollout` need +the same `DB_*` environment that systemd reads from +`/opt/jetmon2/config/jetmon2.env`; systemd's `EnvironmentFile` is not loaded +automatically for commands run directly from a shell. + +### Deploying Standalone Delivery Workers + +Standalone delivery is optional during the initial v2 rollout. Use it when +outbound webhook and alert-contact dispatch should run outside API-enabled +`jetmon2` processes. 
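The transactional row claim mentioned above is what lets embedded and standalone delivery workers coexist during this transition. A sketch of that pattern with `database/sql` follows; the table and column names are illustrative assumptions, not the real delivery schema:

```go
package deliverer

import (
	"context"
	"database/sql"
)

// claimPendingDelivery locks one due pending row and marks it in-flight in a
// single transaction, so two workers never claim the same delivery.
// Column and status names are assumptions for illustration only.
func claimPendingDelivery(ctx context.Context, db *sql.DB, worker string) (int64, error) {
	tx, err := db.BeginTx(ctx, nil)
	if err != nil {
		return 0, err
	}
	defer tx.Rollback()

	var id int64
	err = tx.QueryRowContext(ctx,
		`SELECT id FROM jetmon_webhook_deliveries
		  WHERE status = 'pending' AND next_attempt_at <= NOW()
		  ORDER BY next_attempt_at LIMIT 1 FOR UPDATE`).Scan(&id)
	if err != nil {
		return 0, err // sql.ErrNoRows means nothing is currently due
	}
	if _, err := tx.ExecContext(ctx,
		`UPDATE jetmon_webhook_deliveries
		    SET status = 'in_flight', claimed_by = ?
		  WHERE id = ?`, worker, id); err != nil {
		return 0, err
	}
	return id, tx.Commit()
}
```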
+ +1) Install `bin/jetmon-deliverer` to `/opt/jetmon2/bin/jetmon-deliverer` +2) Install `systemd/jetmon-deliverer.service` to `/etc/systemd/system/` and run `systemctl daemon-reload` +3) Create `/opt/jetmon2/config/deliverer.json` from the same schema as `config/config.json` +4) Set `DELIVERY_OWNER_HOST` in process-specific configs so only the intended process class delivers during cutover +5) Run `JETMON_CONFIG=/opt/jetmon2/config/deliverer.json /opt/jetmon2/bin/jetmon-deliverer validate-config` with the same `DB_*` environment used by the service +6) Start the service: `systemctl enable --now jetmon-deliverer` + +See `docs/jetmon-deliverer-rollout.md` for the full embedded-to-standalone +delivery migration runbook and rollback path. + +### v1 to v2 Pinned Rolling Migration + +For the first production migration from v1, replace one v1 host at a time with +a v2 host pinned to that same inclusive bucket range. This avoids mixed v1/v2 +bucket ownership and gives each host a simple rollback path. + +1) Pre-apply additive migrations during a quiet period: + + ./jetmon2 migrate + +2) On the host being replaced, copy the existing v1 bucket range into v2 config: + + "PINNED_BUCKET_MIN": 0, + "PINNED_BUCKET_MAX": 99, + "LEGACY_STATUS_PROJECTION_ENABLE": true, + "API_PORT": 0 + + The v1 names `BUCKET_NO_MIN` / `BUCKET_NO_MAX` are accepted as aliases, but + `PINNED_BUCKET_*` makes the migration mode explicit. In pinned mode, v2 does + not claim or heartbeat `jetmon_hosts`; it checks only the configured range. + +3) Before stopping v1, run config validation and confirm it prints the pinned + preflight plus projection-drift commands: + + ./jetmon2 validate-config + +4) Before starting the cutover, run the pinned rollout preflight: + + ./jetmon2 rollout pinned-check + + It verifies pinned mode, legacy projection writes, absence of a + `jetmon_hosts` row for the host, active site count for the range, and zero + legacy projection drift. + +5) Stop the v1 process for that range, start v2, and verify checks, + Veriflier confirmations, WPCOM notifications, audit rows, and legacy + `site_status` projection for that bucket range. If the operator dashboard is + enabled, also confirm rollout guard state and dependency health before + moving to the next host. + +6) If rollback is needed, stop v2 and restart the original v1 process with the + same bucket config. Because the v2 migrations are additive and the legacy + projection remains enabled, legacy readers continue to see familiar status + fields. + +7) Repeat for each v1 host. After the whole fleet is on v2 and stable, plan a + coordinated dynamic-ownership cutover, remove `PINNED_BUCKET_*` from the v2 + monitor configs, restart the fleet in the approved window, then run: + + ./jetmon2 rollout dynamic-check + + This verifies fresh, active, gap-free, overlap-free `jetmon_hosts` coverage + before the fleet moves to normal v2 rolling updates. + +If either rollout check reports legacy projection drift, list the mismatched +active site rows before continuing: + + ./jetmon2 rollout projection-drift + +For a specific range: + + ./jetmon2 rollout projection-drift --bucket-min=0 --bucket-max=99 --limit=100 + +See [`docs/v1-to-v2-pinned-rollout.md`](docs/v1-to-v2-pinned-rollout.md) for +the detailed rollout checklist. + +### v2 Rolling Updates (Zero Downtime) -Update one host at a time. Surviving hosts absorb the draining host's buckets during the update window: +After all monitor hosts are already on v2 dynamic bucket ownership, update one +host at a time. 
Surviving hosts absorb the draining host's buckets during the +update window: 1) On the host being updated, drain in-flight checks and release buckets: @@ -394,7 +610,11 @@ The service releases its buckets to the pool before exiting. Surviving hosts rec ./jetmon2 status -Or check the operator dashboard at the configured `DASHBOARD_PORT`. The System Health Map view shows the status of MySQL, each Veriflier, WPCOM API, StatsD, and disk in a single grid. +Or check the operator dashboard at the configured `DASHBOARD_PORT` for +check-pool, throughput, bucket, rollout guard, memory, WPCOM circuit-breaker +state, and live dependency health. The rollout section shows bucket ownership +mode, legacy projection mode, delivery-worker ownership, and the matching +rollout preflight and projection-drift commands for the active config. ### Config Reload Without Restart @@ -428,15 +648,28 @@ Metrics are emitted with prefix `com.jetpack.jetmon.`. The Graphite/Gr - Free and active goroutines - Sites processed per second - Round completion time -- WPCOM API success and error rates +- WPCOM API attempt, delivered, retry, error, and failed rates, including + status-specific splits for `down`, `running`, and `confirmed_down` - Veriflier response times +- Detection flow timing: first failure → Seems Down, first failure → + Veriflier escalation, Seems Down → Down, Seems Down → false alarm, and + Seems Down → probe-cleared recovery +- Detection outcome counters split by local failure class (`server`, `client`, + `blocked`, `https`, `redirect`, `intermittent`) for false-alarm and + confirmed-down rate comparisons +- Veriflier decision counters: escalations, RPC success/error, confirm/disagree + votes, quorum-met confirmations, and false alarms +- Per-Veriflier-host RPC and vote counters under `verifier.host..*` so + region/provider disagreement and latency can be compared during v2 production +- Legacy projection drift: per-bucket count of active sites whose + `site_status` no longer matches the authoritative open HTTP event - Memory usage StatsD is the primary metrics transport. For integration with external systems, expose the Graphite/StatsD data via your existing metrics pipeline. ### Veriflier Health -Verifliers that fail to respond are automatically excluded from confirmation requests. The System Health Map shows each Veriflier's reachability and last response time. If the number of healthy Verifliers drops below `PEER_OFFLINE_LIMIT`, no further downtime confirmations can be issued — monitor Veriflier health closely. +Verifliers that fail to respond are automatically excluded from confirmation requests. If the number of healthy Verifliers drops below `PEER_OFFLINE_LIMIT`, no further downtime confirmations can be issued — monitor Veriflier health closely. Verify Veriflier connectivity manually: diff --git a/ROADMAP.md b/ROADMAP.md index 4c207640..e4ef2c06 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -4,15 +4,138 @@ Deferred features that are intentionally out of scope for the current implementa --- +## Prioritized TODO + +This is the current implementation/refinement queue. Lower-priority items are +not abandoned; they are intentionally sequenced behind the v2 production +migration and the operating data needed to make larger architecture decisions. + +### P0 - v2 production hardening + +- **Keep the v2 deployment target conservative.** Ship and stabilize the + current main-server-plus-Veriflier design before moving toward a v3 + probe-agent architecture. 
The v2 event tables remain authoritative while + `LEGACY_STATUS_PROJECTION_ENABLE` keeps legacy `site_status` / + `last_status_change` consumers working during migration. Use the pinned + bucket rollout path for the first v1-to-v2 production migration, then remove + `PINNED_BUCKET_*` after every host is on v2 and stable. +- **Keep rollout health visible before cutover.** Operators should not have to + infer migration-critical state from logs or config while replacing v1 hosts. + The operator dashboard now shows bucket ownership mode, legacy projection + mode, delivery-worker ownership, rollout preflight commands, and live + dependency health for MySQL, Verifliers, WPCOM, StatsD, and log/stats disk + writes. Keep this visible and verified during rollout rehearsal because it + helps separate customer-site downtime from monitor-side impairment during + cutover. +- **Use delivery ownership as a rollout guard.** + In the single-binary deployment, `API_PORT > 0` also starts webhook and + alert-contact delivery workers. A standalone `jetmon-deliverer` entry point + and transactional `SELECT ... FOR UPDATE` row claims now exist; use + `DELIVERY_OWNER_HOST` as a rollout guard when intentionally keeping delivery + single-owner during migration from embedded to standalone delivery. +- **Run a production rollout rehearsal pass.** Validate that README, + `docs/v1-to-v2-pinned-rollout.md`, config samples, systemd units, + `validate-config`, `rollout pinned-check`, `rollout projection-drift`, and + rollback steps line up exactly before the first production host replacement. +- **Instrument the data needed for the v3 decision.** During v2 production, + measure first-failure-to-`Seems Down`, `Seems Down`-to-`Down`, false alarm + rate by failure class, Veriflier agreement/disagreement by region, Veriflier + latency/timeout rates, mixed-region outcomes, monitor-side `Unknown` cases, + primary-check vs confirmation cost, operator explanation gaps, and WPCOM + notification parity. StatsD now emits the core detection timings, outcome + counters split by local failure class, and per-Veriflier-host RPC/vote + counters, plus legacy WPCOM notification attempt/delivered/retry/error/failed + counters split by status. Durable report queries should wait until v2 has + enough real traffic to prove which questions operators actually need to ask. +- **Watch projection drift as a production bug.** While the legacy projection + is enabled, event mutations, transition rows, and the site-row projection + must remain transactionally consistent. `jetmon2 rollout projection-drift` + lists the exact active sites whose legacy projection disagrees with the + authoritative HTTP event state, so rollout failures are actionable instead of + count-only. +- **Keep roadmap/API documentation drift out of the branch.** `API.md` is the + source for the implemented internal `/api/v1` route surface. This roadmap + should track only the remaining public/customer API work, production + hardening, and deferred architecture choices. + +### P1 - post-v2 platform refinement + +- **Extract `jetmon-deliverer` when delivery scale or blast radius warrants + it.** Move webhook delivery, alert-contact delivery, and eventually WPCOM + notification dispatch behind one outbound-delivery binary. Initial shared + worker wiring, a standalone `jetmon-deliverer` entry point, and + transactional row claims exist. A sample systemd service is available at + `systemd/jetmon-deliverer.service`. 
The rollout policy is captured in + [`docs/jetmon-deliverer-rollout.md`](docs/jetmon-deliverer-rollout.md); + the remaining production cutover work is deployment-system adoption and + host-specific config wiring. +- **Unify webhook and alerting dispatch plumbing after production evidence.** + Keep the packages separate until there are two proven implementations and a + third transport path via WPCOM migration, then factor the shared retry, + claim, dispatch, and circuit-breaker shape behind a transport interface. +- **Migrate WPCOM notifications behind alert contacts/deliverer.** Do this + only after alert contacts have proven stable in production and recipient + parity has been verified. +- **Adopt consumer-specific OpenAPI generator validation when one is chosen.** + The route-driven `GET /api/v1/openapi.json` endpoint now includes + handler-derived request/response component schemas, and `make test` validates + schema refs plus a generated Go client smoke source. If production consumers + standardize on a specific generator, add that exact tool to CI so tool-specific + schema drift breaks before release. +- **Plan encryption-at-rest for outbound credentials before public/customer + secret management.** Plaintext webhook secrets and alert-contact + destination credentials are acceptable for the current internal threat + model, but KMS-style encryption should be planned before exposing + customer-managed secrets more broadly. See + [`docs/outbound-credential-encryption-plan.md`](docs/outbound-credential-encryption-plan.md). + +### P2 - v3 and product-driven extensions + +- **Revisit Candidate 3 after v2 has production data.** The current leading + v3 option is a central scheduler plus regional probe agents. The migration + should start with richer v2 probe metadata, then durable confirmation jobs, + generic probe agents, shadow-mode primary jobs, and gradual cutover. +- **Add regional/per-vantage status only when the support story is ready.** + Regional classifications, per-vantage SLA, and richer `Unknown` handling + depend on probe-agent data and taxonomy work; they should not leak to + customers prematurely. +- **Treat alert/webhook polish as demand-driven.** Grace-period webhook secret + rotation, `site.state_changed` webhooks, alert digest mode, quiet hours, + external acknowledgements, SMS, and OpsGenie are clean additions, but should + wait for customer demand or compliance pressure. +- **Retire the legacy status projection after consumers migrate.** Once + downstream readers use the v2 API/event tables, disable + `LEGACY_STATUS_PROJECTION_ENABLE` and stop treating stale legacy status + values as meaningful. + +--- + +## v3 Probe-Agent Architecture + +**Status:** Parked until v2 has been deployed to production and stabilized. + +The current v2 production target keeps the main-server-plus-Veriflier +confirmation model. After v2 has enough production data, revisit whether Jetmon +should evolve into a central scheduler plus regional probe-agent architecture. + +See [`docs/v3-probe-agent-architecture-options.md`](docs/v3-probe-agent-architecture-options.md) +for the candidate architectures, data to gather during v2, and the current +recommendation. + +--- + ## Public REST API -**Status:** Not started. No existing API surface covers this scope. +**Status:** Not started as a customer-facing surface. The v2 branch has an +internal `/api/v1` behind a gateway (see ADR-0002); this item is about the +public/customer contract and the gateway-facing semantics needed to expose it +safely. 
### What it is -A versioned, authenticated REST API (`/api/v1/`) on competitive parity with established uptime monitoring services (Pingdom, UptimeRobot, Better Uptime, Datadog Synthetics). Users and integrations interact with Jetmon entirely through this API — reading current health state, pulling event history and SLA statistics, managing what gets monitored, configuring alerts, and triggering on-demand checks. +A versioned, authenticated customer-facing REST API on competitive parity with established uptime monitoring services (Pingdom, UptimeRobot, Better Uptime, Datadog Synthetics). Users and integrations interact with Jetmon entirely through this API — reading current health state, pulling event history and SLA statistics, managing what gets monitored, configuring alerts, and triggering on-demand checks. -Currently, Jetmon has no public API. The operator dashboard exposes real-time state via SSE for human consumption. Check configuration requires direct writes to `jetpack_monitor_sites`. Event and audit data requires direct DB queries or use of the `jetmon2 audit` CLI. There is no programmatic interface for users or external tooling to interact with Jetmon. +Currently, Jetmon's API is internal-only: callers are known services, tenant isolation lives at the gateway, errors are intentionally verbose, and ownership checks are coarse. What is missing is a stable public contract with customer-scoped auth, tenant ownership, sanitized error semantics, public rate limits, and payloads safe to expose directly to customer tooling. The capability list below describes the public/customer contract target; many internal equivalents already exist and are documented in `API.md`. ### Why it matters @@ -94,35 +217,474 @@ Programmatic management of where alerts go. Competitors that omit this force use | `GET /api/v1/sites/{blog_id}/alert-contacts` | List which contacts are subscribed to a site | | `PUT /api/v1/sites/{blog_id}/alert-contacts` | Set the alert contact list for a site | -**Alert contact types (v1):** email, webhook (generic HTTP POST with configurable payload template). Later: Slack, PagerDuty, OpsGenie, SMS. +**Alert contact types:** the internal API currently supports email, PagerDuty, Slack, and Teams. Generic customer-owned HTTP POSTs should use the HMAC-signed webhooks API instead of duplicating that surface as an alert-contact transport. Later, direct SMS or OpsGenie can be added if customer demand justifies them. **Webhook contract.** Outbound webhook POSTs carry a standard envelope: `event_type`, `site_id`, `blog_id`, `timestamp`, `event` (the full event object). `event_type` values: `site.seems_down`, `site.down`, `site.recovered`, `site.degraded`, `maintenance.started`, `maintenance.ended`. The payload structure is versioned and must not break existing webhook consumers when new fields are added. -### Design decisions to make before building +### Public API decisions before direct exposure + +The internal API decisions are implemented in `internal/api/` and documented in +`API.md`. A public/customer API is a different contract and needs these +decisions before direct exposure: + +**Tenant and ownership model.** The baseline gateway-to-Jetmon tenant contract +is drafted in [`docs/public-api-gateway-tenant-contract.md`](docs/public-api-gateway-tenant-contract.md): +the gateway remains the first tenant boundary, while Jetmon-side ownership +columns become necessary for defense in depth or any direct public exposure. 
+Direct customer exposure requires every read/write to be tenant-scoped. + +**Auth scopes.** The internal API uses coarse `read` / `write` / `admin` +scopes. Public keys likely need granular scopes such as `sites:read`, +`events:read`, `webhooks:write`, and `alerts:write` so customer integrations can +be least-privilege. + +**Error and metadata redaction.** Internal responses can expose query stages, +DB error classes, verifier names, and operational metadata. Public responses +need sanitized errors and customer-safe event metadata, with detailed context +remaining in server logs and operator-only surfaces. + +**Public rate limits and abuse controls.** Internal limits are service +protection. Public limits need commerce/abuse semantics, likely per tenant plus +per key, with separate controls for expensive operations such as trigger-now. + +**Webhook ownership and signing posture.** Internal HMAC signing is acceptable +today. Public customer-managed webhooks may need per-tenant ownership columns, +public-key/asymmetric signing, or stronger secret storage before direct +exposure. + +**OpenAPI and compatibility policy.** The customer contract needs a generated +OpenAPI 3.1 spec, client-codegen validation, explicit deprecation rules, and +tests that fail when handler behavior drifts from the published schema. + +### Public API work still to do + +- Backfill and reconcile `jetmon_site_tenants` from the gateway/customer source + of truth before customer traffic depends on Jetmon-side site enforcement. + Initial CSV import support exists via `jetmon2 site-tenants import`; remaining + work is agreeing on the gateway export contract and pruning/reconciliation + policy for mappings that disappear from the source of truth. +- Add public-contract integration tests for route-level tenant success and + denial paths across sites, events, stats, trigger-now, webhooks, and alert + contacts. +- Add customer-safe error and metadata redaction paths for every public route. +- Promote the internal route-driven `GET /api/v1/openapi.json` contract into a + public compatibility policy with deprecation rules and consumer-specific + generator validation. +- Add public-contract integration tests for auth, pagination, idempotency, + redaction, and trigger-now abuse controls. +- Revisit response-time/SLA pre-aggregation before exposing high-volume public + reporting queries. +- Document the migration path for consumers that currently use direct MySQL or + bespoke internal integrations. + +--- + +## Deferred from Phase 3 (webhooks) + +These were considered during Phase 3 design and intentionally left out of v1 with clean upgrade paths. + +### `site.state_changed` webhook events + +Phase 3 v1 ships only `event.*` webhooks (one per `jetmon_event_transitions` row). A `site.state_changed` rollup webhook — fires when the site's derived rollup state changes — was punted because: + +- Detecting site-level transitions cleanly without races requires changes to the orchestrator (it currently writes `site_status` but doesn't compute deltas) +- Event-level webhooks already give consumers everything they need to compute site-level rollup themselves +- The schema for site state is downstream of the events tables; we'd be adding a second source of truth for "the site is now Down" + +**When to revisit:** a real consumer asks for site-level rollup webhooks specifically. Likely shape: orchestrator computes a "previous_state → new_state" rollup from active events; a delivery worker translates that into `site.state_changed` deliveries. 
Same retry/filter/signature plumbing as `event.*` webhooks — the only new piece is the orchestrator-side delta computation. + +### Grace-period webhook secret rotation + +Phase 3 v1 ships immediate-revocation only: rotating a webhook secret invalidates the old secret immediately. Brief signature-verification failures during the consumer's deploy window go into the retry queue and resolve once the consumer rolls. + +A future Phase 3.x extension is **grace-period rotation**: server signs with both old and new secrets for a configurable window (24h default), consumer verifies whichever they support, then the old secret expires. This matches Stripe's webhook signing roll model and lets consumers deploy at their own pace. + +**Why this is a clean future addition:** +- Schema extension only: add `previous_secret_hash` and `previous_secret_expires_at` columns to `jetmon_webhooks` +- Header format already supports multiple `v1=` values (Stripe-compatible) +- New endpoint shape: `POST /webhooks/{id}/rotate-secret?grace=24h` +- No migration of existing webhooks needed; immediate-revocation is the default if `?grace` is absent + +**When to revisit:** a customer-managing consumer (not the gateway, not internal alerting) registers webhooks and asks for graceful rotation, or a compliance requirement forces routine secret rotation. + +--- + +## Deferred from Phase 3.x (alert contacts) + +These were considered during Phase 3.x design and intentionally left out of v1. Each has a clean addition path that doesn't disturb the v1 schema or worker shape. + +### Generic outbound webhook as an alert-contact transport + +Phase 3.x ships four managed transports: email, PagerDuty, Slack, Teams. A "generic webhook" alert-contact transport (POST a Jetmon-formatted JSON payload to any URL) was considered and rejected because the webhooks API (Family 4) already covers it — and covers it better, with HMAC signing, configurable filters across more dimensions, and a fully programmable payload shape. + +**The boundary:** alert contacts deliver Jetmon-rendered notifications through Jetmon-owned transports. Webhooks deliver the raw signed event stream for the consumer to render. A customer who wants "POST to my URL when sites change" should register a webhook; we shouldn't ship a duplicate surface that does the same thing worse. + +**When to revisit:** never, unless the boundary itself shifts (e.g. webhooks API gets removed, or alert contacts grows into a fundamentally different abstraction). + +### SMS notifications + +Skipped in v1. WPCOM SMS infrastructure availability is unclear, and a third-party SMS provider integration (Twilio/MessageBird/etc.) is a non-trivial credentialing and billing addition. PagerDuty already offers SMS as a downstream config — the dominant SMS use case is "page me," and that's already covered. + +**When to revisit:** a customer asks specifically for direct SMS without going through PagerDuty, AND a stable SMS sending channel (WPCOM-owned or vendor-procured) is available. + +### OpsGenie transport + +Skipped in v1. Same shape as PagerDuty but a different vendor; PagerDuty covers the dominant slice of customers who want incident-management routing. Adding OpsGenie is mechanical (new transport implementation, ~100 LoC) once a customer asks. + +**When to revisit:** a customer running OpsGenie asks for direct integration. Until then, they can route via webhook to OpsGenie's events API themselves. -**Authentication.** API keys stored in a `jetmon_api_keys` table (hashed, scoped, with optional expiry). 
The `Authorization: Bearer ` pattern from the Veriflier transport is the reference. Scopes: `read` (Capabilities 1–3), `write` (Capabilities 4–5), `admin` (key management). OAuth is overkill for an internal service; API keys are sufficient and match what competitors use for programmatic access. +### Quiet hours / on-call schedules -**Key lifecycle CLI.** `jetmon2 apikey create [--scope read|write|admin] [--expires 90d] [--label "CI deploy script"]`, `jetmon2 apikey revoke `, `jetmon2 apikey list`. Keys are never returned after creation; only the ID and label are stored. +Per-contact "don't page me between 11pm and 7am" or "route to alternate contact during my vacation" was considered and deferred. Reasons: -**Hosting.** API runs within the `jetmon2` binary on a dedicated port (separate from the operator dashboard port). Embedding keeps deployment to one artifact. The operator dashboard's existing HTTP server in `internal/dashboard/` is the starting point — the API mounts alongside it or on a configurable separate port. +- PagerDuty already handles this on its end with full schedule support; customers using PagerDuty don't need it from Jetmon. +- For Slack/email/Teams contacts, channel-level mute or auto-responders work as a workaround. +- Building scheduling into Jetmon is a rabbit hole — timezone handling, recurring patterns, escalation overrides, holiday lists. Each of those is a feature in itself. -**Pagination.** Cursor-based pagination for all list endpoints, using `event_id` or `timestamp` as the cursor. Offset-based pagination is rejected for append-only log tables. `limit` defaults to 100, max 1000. Response includes `next_cursor` when more results exist. +**When to revisit:** strong customer demand specifically for non-PagerDuty contacts AND a clear scope for what "scheduling" means in v1 (probably starts with a single per-contact `quiet_hours: {start, end, tz}` field, not full PagerDuty parity). -**Rate limiting.** Per API key. Default limits: 60 requests/minute for read, 20 requests/minute for write, 5 requests/minute for trigger. Configurable per key in the DB. The `trigger` endpoint has its own bucket separate from read/write to prevent it from being used as a DoS vector against the check pipeline. Rate limit headers (`X-RateLimit-Limit`, `X-RateLimit-Remaining`, `X-RateLimit-Reset`) returned on every response. +### Alert acknowledgements -**Schema versioning.** `/api/v1/`. Breaking changes require a new version prefix. Additive changes (new fields, new endpoints) are backwards-compatible within v1. The version prefix is in the URL, not a header, to make it unambiguous in logs. +"Operator acks an alert from PagerDuty/Slack and Jetmon stops re-paging" was considered and deferred because it's bidirectional — Jetmon would need to receive callbacks from each transport, store ack state, and gate further deliveries against it. That's a significant new surface (inbound webhooks from PagerDuty, Slack interactivity API, etc.) for a feature most customers handle within their incident-management tool. -**Trigger-now semantics.** The trigger endpoint enqueues an immediate check for the endpoint; it does not wait for the result. The response returns a `request_id`. The caller polls `GET /api/v1/sites/{blog_id}/history?request_id=` or waits for the event stream to observe the result. This avoids holding HTTP connections open for the duration of a check. +**When to revisit:** a customer specifically asks for cross-channel ack state (e.g. "I acked in PagerDuty, don't keep posting to Slack"). 
Probably ships as a per-contact `respect_external_ack: bool` flag plus per-transport ack-receiver implementations. -**Relationship to SLA Reporting.** The statistics capability (Capability 3) is a superset of the "Incident History and SLA Reporting" stretch goal from `PROJECT.md`. Building Capability 3 makes that stretch goal a subset of what's already available. +### Alert grouping / digest mode -### What needs to be built +When a regional outage flips 50 sites at once, v1 sends 50 separate notifications per matching contact (modulo the per-hour rate cap, which kicks in but only as a brake, not a grouping mechanism). A real grouping/digest feature — "send one email containing all transitions in the last 5 minutes" — was deferred. + +**Why deferred:** per-event delivery matches webhook semantics, is the simplest semantic to reason about, and is what most monitoring tools start with. Grouping introduces real questions (window size, group boundary criteria, what happens if a transition arrives mid-group) that benefit from real customer feedback. + +**When to revisit:** real users complain about pager noise during regional outages even with `max_per_hour` set. Likely shape: per-contact `digest_window_seconds` field; transitions within the window batch into one notification at window end. + +### Migrate WPCOM notifications behind alert contacts + +Phase 3.x ships alert contacts alongside the existing WPCOM notification flow rather than migrating the WPCOM flow to be a transport behind alert contacts. The two paths coexist; same human can be in both and receive duplicate notifications. + +**Why deferred:** drop-in compatibility with the existing v1 deployment shape is more important than architectural unification. Migrating WPCOM-flow consumers to alert contacts requires: +- Inventorying all current WPCOM notification recipients and their subscription patterns +- Building a `wpcom` transport (or reusing an existing one) that delivers through the same channel +- Migrating the per-recipient subscription data into `jetmon_alert_contacts` +- Verifying nothing regresses for the existing recipients during cutover + +This is a coordinated migration, not a code change — and it's safer to do once alert contacts has proven out in production with real customers. + +**Why this is a clean future addition:** +- The transport interface is already pluggable; adding a `wpcom` transport is the same shape as `email`/`pagerduty`/`slack`/`teams`. +- The orchestrator's existing WPCOM notification call site becomes a simple "delete this code path" once parity is verified. +- The deliverer-binary extraction (see Architectural roadmap below) becomes meaningfully cleaner with WPCOM unified — it's the third transport that justifies the split. + +**When to revisit:** alert contacts has been in production for 1–3 months without major issues, AND the deliverer-binary extraction is being actively planned. The two are the same conversation. + +--- + +## Architectural roadmap + +### Multi-repo / multi-binary split + +Today everything lives in one repo and the `jetmon2` binary contains the orchestrator, the API server, the operator dashboard, and (after Phase 3) the webhook delivery worker. The `veriflier2` binary is already separate but in the same repo. + +This is fine for now but won't scale operationally. 
Different concerns have very different deployment shapes: + +| Concern | Scaling axis | Deployment shape | +|---------|--------------|------------------| +| Orchestrator | bucket count, check rate | stateful (claims buckets in `jetmon_hosts`); horizontal via bucket coordination | +| API server | request rate | stateless; horizontal behind a load balancer | +| Outbound delivery | event volume + slow third parties | stateless; horizontal via row-claim on per-transport delivery tables | +| Operator dashboard | one-off operator sessions | one per ops region | +| Veriflier | geo-distributed vantage points | one per region | + +Putting everything in one binary means scaling the most expensive concern scales the cheap ones with it (CPU and memory headroom that's only used for one purpose). It also concentrates failure modes — a panic in the API server takes down the orchestrator. + +**Plausible split:** +- `jetmon-orchestrator` — round loop, check pool, DB writes +- `jetmon-api` — REST API server, auth, rate limiting (read/write surface) +- `jetmon-deliverer` — all outbound dispatch: webhooks (Phase 3), alert contacts, WPCOM notifications +- `jetmon-dashboard` — operator UI / SSE state stream +- `jetmon-verifier` — standalone HTTP check executor (today: `veriflier2`; rename TBD) + +**Why `jetmon-deliverer` is one binary, not three.** Webhooks, alert contacts, and WPCOM notifications all share the same plumbing: poll `jetmon_event_transitions` (or a similar source), build a frozen-at-fire-time payload, dispatch with a per-destination in-flight cap, retry on failure with exponential backoff, mark abandoned after N attempts. Only the transport differs (HTTPS POST + HMAC for webhooks, transport-specific protocols for PagerDuty/Slack/email/SMS, internal RPC for WPCOM). Splitting them into separate binaries would triple the operational surface (three deploy units, three retry queues, three sets of metrics) for what is fundamentally one job — outbound dispatch — with pluggable transports. Keeping them in one process also means a single circuit-breaker registry across destinations, which is the natural place to enforce shared-resource caps (e.g. "don't open 5,000 outbound connections during a regional outage"). + +What this means concretely: +- The Phase 3 webhook worker (`internal/webhooks/worker.go`) is the seed. Its `dispatchTick` / `deliverTick` shape generalizes — the matching, claiming, retry, and abandon logic is transport-agnostic. +- A future refactor abstracts the transport behind a `Dispatcher` interface (`Send(ctx, dest, payload) (status, error)`), with concrete implementations per channel. +- Per-channel state (webhook subscriptions, alert contacts, WPCOM circuit breaker counters) stays in its own table; the worker loops over each. + +**Revisit point: unify `internal/alerting/` and `internal/webhooks/`.** Phase 3.x ships alert contacts as a separate package (`internal/alerting/`) parallel to webhooks, deliberately *not* extending the webhook worker. The reasoning at the time was: alerting hadn't been built yet, we didn't know what shape it would actually take (fan-out? escalation? digest mode?), and forcing a shared abstraction with one known user (webhooks) and one guessed-at user (alerting) risked an abstraction that fits neither well. Better to build alerting concretely, see where the duplication actually lands, and factor with two real implementations in hand. + +The deliverer-binary extraction is the natural moment to revisit. 
By then we'll have: +- Two concrete dispatch workers in production with known operational profiles. +- A clear picture of what alerting actually grew into vs. what webhooks needed. +- A real third transport on the way (WPCOM migration), which validates the abstraction against three users instead of two. + +At that point, factor a `Dispatcher` interface against the three known shapes — not before. The duplication cost between `internal/webhooks/` and `internal/alerting/` is bounded (~300 lines); the cost of a wrong abstraction is unbounded. + +**Trigger that justifies the split.** A single outbound transport doesn't justify its own binary — webhooks alone could stay co-located with the orchestrator. The argument gets compelling once there are *multiple* transports to dispatch and a shared retry/circuit-breaker substrate to amortize. Adding alert contacts is the moment the abstraction earns its keep; pulling WPCOM notifications out of the orchestrator at the same time is the cleanup that pays off the extraction. + +The MySQL schema is already the implicit bus between these — each service reads/writes specific tables. Splitting would mostly be: +1. Extract each concern into its own `cmd//` directory with a thin main +2. Move shared types into `pkg/` (currently `internal/`) so the binaries can depend on them across repos +3. Decide on repo boundaries (one monorepo with multiple binaries, vs. multiple repos sharing a `pkg/` module) + +**Naming opportunity:** "veriflier" is a long-standing typo of "verifier" that has stuck around through the rewrite. A split is a natural moment to rename. Candidates: `verifier`, `witness`, `probe-worker`, `vantage`. Worth deciding before the split happens, not during. + +**When to revisit:** when a single binary's resource needs (CPU, memory, restart blast radius) starts working against the operational sweet spot for one of the concerns. The deliverer split specifically becomes worthwhile when alert contacts ship — that's the second outbound transport, and a third (WPCOM notifications) follows for free since they already exist as code that wants to live next to the others. + +### Path to a public API + +Today's API is internal-only — every caller is a known service (gateway, alerting workers, dashboard) and tenant isolation lives at the gateway. Several Phase 1–3 design decisions take advantage of that and would have to change if Jetmon ever exposes its API directly to end customers without a gateway in front. + +The decisions affected: + +| Decision | Internal-API form | Public-API form | +|----------|-------------------|-----------------| +| Auth scopes | Three coarse: `read` / `write` / `admin` | Granular per-resource (e.g. 
`sites:read`, `events:read`, `webhooks:write`) so customer keys can be scoped tightly | +| Error semantics | Honest 401/403/404 (no info-leak hiding) | 404-on-unauthorized (don't leak existence of resources owned by other tenants) | +| Error message verbosity | Verbose (DB error class, query stage) for incident response | Sanitized — internal detail belongs in server logs only | +| Webhook ownership | Any `write`-scope token can manage any webhook (`created_by` audit only) | Per-tenant ownership column; reads/writes filtered by owner | +| Webhook signing | HMAC-SHA256 with shared secret per webhook | Asymmetric (Ed25519) becomes more attractive — public key at a well-known URL, no per-customer secret to leak | +| Rate limiting | Per-key bucket sized for service protection | Per-tenant bucket sized for commerce/abuse | +| Idempotency keys | Scoped by `(api_key_id, key)` | Scoped by `(tenant_id, api_key_id, key)` to prevent cross-tenant collisions | +| Site `id` (= `blog_id`) | Numeric, canonical from WPCOM | Probably still numeric, but tenant-scoped on lookup | + +The migrations are individually clean (each is "add a column, filter on it, deprecate the unscoped version") but they touch most of the API surface. A public-API exposure would be a significant project, not a flag flip. + +**When to revisit:** if a stakeholder asks "can a customer integration call Jetmon directly?" — the answer should be "let's design that" rather than "yes, here's the URL." + +The Q9 (webhook ownership) section in API.md captures the most concrete piece of this; the rest is captured here for visibility when the conversation comes up. + +--- -- API key management: `jetmon_api_keys` table, key generation/revocation CLI, request authentication middleware with scope enforcement. -- Alert contacts: `jetmon_alert_contacts` table, `jetmon_site_alert_contacts` join table, outbound webhook dispatcher with retry queue. -- Query handlers: thin layer over existing DB functions in `internal/db/`, with response serialisation and cursor pagination. -- Statistics handlers: uptime/response-time aggregation queries; must be pre-aggregated or cached to avoid slow queries on large history tables. -- Manage handlers: validated writes to endpoint and check tables, triggering orchestrator pickup. -- Trigger handler: enqueue immediate check; return `request_id` for polling. -- Rate limiting middleware: per-key token bucket, separate buckets for read/write/trigger, rate-limit headers. -- Integration tests in the Docker Compose environment covering auth, pagination, state consistency, and webhook delivery. +## Completed + +This section lists major roadmap-level work completed since the v1 baseline, +including both the original `v2` rewrite and later work on this branch. It is +intentionally higher level than a changelog: entries explain what exists now, +where to look, and what each item unlocked. + +### v1-to-v2 Rewrite Foundation + +- **Single Go monitor binary.** Jetmon 2 replaces the Node.js master/worker + process tree and C++ native HTTP checker addon with the Go `jetmon2` binary. + This removes `npm`, `node-gyp`, and native-addon build friction while keeping + the legacy external contracts intact. +- **Go check pool with bounded concurrency.** HTTP checks run through + `internal/checker` using goroutines, `net/http`, and `httptrace` timing + capture instead of the v1 native addon. + The pool records DNS, TCP, TLS, TTFB, and total RTT timings and can adjust + worker count under queue or memory pressure. 
+- **Go orchestrator and retry queue.** The v2 orchestrator owns round
+ scheduling, local retry state, Veriflier escalation, WPCOM notifications, and
+ graceful drain behavior.
+ This preserves the v1 detection flow while making the retry queue and
+ shutdown behavior testable in Go.
+- **Go Veriflier replacement.** `veriflier2` replaces the Qt/C++ Veriflier
+ with a small Go HTTP service and shared check logic.
+ The old custom SSL server dependency is gone, and the transport is easier to
+ test and deploy.
+- **Embedded migrations and schema bootstrap.** `jetmon2 migrate` applies the
+ v2 additive schema and can create the legacy `jetpack_monitor_sites` table in
+ local/dev databases.
+ This makes fresh Docker environments and production schema upgrades use the
+ same migration path.
+- **MySQL bucket coordination.** v2 introduced `jetmon_hosts` ownership and
+ heartbeat logic so hosts can claim, release, and reclaim bucket ranges
+ dynamically.
+ Static v1 bucket ranges remain supported through the pinned rollout mode
+ added later, but dynamic ownership is the v2 steady-state target.
+- **Compatibility-preserving StatsD and stats files.** The Go metrics layer
+ keeps the existing StatsD prefix shape and `stats/` file outputs used by
+ legacy monitoring.
+ This lets operational dashboards survive the rewrite while new metrics are
+ added incrementally.
+- **WPCOM client with circuit breaker.** The v2 WPCOM client preserves the
+ legacy notification payload while adding bounded queueing and circuit-breaker
+ behavior.
+ This protects monitor rounds from prolonged WPCOM API failures.
+- **Operator dashboard and health surface.** v2 added a built-in dashboard for
+ worker state, queues, buckets, memory, WPCOM circuit state, and later rollout
+ and dependency health.
+ It gives operators a first-party view into the monitor without querying the
+ database directly.
+- **Systemd and logrotate packaging.** The v2 branch added production service
+ and logrotate templates for the Go monitor.
+ These files provide the baseline deployment shape for rolling host updates.
+- **Initial Docker Go development environment.** Docker builds now compile the
+ Go monitor and Veriflier, run migrations, and use the new config-rendering
+ entrypoints.
+ Later Docker cleanup refined ports, permissions, Mailpit, healthchecks, and
+ non-root MySQL credentials.
+
+### Core State and Detection
+
+- **Event-sourced incident state.** Jetmon now writes authoritative incident
+ state to `jetmon_events` and append-only lifecycle history to
+ `jetmon_event_transitions`.
+ Useful for: reconstructing incidents, API reads, webhook/alert delivery, and
+ legacy projection drift checks.
+- **Shadow-state migration support.** The legacy `site_status` projection is
+ maintained behind `LEGACY_STATUS_PROJECTION_ENABLE` while v2 event tables
+ remain authoritative.
+ This keeps v1 consumers working during migration without making the legacy
+ column the source of truth.
+- **API state derived from v2 events.** Site API responses use open v2 events
+ to report current health state instead of trusting only the legacy site row.
+ This keeps the API aligned with the eventstore during the shadow migration.
+- **Detection-flow instrumentation.** StatsD now captures first failure to
+ Seems Down, first failure to Veriflier escalation, Seems Down to Down,
+ false-alarm timing, and probe-cleared recovery timing.
+ These metrics are the data set needed to evaluate future v3 probe-agent
+ designs with production evidence.
+- **Outcome metrics split by failure class.** False alarms and confirmed-down + outcomes are split by local failure class such as `server`, `client`, + `blocked`, `https`, `redirect`, and `intermittent`. + This makes it possible to see which failure classes produce useful + confirmations and which produce noisy escalations. +- **Veriflier hardening and observability.** Veriflier request handling now has + stronger validation, safer body limits, clearer config behavior, and + per-host RPC/vote metrics. + The v2 production transport is documented as JSON-over-HTTP, with proto files + retained only as a future schema reference. +- **WPCOM notification parity metrics.** Legacy WPCOM notification attempts, + deliveries, retries, errors, and final failures are counted with + status-specific splits. + This supports production parity checks while WPCOM remains outside the new + deliverer path. + +### API and Gateway Surface + +- **Internal REST API foundation.** The internal `/api/v1` surface now includes + API-key auth, read endpoints, event detail/list endpoints, SLA/stat queries, + and authenticated write endpoints. + This moved Jetmon from DB-only integration toward a service boundary for + dashboards, gateway callers, CI tooling, and delivery workers. +- **Idempotent writes and scope enforcement.** POST-style writes support + idempotency keys, and route-level scope checks are covered through the full + mux. + API key revocation also honors future `revoked_at` timestamps so rotations + can use a grace window. +- **Site management write surface.** The API can create/update/delete/pause/ + resume sites, close events, and trigger an immediate check. + The write handlers preserve the eventstore and legacy-projection invariants + used by the orchestrator. +- **Site scheduling fields in API responses.** API site payloads now expose + operational scheduling/config fields such as check interval, maintenance + window, redirect policy, keyword, SSL expiry, and alert cooldown. + This lets API consumers inspect the settings that affect monitoring behavior. +- **Site soft-delete contract.** The soft-delete behavior is documented so + collaborators know how disabled sites are represented and what API consumers + should expect. + This avoids accidental hard-delete semantics while the legacy table remains + shared infrastructure. +- **Gateway tenant boundary.** The gateway-to-Jetmon tenant contract is + documented, and gateway-routed requests now carry trusted tenant context + through the API middleware. + Non-gateway consumers cannot spoof public-context headers. +- **Tenant ownership enforcement.** Gateway-routed site, event, stats, + trigger-now, webhook, alert-contact, delivery, and manual retry paths are + scoped through `jetmon_site_tenants` or resource `owner_tenant_id`. + This gives defense-in-depth behind the gateway while preserving unscoped + internal-operator behavior. +- **Site tenant import tooling.** `jetmon2 site-tenants import` can load + `tenant_id,blog_id` mappings from CSV, including dry-run validation. + This provides the operator path for backfilling gateway ownership data before + customer traffic depends on Jetmon-side checks. +- **Gateway tenant route tests.** Public-contract tests now cover mapped and + unmapped gateway paths across the key route families, including event lists, + transition lists, and trigger-now. + These tests reduce the risk that future API work bypasses tenant ownership + checks. 
+- **Route-driven OpenAPI contract.** `GET /api/v1/openapi.json` is generated + from the route table with request/response component schemas. + Tests validate schema references and smoke-check generated Go client source + so route/schema drift is caught early. + +### Delivery and Alerting + +- **HMAC webhook delivery.** Webhook CRUD, HMAC-signed outbound delivery, + filtering, retry, abandonment, delivery listing, and manual retry are + implemented. + Payloads are frozen at fire time so consumers see the event state that caused + the delivery. +- **Alert contacts.** Managed alert contacts now support email, PagerDuty, + Slack, and Teams, with send-test endpoints, delivery listing/retry, retry + behavior, and per-contact rate caps. + Email supports `stub`, `smtp`, and `wpcom` senders so local, staging, and + production modes can share the same API. +- **Delivery claiming.** Webhook and alert-contact delivery workers claim rows + before dispatch so multiple workers do not dispatch the same pending delivery. + This is the database coordination point that makes standalone delivery + feasible. +- **Delivery owner guard.** `DELIVERY_OWNER_HOST` constrains embedded delivery + to the intended host during conservative rollout. + This lets API-enabled hosts serve traffic without accidentally becoming + outbound delivery owners. +- **Standalone deliverer entry point.** `bin/jetmon-deliverer` runs webhook + and alert-contact workers without starting the monitor, API, dashboard, or + bucket ownership loop. + It is the first concrete process boundary for the future outbound-delivery + split. +- **Deliverer service packaging.** A sample + `systemd/jetmon-deliverer.service` now exists, and `jetmon-deliverer + validate-config` checks config parsing, DB connectivity, email transport + mode, and delivery ownership. + The rollout docs describe the service, process-specific `deliverer.json`, + and the shared `DB_*` environment expectations. + +### Rollout and Operations + +- **Pinned v1-to-v2 rollout mode.** v2 hosts can run pinned to the exact bucket + range of the v1 host they replace. + Example: `./jetmon2 rollout pinned-check` verifies pinned config, projection + writes, dynamic-ownership absence, active-site coverage, and projection drift + before cutover. +- **Dynamic ownership preflight.** `./jetmon2 rollout dynamic-check` verifies + that pinned ranges are removed, `jetmon_hosts` rows cover the full bucket + range without gaps/overlaps, heartbeats are fresh, and projection drift is + zero. + This supports the second step after every host has moved safely to v2. +- **Projection drift reporting.** `./jetmon2 rollout projection-drift` lists + the specific active sites whose legacy projection disagrees with the + authoritative open HTTP event. + Operators get actionable rows instead of a count-only rollout failure. +- **Rollout guidance in validation and dashboard.** `validate-config` prints + the correct rollout preflight and drift-report commands, while the operator + dashboard shows bucket mode, projection mode, delivery ownership, rollout + commands, and dependency health. + This keeps migration-critical state visible before and during cutover. +- **Systemd service cleanup.** The monitor unit now places start-limit keys in + the correct systemd section, and the deliverer unit validates with + `systemd-analyze`. + This removes avoidable service-file warnings before production packaging. 
+- **Docker development cleanup.** The Docker setup now has clearer local env + names, hardcoded container-internal ports, explicit host-port overrides, + non-root MySQL credentials, Mailpit, healthchecks, MySQL readiness waits, and + runtime permission fixes. + Local development now better matches the process and dependency shape used by + v2. + +### Documentation, Tests, and Tooling + +- **Architecture and ADR refresh.** The architecture docs, API reference, + AGENTS guidance, and ADRs were brought back in line with the current v2 + health-platform shape. + This captures the "why" behind event-sourced state, pull-only delivery, + webhook signatures, gateway tenant boundaries, and credential-storage tradeoffs. +- **v3 architecture options documented.** The v3 probe-agent candidates are + parked in `docs/v3-probe-agent-architecture-options.md` until v2 has + production data. + Candidate 3 remains the leading option, but the roadmap now says which data + should be collected before revisiting it. +- **Outbound credential encryption plan.** The repo has a staged plan for + encrypting webhook secrets and alert-contact destination credentials at rest. + The plan preserves current internal behavior while defining dual-write, + backfill, encrypted-required, and plaintext-removal phases. +- **Build and generation cleanup.** `make all` builds the monitor, deliverer, + and Veriflier binaries without requiring generated gRPC code, and Makefile + targets use an explicit Go path and writable build cache. + This keeps normal build/test workflows reliable in local and CI-like shells. +- **Coverage and race-test expansion.** Core packages gained coverage for + list handlers, lifecycle helpers, API audit paths, delivery behavior, + startup helpers, and previously racy tests. + The branch now has broader regression coverage around the shared API and + delivery paths that are most likely to be touched next. 
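The deferred grace-period secret rotation and the "HMAC webhook delivery" item above both rest on consumers verifying a Stripe-compatible signature header that may carry more than one `v1=` value. The sketch below shows consumer-side verification under stated assumptions: the header name, the `t=...,v1=...` layout, and signing over `"<timestamp>.<body>"` are modelled on Stripe's scheme, not a confirmed Jetmon format — the authoritative contract lives in `API.md`.

```go
// Consumer-side sketch: verify an HMAC-SHA256 webhook signature, accepting any
// of the v1= values so dual-signing during a secret-rotation grace window
// stays transparent. Header name and signed-string layout are assumptions.
package consumer

import (
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strings"
)

func verifyWebhookSignature(header string, body []byte, secret string) bool {
	var timestamp string
	var candidates []string
	for _, part := range strings.Split(header, ",") {
		part = strings.TrimSpace(part)
		switch {
		case strings.HasPrefix(part, "t="):
			timestamp = strings.TrimPrefix(part, "t=")
		case strings.HasPrefix(part, "v1="):
			candidates = append(candidates, strings.TrimPrefix(part, "v1="))
		}
	}
	if timestamp == "" || len(candidates) == 0 {
		return false
	}

	// Real consumers should also reject timestamps outside a small freshness
	// window to limit replay of captured deliveries.
	mac := hmac.New(sha256.New, []byte(secret))
	fmt.Fprintf(mac, "%s.%s", timestamp, body)
	expected := hex.EncodeToString(mac.Sum(nil))

	for _, candidate := range candidates {
		if hmac.Equal([]byte(expected), []byte(candidate)) {
			return true
		}
	}
	return false
}
```

Checking every `v1=` candidate, not just the first, is what would make dual-signing during a rotation grace window a no-op for consumers.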
diff --git a/TAXONOMY.md b/TAXONOMY.md index cf5aabd6..a8e280c4 100644 --- a/TAXONOMY.md +++ b/TAXONOMY.md @@ -404,18 +404,31 @@ Jetmon uses an event-sourced architecture where **events are the source of truth ### Schema shape ``` -events (source of truth): +events (current state — one row per open incident, frozen on close): id - site_id + site_id (blog_id) endpoint_id (nullable — null for site-level events) check_type + discriminator (nullable — tiebreaker for tuples that can have multiple concurrent failures) severity (numeric, comparable) state (human-readable category) - started_at + started_at (frozen across severity/state changes) ended_at (nullable — null for active events) cause_event_id (nullable — causal link, separate from hierarchical rollup) resolution_reason (nullable — why the event closed) metadata (JSON — check-specific data) + dedup_key (generated, NULL when closed; UNIQUE — enforces one-open-per-tuple) + +event_transitions (append-only history of every event mutation): + id + event_id + site_id (blog_id, denormalized for SLA queries) + severity_before, severity_after + state_before, state_after + reason (opened, severity_escalation, verifier_confirmed, false_alarm, …) + source (local, veriflier:, operator:, system:) + metadata (JSON — transition-specific context) + changed_at sites (includes derived state for fast reads): id @@ -426,9 +439,13 @@ sites (includes derived state for fast reads): worst_active_severity ``` +**Why two tables, not one mutable events table:** keeping current state in `events` and history in `event_transitions` lets you serve "current state of site X" with a single-row read on `events`, and "how did incident Y evolve" with a narrow `WHERE event_id = ?` scan on `event_transitions`. Both queries are common, both want different shapes, and a single mutable-history table compromises one or the other. + +The invariant is that **every write to `events` is paired with one row inserted into `event_transitions` in the same transaction**. This is enforced in code by routing all event mutations through a single `eventstore` package. Replaying `event_transitions` in `changed_at` order reconstructs any event's current `severity` and `state`, so the live `events` row is fully rebuildable from the history table. + **Key design decisions:** -- **Events are the source of truth; derived state is denormalized onto the site row for read performance.** Update both transactionally — the derived state should never write without a corresponding event write, and vice versa. +- **Events are the source of truth across two tables.** `events` holds current state (mutable while open, frozen on close); `event_transitions` is the append-only history of every change. The site row stores a denormalized projection for fast reads. All three update transactionally — the projection should never write without a corresponding event write, and an event write must always be accompanied by a transition row. - **Severity and state are separate fields.** Severity is the numeric, comparable value used for rollup (e.g., 1=Warning, 2=Degraded, 3=Seems Down, 4=Down). State is the human-readable category. Keeping them separate lets you add new states without breaking rollup logic. @@ -563,7 +580,7 @@ A consolidated list of architectural decisions made across the conversation hist 5. **Rollup rules are explicit and configurable per site**, not hardcoded. 6. **Multi-state vocabulary:** Up, Warning, Degraded, Seems Down, Down, Paused, Maintenance, Unknown. 7. 
**Unknown state exists specifically to prevent monitor-side failures from being reported as customer-site downtime.** -8. **Event-sourced architecture** with derived site state denormalized for read performance. +8. **Event-sourced architecture** across two tables: `events` for current state, `event_transitions` for append-only history of every mutation. Derived site state is denormalized onto the site row for read performance. The `eventstore` package is the sole writer; every event mutation also writes a transition row in the same transaction. 9. **Severity and state are separate fields**; severity is numeric and comparable, state is human-readable. 10. **Seems Down promotes in place** to Down on verifier confirmation; `started_at` stays at first-failure time. 11. **Event identity is idempotent** via `(site_id, endpoint_id, check_type, discriminator)`. diff --git a/cmd/jetmon-deliverer/main.go b/cmd/jetmon-deliverer/main.go new file mode 100644 index 00000000..bc6813f8 --- /dev/null +++ b/cmd/jetmon-deliverer/main.go @@ -0,0 +1,153 @@ +package main + +import ( + "fmt" + "log" + "os" + "os/signal" + "strings" + "syscall" + + "github.com/Automattic/jetmon/internal/audit" + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/db" + "github.com/Automattic/jetmon/internal/deliverer" + "github.com/Automattic/jetmon/internal/metrics" +) + +// Injected at build time via -ldflags. +var ( + version = "dev" + buildDate = "unknown" + goVersion = "unknown" +) + +func main() { + if len(os.Args) > 1 { + switch os.Args[1] { + case "version": + fmt.Printf("jetmon-deliverer %s (built %s with %s)\n", version, buildDate, goVersion) + return + case "validate-config": + cmdValidateConfig() + return + default: + fmt.Fprintf(os.Stderr, "unknown command %q (want: version, validate-config)\n", os.Args[1]) + os.Exit(2) + } + } + run() +} + +func cmdValidateConfig() { + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS config parse") + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS db connect") + + cfg := config.Get() + fmt.Printf("INFO email_transport=%s\n", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + fmt.Printf("WARN email_transport=%s; alert-contact emails will be logged but not delivered\n", emailTransportLabel(cfg)) + } + if level, msg := deliveryOwnerStatus(cfg, db.Hostname()); msg != "" { + fmt.Printf("%s %s\n", level, msg) + } + + fmt.Println("\nvalidation passed") +} + +func run() { + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + log.Fatalf("load config: %v", err) + } + cfg := config.Get() + log.Printf("config: email_transport=%s", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + log.Printf("WARN: email_transport=%s; alert-contact emails will be logged but not delivered", emailTransportLabel(cfg)) + } + + config.LoadDB() + if err := db.ConnectWithRetry(10); err != nil { + log.Fatalf("db connect: %v", err) + } + audit.Init(db.DB()) + + if err := metrics.Init("statsd:8125", db.Hostname()); err != nil { + log.Printf("warning: statsd init failed: %v", err) + } + + hostname := db.Hostname() + if level, msg := deliveryOwnerStatus(cfg, hostname); msg != "" { + if level == "WARN" { + log.Printf("WARN: %s", 
msg) + } else { + log.Printf("config: %s", msg) + } + } + if !deliveryWorkersShouldStart(cfg, hostname) { + waitForShutdown() + log.Println("jetmon-deliverer: shutdown complete") + return + } + + runtime := deliverer.Start(deliverer.Config{ + DB: db.DB(), + InstanceID: hostname, + Dispatchers: deliverer.BuildAlertDispatchers(cfg), + }) + waitForShutdown() + runtime.Stop() + log.Println("jetmon-deliverer: shutdown complete") +} + +func deliveryWorkersShouldStart(cfg *config.Config, hostname string) bool { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + return owner == "" || owner == hostname +} + +func deliveryOwnerStatus(cfg *config.Config, hostname string) (string, string) { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + if owner == "" { + return "WARN", fmt.Sprintf("delivery_owner_host is unset; standalone deliverer on host %q will run delivery workers", hostname) + } + if owner == hostname { + return "INFO", fmt.Sprintf("delivery_owner_host=%q matched; delivery workers enabled on this host", owner) + } + return "INFO", fmt.Sprintf("delivery_owner_host=%q; standalone deliverer idle on host %q", owner, hostname) +} + +func waitForShutdown() { + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + sig := <-sigCh + log.Printf("received %s, stopping", sig) +} + +func emailTransportLabel(cfg *config.Config) string { + if cfg.EmailTransport == "" { + return "stub" + } + return cfg.EmailTransport +} + +func emailTransportDelivers(cfg *config.Config) bool { + return cfg.EmailTransport == "smtp" || cfg.EmailTransport == "wpcom" +} + +func envOrDefault(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} diff --git a/cmd/jetmon-deliverer/main_test.go b/cmd/jetmon-deliverer/main_test.go new file mode 100644 index 00000000..b311965e --- /dev/null +++ b/cmd/jetmon-deliverer/main_test.go @@ -0,0 +1,101 @@ +package main + +import ( + "strings" + "testing" + + "github.com/Automattic/jetmon/internal/config" +) + +func TestDeliveryWorkersShouldStart(t *testing.T) { + tests := []struct { + name string + cfg config.Config + hostname string + wantStart bool + wantLevel string + wantMsg string + }{ + { + name: "empty owner starts with warning", + cfg: config.Config{}, + hostname: "host-a", + wantStart: true, + wantLevel: "WARN", + wantMsg: "delivery_owner_host is unset", + }, + { + name: "matching owner starts", + cfg: config.Config{ + DeliveryOwnerHost: "host-a", + }, + hostname: "host-a", + wantStart: true, + wantLevel: "INFO", + wantMsg: "matched", + }, + { + name: "non-owner idles", + cfg: config.Config{ + DeliveryOwnerHost: "host-a", + }, + hostname: "host-b", + wantLevel: "INFO", + wantMsg: "idle on host", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := deliveryWorkersShouldStart(&tt.cfg, tt.hostname); got != tt.wantStart { + t.Fatalf("deliveryWorkersShouldStart() = %v, want %v", got, tt.wantStart) + } + level, msg := deliveryOwnerStatus(&tt.cfg, tt.hostname) + if level != tt.wantLevel { + t.Fatalf("deliveryOwnerStatus() level = %q, want %q", level, tt.wantLevel) + } + if !strings.Contains(msg, tt.wantMsg) { + t.Fatalf("deliveryOwnerStatus() message = %q, want substring %q", msg, tt.wantMsg) + } + }) + } +} + +func TestEmailTransportLabelAndDelivery(t *testing.T) { + tests := []struct { + name string + cfg config.Config + label string + delivers bool + }{ + {name: "empty is stub alias", cfg: config.Config{}, label: "stub"}, + {name: "stub logs only", cfg: 
config.Config{EmailTransport: "stub"}, label: "stub"}, + {name: "smtp delivers", cfg: config.Config{EmailTransport: "smtp"}, label: "smtp", delivers: true}, + {name: "wpcom delivers", cfg: config.Config{EmailTransport: "wpcom"}, label: "wpcom", delivers: true}, + {name: "unknown does not deliver", cfg: config.Config{EmailTransport: "sendmail"}, label: "sendmail"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := emailTransportLabel(&tt.cfg); got != tt.label { + t.Fatalf("emailTransportLabel() = %q, want %q", got, tt.label) + } + if got := emailTransportDelivers(&tt.cfg); got != tt.delivers { + t.Fatalf("emailTransportDelivers() = %v, want %v", got, tt.delivers) + } + }) + } +} + +func TestEnvOrDefault(t *testing.T) { + const key = "JETMON_DELIVERER_TEST_ENV_OR_DEFAULT" + t.Setenv(key, "") + if got := envOrDefault(key, "fallback"); got != "fallback" { + t.Fatalf("envOrDefault() = %q, want fallback", got) + } + + t.Setenv(key, "set-value") + if got := envOrDefault(key, "fallback"); got != "set-value" { + t.Fatalf("envOrDefault() = %q, want set-value", got) + } +} diff --git a/cmd/jetmon2/main.go b/cmd/jetmon2/main.go index 97681c80..e96f014e 100644 --- a/cmd/jetmon2/main.go +++ b/cmd/jetmon2/main.go @@ -1,6 +1,7 @@ package main import ( + "context" "database/sql" "flag" "fmt" @@ -10,15 +11,21 @@ import ( "os" "os/signal" "path/filepath" + "strings" "syscall" "time" + "github.com/Automattic/jetmon/internal/alerting" + "github.com/Automattic/jetmon/internal/api" + "github.com/Automattic/jetmon/internal/apikeys" "github.com/Automattic/jetmon/internal/audit" "github.com/Automattic/jetmon/internal/config" "github.com/Automattic/jetmon/internal/dashboard" "github.com/Automattic/jetmon/internal/db" + "github.com/Automattic/jetmon/internal/deliverer" "github.com/Automattic/jetmon/internal/metrics" "github.com/Automattic/jetmon/internal/orchestrator" + "github.com/Automattic/jetmon/internal/veriflier" "github.com/Automattic/jetmon/internal/wpcom" ) @@ -50,6 +57,12 @@ func main() { cmdDrain() case "reload": cmdReload() + case "keys": + cmdKeys(os.Args[2:]) + case "site-tenants": + cmdSiteTenants(os.Args[2:]) + case "rollout": + cmdRollout(os.Args[2:]) default: runServe() } @@ -63,9 +76,11 @@ func runServe() { log.Fatalf("load config: %v", err) } cfg := config.Get() - - if cfg.DBUpdatesEnable && os.Getenv("JETMON_UNSAFE_DB_UPDATES") != "1" { - log.Fatalf("DB_UPDATES_ENABLE is true but JETMON_UNSAFE_DB_UPDATES=1 is not set — refusing to start. This setting must only be used in local test environments.") + log.Printf("config: legacy_status_projection=%s", enabledLabel(cfg.LegacyStatusProjectionEnable)) + log.Printf("config: bucket_ownership=%s", bucketOwnershipLabel(cfg)) + log.Printf("config: email_transport=%s", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + log.Printf("WARN: email_transport=%s — alert-contact emails will be logged but not delivered", emailTransportLabel(cfg)) } config.LoadDB() @@ -114,26 +129,82 @@ func runServe() { }() } + // Internal API server. Disabled when API_PORT is 0. Bears auth via + // jetmon_api_keys; key management is CLI-only (`./jetmon2 keys`). 
+ var apiSrv *api.Server + if cfg.APIPort > 0 { + apiSrv = api.New(fmt.Sprintf(":%d", cfg.APIPort), db.DB(), db.Hostname()) + go func() { + if err := apiSrv.Listen(); err != nil && !api.IsServerClosed(err) { + log.Printf("api: %v", err) + } + }() + } + + if level, msg := deliveryOwnerStatus(cfg, db.Hostname()); msg != "" { + if level == "WARN" { + log.Printf("WARN: %s", msg) + } else { + log.Printf("config: %s", msg) + } + } + deliveryWorkersEnabled := deliveryWorkersShouldStart(cfg, db.Hostname()) + + var alertDispatchers map[alerting.Transport]alerting.Dispatcher + if cfg.APIPort > 0 { + alertDispatchers = deliverer.BuildAlertDispatchers(cfg) + if apiSrv != nil { + apiSrv.SetAlertDispatchers(alertDispatchers) + } + } + + // Embedded outbound delivery workers. Disabled when API_PORT is 0 + // (no API to manage webhooks or alert contacts) or when + // DELIVERY_OWNER_HOST names another host. + var deliveryRuntime *deliverer.Runtime + if deliveryWorkersEnabled { + deliveryRuntime = deliverer.Start(deliverer.Config{ + DB: db.DB(), + InstanceID: db.Hostname(), + Dispatchers: alertDispatchers, + }) + } + // Push dashboard state every stats interval. if dash != nil { + publishDashboardHealth(dash, wp) go func() { ticker := time.NewTicker(time.Duration(cfg.StatsUpdateIntervalMS) * time.Millisecond) defer ticker.Stop() for range ticker.C { bMin, bMax := orch.BucketRange() + currentCfg := config.Get() dash.Update(dashboard.State{ - WorkerCount: orch.WorkerCount(), - ActiveChecks: orch.ActiveChecks(), - QueueDepth: orch.QueueDepth(), - RetryQueueSize: orch.RetryQueueSize(), - SitesPerSec: 0, - WPCOMCircuitOpen: wp.IsCircuitOpen(), - WPCOMQueueDepth: wp.QueueDepth(), - BucketMin: bMin, - BucketMax: bMax, + WorkerCount: orch.WorkerCount(), + ActiveChecks: orch.ActiveChecks(), + QueueDepth: orch.QueueDepth(), + RetryQueueSize: orch.RetryQueueSize(), + SitesPerSec: 0, + WPCOMCircuitOpen: wp.IsCircuitOpen(), + WPCOMQueueDepth: wp.QueueDepth(), + BucketMin: bMin, + BucketMax: bMax, + BucketOwnership: bucketOwnershipLabel(currentCfg), + LegacyStatusProjectionEnabled: currentCfg.LegacyStatusProjectionEnable, + DeliveryWorkersEnabled: deliveryWorkersEnabled, + DeliveryOwnerHost: currentCfg.DeliveryOwnerHost, + RolloutPreflightCommand: rolloutPreflightCommand(currentCfg), + ProjectionDriftCommand: projectionDriftCommand(), }) } }() + go func() { + ticker := time.NewTicker(time.Duration(cfg.StatsUpdateIntervalMS) * time.Millisecond) + defer ticker.Stop() + for range ticker.C { + publishDashboardHealth(dash, wp) + } + }() } // Signal handling. @@ -152,6 +223,16 @@ func runServe() { } case syscall.SIGINT, syscall.SIGTERM: log.Println("received shutdown signal, draining") + if apiSrv != nil { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + if err := apiSrv.Shutdown(ctx); err != nil { + log.Printf("api: shutdown error: %v", err) + } + cancel() + } + if deliveryRuntime != nil { + deliveryRuntime.Stop() + } orch.Stop() // Hard kill if drain takes too long (e.g. a stalled HTTP check). 
time.AfterFunc(30*time.Second, func() { @@ -193,15 +274,257 @@ func cmdValidateConfig() { fmt.Println("PASS db connect") cfg := config.Get() + fmt.Printf("INFO legacy_status_projection=%s\n", enabledLabel(cfg.LegacyStatusProjectionEnable)) + fmt.Printf("INFO bucket_ownership=%s\n", bucketOwnershipLabel(cfg)) + for _, line := range rolloutAdviceLines(cfg) { + fmt.Println(line) + } + fmt.Printf("INFO email_transport=%s\n", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + fmt.Printf("WARN email_transport=%s — alert-contact emails will be logged but not delivered\n", emailTransportLabel(cfg)) + } + if level, msg := deliveryOwnerStatus(cfg, db.Hostname()); msg != "" { + fmt.Printf("%s %s\n", level, msg) + } for _, v := range cfg.Verifiers { - addr := fmt.Sprintf("%s:%s", v.Host, v.GRPCPort) - // Ping check is best-effort; don't fail validation on veriflier unavailability. + addr := fmt.Sprintf("%s:%s", v.Host, v.TransportPort()) + // Listing configured Verifliers is operator context, not a reachability check. fmt.Printf("INFO veriflier %q at %s\n", v.Name, addr) } fmt.Println("\nvalidation passed") } +func enabledLabel(b bool) string { + if b { + return "enabled" + } + return "disabled" +} + +func bucketOwnershipLabel(cfg *config.Config) string { + if min, max, ok := cfg.PinnedBucketRange(); ok { + return fmt.Sprintf("pinned range=%d-%d", min, max) + } + return "dynamic jetmon_hosts" +} + +func rolloutAdviceLines(cfg *config.Config) []string { + return []string{ + "INFO rollout_preflight=" + rolloutPreflightCommand(cfg), + "INFO rollout_drift_report=" + projectionDriftCommand(), + } +} + +func rolloutPreflightCommand(cfg *config.Config) string { + if _, _, ok := cfg.PinnedBucketRange(); ok { + return "./jetmon2 rollout pinned-check" + } + return "./jetmon2 rollout dynamic-check" +} + +func projectionDriftCommand() string { + return "./jetmon2 rollout projection-drift" +} + +const dashboardHealthTimeout = 2 * time.Second + +func publishDashboardHealth(dash *dashboard.Server, wp *wpcom.Client) { + if dash == nil { + return + } + dash.UpdateHealth(dashboardHealthEntries(context.Background(), config.Get(), db.DB(), wp, metrics.Global() != nil, time.Now().UTC())) +} + +func dashboardHealthEntries(ctx context.Context, cfg *config.Config, sqlDB *sql.DB, wp *wpcom.Client, statsdReady bool, checkedAt time.Time) []dashboard.HealthEntry { + entries := []dashboard.HealthEntry{ + mysqlHealthEntry(ctx, sqlDB, checkedAt), + wpcomHealthEntry(wp, checkedAt), + statsdHealthEntry(statsdReady, checkedAt), + diskHealthEntry("logs", checkedAt), + diskHealthEntry("stats", checkedAt), + } + entries = append(entries, veriflierHealthEntries(ctx, cfg, checkedAt)...) 
+ return entries +} + +func mysqlHealthEntry(ctx context.Context, sqlDB *sql.DB, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "mysql", CheckedAt: checkedAt} + if sqlDB == nil { + entry.Status = "red" + entry.LastError = "database pool is not initialized" + return entry + } + + pingCtx, cancel := context.WithTimeout(ctx, dashboardHealthTimeout) + defer cancel() + + start := time.Now() + if err := sqlDB.PingContext(pingCtx); err != nil { + entry.Status = "red" + entry.Latency = time.Since(start).Milliseconds() + entry.LastError = err.Error() + return entry + } + entry.Status = "green" + entry.Latency = time.Since(start).Milliseconds() + return entry +} + +func veriflierHealthEntries(ctx context.Context, cfg *config.Config, checkedAt time.Time) []dashboard.HealthEntry { + if cfg == nil || len(cfg.Verifiers) == 0 { + return []dashboard.HealthEntry{{ + Name: "verifliers", + Status: "amber", + LastError: "no verifliers configured", + CheckedAt: checkedAt, + }} + } + + entries := make([]dashboard.HealthEntry, 0, len(cfg.Verifiers)) + for _, v := range cfg.Verifiers { + addr := fmt.Sprintf("%s:%s", v.Host, v.TransportPort()) + name := "veriflier:" + v.Name + if v.Name == "" { + name = "veriflier:" + addr + } + entry := dashboard.HealthEntry{Name: name, CheckedAt: checkedAt} + if v.Host == "" || v.TransportPort() == "" { + entry.Status = "red" + entry.LastError = "host or port is not configured" + entries = append(entries, entry) + continue + } + + pingCtx, cancel := context.WithTimeout(ctx, dashboardHealthTimeout) + start := time.Now() + version, err := veriflier.NewVeriflierClient(addr, v.AuthToken).Ping(pingCtx) + cancel() + entry.Latency = time.Since(start).Milliseconds() + if err != nil { + entry.Status = "red" + entry.LastError = err.Error() + } else { + entry.Status = "green" + if version != "" { + entry.Name = fmt.Sprintf("%s (%s)", entry.Name, version) + } + } + entries = append(entries, entry) + } + return entries +} + +func wpcomHealthEntry(wp *wpcom.Client, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "wpcom", CheckedAt: checkedAt} + if wp == nil { + entry.Status = "red" + entry.LastError = "wpcom client is not initialized" + return entry + } + queueDepth := wp.QueueDepth() + if wp.IsCircuitOpen() { + entry.Status = "red" + entry.LastError = fmt.Sprintf("circuit open, queued notifications=%d", queueDepth) + return entry + } + if queueDepth > 0 { + entry.Status = "amber" + entry.LastError = fmt.Sprintf("queued notifications=%d", queueDepth) + return entry + } + entry.Status = "green" + return entry +} + +func statsdHealthEntry(ready bool, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "statsd", CheckedAt: checkedAt} + if !ready { + entry.Status = "amber" + entry.LastError = "statsd client is not initialized" + return entry + } + entry.Status = "green" + return entry +} + +func diskHealthEntry(dir string, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "disk:" + dir, CheckedAt: checkedAt} + if err := checkWritableDir(dir); err != nil { + entry.Status = "red" + entry.LastError = err.Error() + return entry + } + entry.Status = "green" + return entry +} + +func checkWritableDir(dir string) error { + info, err := os.Stat(dir) + if err != nil { + return err + } + if !info.IsDir() { + return fmt.Errorf("%s is not a directory", dir) + } + f, err := os.CreateTemp(dir, ".jetmon-health-*") + if err != nil { + return err + } + name := f.Name() + if 
err := f.Close(); err != nil { + _ = os.Remove(name) + return err + } + if err := os.Remove(name); err != nil { + return err + } + return nil +} + +// emailTransportLabel collapses an empty EMAIL_TRANSPORT to its compatibility +// alias ("stub") so startup output and validate-config show a single canonical +// name regardless of which form an operator wrote in config. +func emailTransportLabel(cfg *config.Config) string { + if cfg.EmailTransport == "" { + return "stub" + } + return cfg.EmailTransport +} + +// emailTransportDelivers reports whether the configured email transport +// actually delivers mail. The stub transport (and the empty-string alias for +// it) only logs, so any alert-contact configured with transport="email" will +// silently disappear into the logs in that mode. +func emailTransportDelivers(cfg *config.Config) bool { + return cfg.EmailTransport == "smtp" || cfg.EmailTransport == "wpcom" +} + +func deliveryWorkersShouldStart(cfg *config.Config, hostname string) bool { + if cfg.APIPort <= 0 { + return false + } + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + return owner == "" || owner == hostname +} + +func deliveryOwnerStatus(cfg *config.Config, hostname string) (string, string) { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + if cfg.APIPort <= 0 { + if owner == "" { + return "INFO", "delivery_workers=disabled api_port=disabled" + } + return "INFO", fmt.Sprintf("delivery_owner_host=%q ignored because API_PORT is disabled", owner) + } + if owner == "" { + return "WARN", fmt.Sprintf("delivery_owner_host is unset; host %q will run delivery workers because API_PORT is enabled", hostname) + } + if owner == hostname { + return "INFO", fmt.Sprintf("delivery_owner_host=%q matched; delivery workers enabled on this host", owner) + } + return "INFO", fmt.Sprintf("delivery_owner_host=%q; delivery workers disabled on host %q", owner, hostname) +} + func cmdStatus() { // Connect to the running instance's internal API. 
port := envOrDefault("DASHBOARD_PORT", "8080") @@ -238,24 +561,21 @@ func cmdAudit() { fmt.Printf("Audit log for blog_id=%d\n", *blogID) fmt.Printf("%-25s %-22s %-15s %s\n", "TIMESTAMP", "EVENT", "SOURCE", "DETAIL") - fmt.Println(repeat("-", 90)) + fmt.Println(strings.Repeat("-", 90)) for rows.Next() { var ( id int64 - bid int64 + bid sql.NullInt64 + eventID sql.NullInt64 eventType string source string - httpCode sql.NullInt64 - errorCode sql.NullInt64 - rttMs sql.NullInt64 - oldStatus sql.NullInt64 - newStatus sql.NullInt64 detail sql.NullString + metadata sql.NullString createdAt time.Time ) - if err := rows.Scan(&id, &bid, &eventType, &source, &httpCode, &errorCode, - &rttMs, &oldStatus, &newStatus, &detail, &createdAt); err != nil { + if err := rows.Scan(&id, &bid, &eventID, &eventType, &source, + &detail, &metadata, &createdAt); err != nil { log.Printf("scan: %v", err) continue } @@ -263,11 +583,11 @@ func cmdAudit() { if detail.Valid { det = detail.String } - if httpCode.Valid { - det = fmt.Sprintf("http=%d err=%d rtt=%dms %s", httpCode.Int64, errorCode.Int64, rttMs.Int64, det) + if eventID.Valid { + det = fmt.Sprintf("event=%d %s", eventID.Int64, det) } - if oldStatus.Valid { - det = fmt.Sprintf("status %d→%d %s", oldStatus.Int64, newStatus.Int64, det) + if metadata.Valid && metadata.String != "" { + det = fmt.Sprintf("%s meta=%s", det, metadata.String) } fmt.Printf("%-25s %-22s %-15s %s\n", createdAt.Format("2006-01-02 15:04:05.000"), @@ -299,6 +619,175 @@ func cmdReload() { fmt.Printf("SIGHUP sent to pid %d\n", pid) } +// cmdKeys is the entrypoint for `./jetmon2 keys ...` ops commands. Key +// management is intentionally CLI-only — the public API has no /keys +// endpoints. See API.md "Authentication". +func cmdKeys(args []string) { + if len(args) == 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 keys [args]") + os.Exit(1) + } + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + log.Fatalf("db: %v", err) + } + ctx := context.Background() + + sub := args[0] + rest := args[1:] + switch sub { + case "create": + cmdKeysCreate(ctx, rest) + case "list": + cmdKeysList(ctx, rest) + case "revoke": + cmdKeysRevoke(ctx, rest) + case "rotate": + cmdKeysRotate(ctx, rest) + default: + fmt.Fprintf(os.Stderr, "unknown keys subcommand %q (want: create, list, revoke, rotate)\n", sub) + os.Exit(1) + } +} + +func cmdKeysCreate(ctx context.Context, args []string) { + fs := flag.NewFlagSet("keys create", flag.ExitOnError) + consumer := fs.String("consumer", "", "consumer name (e.g. 'gateway', 'alerts-worker') — required") + scopeStr := fs.String("scope", "read", "permission scope: read | write | admin") + rateLimit := fs.Int("rate-limit", 0, "requests per minute (0 = scope default)") + ttl := fs.Duration("ttl", 0, "key lifetime (e.g. 
90d, 720h); 0 = never expires") + createdBy := fs.String("created-by", currentOperator(), "operator identity for audit") + _ = fs.Parse(args) + + if *consumer == "" { + fmt.Fprintln(os.Stderr, "--consumer is required") + os.Exit(1) + } + + raw, k, err := apikeys.Create(ctx, db.DB(), apikeys.CreateInput{ + ConsumerName: *consumer, + Scope: apikeys.Scope(*scopeStr), + RateLimitPerMinute: *rateLimit, + TTL: *ttl, + CreatedBy: *createdBy, + }) + if err != nil { + log.Fatalf("create: %v", err) + } + + fmt.Printf("Created key id=%d for consumer=%q scope=%s rate=%d/min\n", + k.ID, k.ConsumerName, k.Scope, k.RateLimitPerMinute) + if k.ExpiresAt != nil { + fmt.Printf("Expires: %s\n", k.ExpiresAt.UTC().Format(time.RFC3339)) + } else { + fmt.Println("Expires: never") + } + fmt.Println() + fmt.Println("Token (shown ONCE — save it now):") + fmt.Println(raw) +} + +func cmdKeysList(ctx context.Context, args []string) { + fs := flag.NewFlagSet("keys list", flag.ExitOnError) + includeRevoked := fs.Bool("include-revoked", false, "show revoked keys too") + _ = fs.Parse(args) + + keys, err := apikeys.List(ctx, db.DB()) + if err != nil { + log.Fatalf("list: %v", err) + } + + fmt.Printf("%-5s %-24s %-7s %-9s %-21s %-21s %s\n", + "ID", "CONSUMER", "SCOPE", "RATE/MIN", "EXPIRES", "LAST USED", "STATUS") + fmt.Println(strings.Repeat("-", 110)) + for _, k := range keys { + status := "active" + if k.RevokedAt != nil { + if !*includeRevoked && k.RevokedAt.Before(time.Now().UTC()) { + continue + } + if k.RevokedAt.After(time.Now().UTC()) { + status = "revokes-at " + k.RevokedAt.UTC().Format("2006-01-02T15:04:05Z") + } else { + status = "revoked" + } + } else if k.ExpiresAt != nil && k.ExpiresAt.Before(time.Now().UTC()) { + status = "expired" + } + expires := "never" + if k.ExpiresAt != nil { + expires = k.ExpiresAt.UTC().Format("2006-01-02T15:04:05Z") + } + lastUsed := "never" + if k.LastUsedAt != nil { + lastUsed = k.LastUsedAt.UTC().Format("2006-01-02T15:04:05Z") + } + fmt.Printf("%-5d %-24s %-7s %-9d %-21s %-21s %s\n", + k.ID, k.ConsumerName, k.Scope, k.RateLimitPerMinute, expires, lastUsed, status) + } +} + +func cmdKeysRevoke(ctx context.Context, args []string) { + if len(args) < 1 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 keys revoke ") + os.Exit(1) + } + id, err := parseInt64(args[0]) + if err != nil { + log.Fatalf("invalid id %q: %v", args[0], err) + } + if err := apikeys.Revoke(ctx, db.DB(), id); err != nil { + log.Fatalf("revoke: %v", err) + } + fmt.Printf("Revoked key id=%d\n", id) +} + +func cmdKeysRotate(ctx context.Context, args []string) { + fs := flag.NewFlagSet("keys rotate", flag.ExitOnError) + grace := fs.Duration("grace", 5*time.Minute, "grace period before old key is revoked (0 = revoke immediately)") + createdBy := fs.String("created-by", currentOperator(), "operator identity for audit") + _ = fs.Parse(args) + + if fs.NArg() < 1 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 keys rotate [--grace=DURATION] ") + os.Exit(1) + } + id, err := parseInt64(fs.Arg(0)) + if err != nil { + log.Fatalf("invalid id %q: %v", fs.Arg(0), err) + } + + raw, k, err := apikeys.Rotate(ctx, db.DB(), id, *grace, *createdBy) + if err != nil { + log.Fatalf("rotate: %v", err) + } + fmt.Printf("Rotated key id=%d → new key id=%d for consumer=%q\n", id, k.ID, k.ConsumerName) + if *grace > 0 { + fmt.Printf("Old key id=%d will be revoked at %s\n", id, time.Now().UTC().Add(*grace).Format(time.RFC3339)) + } else { + fmt.Printf("Old key id=%d revoked immediately\n", id) + } + fmt.Println() + fmt.Println("New token (shown ONCE — save 
it now):") + fmt.Println(raw) +} + +func currentOperator() string { + if u := os.Getenv("USER"); u != "" { + return u + } + if u := os.Getenv("LOGNAME"); u != "" { + return u + } + return "cli" +} + +func parseInt64(s string) (int64, error) { + var v int64 + _, err := fmt.Sscan(s, &v) + return v, err +} + func readPIDFile() int { pidPath := envOrDefault("JETMON_PID_FILE", "/run/jetmon2/jetmon2.pid") data, err := os.ReadFile(pidPath) @@ -316,7 +805,7 @@ func writePIDFile(path string) error { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { return err } - return os.WriteFile(path, []byte(fmt.Sprintf("%d\n", os.Getpid())), 0644) + return os.WriteFile(path, fmt.Appendf(nil, "%d\n", os.Getpid()), 0644) } func removePIDFile(path string) { @@ -356,11 +845,3 @@ func resolveSince(s string) string { } return s } - -func repeat(s string, n int) string { - out := "" - for range n { - out += s - } - return out -} diff --git a/cmd/jetmon2/main_test.go b/cmd/jetmon2/main_test.go index 58c17df8..22128af7 100644 --- a/cmd/jetmon2/main_test.go +++ b/cmd/jetmon2/main_test.go @@ -1,6 +1,8 @@ package main import ( + "context" + "encoding/json" "fmt" "net/http" "net/http/httptest" @@ -9,6 +11,11 @@ import ( "strings" "testing" "time" + + "github.com/Automattic/jetmon/internal/alerting" + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/deliverer" + "github.com/DATA-DOG/go-sqlmock" ) func TestHTTPGet(t *testing.T) { @@ -55,18 +62,6 @@ func TestEnvOrDefault(t *testing.T) { } } -func TestRepeat(t *testing.T) { - if got := repeat("-", 5); got != "-----" { - t.Fatalf("repeat(\"-\", 5) = %q, want -----", got) - } - if got := repeat("ab", 3); got != "ababab" { - t.Fatalf("repeat(\"ab\", 3) = %q, want ababab", got) - } - if got := repeat("x", 0); got != "" { - t.Fatalf("repeat(\"x\", 0) = %q, want empty", got) - } -} - func TestReadPIDFile(t *testing.T) { dir := t.TempDir() pidPath := filepath.Join(dir, "test.pid") @@ -126,3 +121,387 @@ func TestResolveSince(t *testing.T) { t.Fatalf("resolveSince(%q) = %q, want passthrough", literal, got) } } + +func TestEmailTransportLabelAndDelivery(t *testing.T) { + tests := []struct { + name string + cfg config.Config + label string + delivers bool + }{ + { + name: "empty is stub alias", + cfg: config.Config{EmailTransport: ""}, + label: "stub", + delivers: false, + }, + { + name: "stub logs only", + cfg: config.Config{EmailTransport: "stub"}, + label: "stub", + delivers: false, + }, + { + name: "smtp delivers", + cfg: config.Config{EmailTransport: "smtp"}, + label: "smtp", + delivers: true, + }, + { + name: "wpcom delivers", + cfg: config.Config{EmailTransport: "wpcom"}, + label: "wpcom", + delivers: true, + }, + { + name: "invalid transport does not deliver", + cfg: config.Config{EmailTransport: "sendmail"}, + label: "sendmail", + delivers: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := emailTransportLabel(&tt.cfg); got != tt.label { + t.Fatalf("emailTransportLabel() = %q, want %q", got, tt.label) + } + if got := emailTransportDelivers(&tt.cfg); got != tt.delivers { + t.Fatalf("emailTransportDelivers() = %v, want %v", got, tt.delivers) + } + }) + } +} + +func TestDeliveryWorkersShouldStart(t *testing.T) { + tests := []struct { + name string + cfg config.Config + hostname string + wantStart bool + wantLevel string + wantMsg string + }{ + { + name: "api disabled", + cfg: config.Config{}, + hostname: "host-a", + wantLevel: "INFO", + wantMsg: "delivery_workers=disabled", + }, + { + 
name: "legacy api port behavior starts workers", + cfg: config.Config{APIPort: 8090}, + hostname: "host-a", + wantStart: true, + wantLevel: "WARN", + wantMsg: "delivery_owner_host is unset", + }, + { + name: "matching owner starts workers", + cfg: config.Config{ + APIPort: 8090, + DeliveryOwnerHost: "host-a", + }, + hostname: "host-a", + wantStart: true, + wantLevel: "INFO", + wantMsg: "matched", + }, + { + name: "non-owner skips workers", + cfg: config.Config{ + APIPort: 8090, + DeliveryOwnerHost: "host-a", + }, + hostname: "host-b", + wantLevel: "INFO", + wantMsg: "disabled on host", + }, + { + name: "owner ignored when api disabled", + cfg: config.Config{ + DeliveryOwnerHost: "host-a", + }, + hostname: "host-a", + wantLevel: "INFO", + wantMsg: "ignored because API_PORT is disabled", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := deliveryWorkersShouldStart(&tt.cfg, tt.hostname); got != tt.wantStart { + t.Fatalf("deliveryWorkersShouldStart() = %v, want %v", got, tt.wantStart) + } + level, msg := deliveryOwnerStatus(&tt.cfg, tt.hostname) + if level != tt.wantLevel { + t.Fatalf("deliveryOwnerStatus() level = %q, want %q", level, tt.wantLevel) + } + if !strings.Contains(msg, tt.wantMsg) { + t.Fatalf("deliveryOwnerStatus() message = %q, want substring %q", msg, tt.wantMsg) + } + }) + } +} + +func TestEnabledLabel(t *testing.T) { + if got := enabledLabel(true); got != "enabled" { + t.Fatalf("enabledLabel(true) = %q, want enabled", got) + } + if got := enabledLabel(false); got != "disabled" { + t.Fatalf("enabledLabel(false) = %q, want disabled", got) + } +} + +func TestBucketOwnershipLabel(t *testing.T) { + if got := bucketOwnershipLabel(&config.Config{}); got != "dynamic jetmon_hosts" { + t.Fatalf("bucketOwnershipLabel(dynamic) = %q", got) + } + min, max := 12, 34 + got := bucketOwnershipLabel(&config.Config{PinnedBucketMin: &min, PinnedBucketMax: &max}) + if got != "pinned range=12-34" { + t.Fatalf("bucketOwnershipLabel(pinned) = %q", got) + } +} + +func TestRolloutAdviceLines(t *testing.T) { + dynamic := rolloutAdviceLines(&config.Config{}) + if len(dynamic) != 2 { + t.Fatalf("dynamic advice len = %d, want 2", len(dynamic)) + } + if !strings.Contains(dynamic[0], "rollout dynamic-check") { + t.Fatalf("dynamic preflight advice = %q", dynamic[0]) + } + if !strings.Contains(dynamic[1], "rollout projection-drift") { + t.Fatalf("dynamic drift advice = %q", dynamic[1]) + } + + min, max := 12, 34 + pinned := rolloutAdviceLines(&config.Config{PinnedBucketMin: &min, PinnedBucketMax: &max}) + if len(pinned) != 2 { + t.Fatalf("pinned advice len = %d, want 2", len(pinned)) + } + if !strings.Contains(pinned[0], "rollout pinned-check") { + t.Fatalf("pinned preflight advice = %q", pinned[0]) + } + if !strings.Contains(pinned[1], "rollout projection-drift") { + t.Fatalf("pinned drift advice = %q", pinned[1]) + } +} + +func TestRolloutCommandHelpers(t *testing.T) { + if got := rolloutPreflightCommand(&config.Config{}); got != "./jetmon2 rollout dynamic-check" { + t.Fatalf("rolloutPreflightCommand(dynamic) = %q", got) + } + min, max := 12, 34 + cfg := &config.Config{PinnedBucketMin: &min, PinnedBucketMax: &max} + if got := rolloutPreflightCommand(cfg); got != "./jetmon2 rollout pinned-check" { + t.Fatalf("rolloutPreflightCommand(pinned) = %q", got) + } + if got := projectionDriftCommand(); got != "./jetmon2 rollout projection-drift" { + t.Fatalf("projectionDriftCommand() = %q", got) + } +} + +func TestDashboardHealthEntriesReportsCoreDependencies(t *testing.T) { + 
root := t.TempDir() + if err := os.Mkdir(filepath.Join(root, "logs"), 0755); err != nil { + t.Fatalf("mkdir logs: %v", err) + } + if err := os.Mkdir(filepath.Join(root, "stats"), 0755); err != nil { + t.Fatalf("mkdir stats: %v", err) + } + wd, err := os.Getwd() + if err != nil { + t.Fatalf("Getwd: %v", err) + } + if err := os.Chdir(root); err != nil { + t.Fatalf("Chdir: %v", err) + } + defer func() { + if err := os.Chdir(wd); err != nil { + t.Fatalf("restore working directory: %v", err) + } + }() + + sqlDB, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqlDB.Close() + mock.ExpectPing() + + checkedAt := time.Date(2026, 4, 28, 3, 0, 0, 0, time.UTC) + entries := dashboardHealthEntries(context.Background(), &config.Config{}, sqlDB, nil, false, checkedAt) + byName := make(map[string]string, len(entries)) + for _, entry := range entries { + byName[entry.Name] = entry.Status + if !entry.CheckedAt.Equal(checkedAt) { + t.Fatalf("%s CheckedAt = %s, want %s", entry.Name, entry.CheckedAt, checkedAt) + } + } + + want := map[string]string{ + "mysql": "green", + "wpcom": "red", + "statsd": "amber", + "disk:logs": "green", + "disk:stats": "green", + "verifliers": "amber", + } + for name, status := range want { + if byName[name] != status { + t.Fatalf("health[%s] = %q, want %q (entries=%v)", name, byName[name], status, entries) + } + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("sql expectations: %v", err) + } +} + +func TestCheckWritableDirReportsMissingDirectory(t *testing.T) { + err := checkWritableDir(filepath.Join(t.TempDir(), "missing")) + if err == nil { + t.Fatal("checkWritableDir() returned nil for missing directory") + } +} + +func TestParseInt64(t *testing.T) { + got, err := parseInt64("12345") + if err != nil { + t.Fatalf("parseInt64(valid) error = %v", err) + } + if got != 12345 { + t.Fatalf("parseInt64(valid) = %d, want 12345", got) + } + if _, err := parseInt64("not-an-id"); err == nil { + t.Fatal("parseInt64(invalid) returned nil error") + } +} + +func TestCurrentOperatorPrefersUserThenLogname(t *testing.T) { + t.Setenv("USER", "alice") + t.Setenv("LOGNAME", "bob") + if got := currentOperator(); got != "alice" { + t.Fatalf("currentOperator() = %q, want USER", got) + } + + t.Setenv("USER", "") + if got := currentOperator(); got != "bob" { + t.Fatalf("currentOperator() = %q, want LOGNAME", got) + } + + t.Setenv("LOGNAME", "") + if got := currentOperator(); got != "cli" { + t.Fatalf("currentOperator() = %q, want cli", got) + } +} + +func TestReadPIDFileRejectsInvalidContent(t *testing.T) { + dir := t.TempDir() + pidPath := filepath.Join(dir, "test.pid") + if err := os.WriteFile(pidPath, []byte("0\n"), 0644); err != nil { + t.Fatalf("WriteFile: %v", err) + } + t.Setenv("JETMON_PID_FILE", pidPath) + + if os.Getenv("JETMON_TEST_READ_PID_INVALID") == "1" { + _ = readPIDFile() + return + } + + cmd := os.Args[0] + proc, err := os.StartProcess(cmd, []string{cmd, "-test.run=TestReadPIDFileRejectsInvalidContent"}, &os.ProcAttr{ + Env: append(os.Environ(), + "JETMON_TEST_READ_PID_INVALID=1", + "JETMON_PID_FILE="+pidPath, + ), + Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}, + }) + if err != nil { + t.Fatalf("StartProcess: %v", err) + } + state, err := proc.Wait() + if err != nil { + t.Fatalf("Wait: %v", err) + } + if state.Success() { + t.Fatal("readPIDFile accepted invalid PID content") + } +} + +func TestBuildAlertDispatchersIncludesStubEmail(t *testing.T) { + dispatchers := 
deliverer.BuildAlertDispatchers(&config.Config{ + EmailTransport: "stub", + EmailFrom: "jetmon@example.com", + }) + + for _, transport := range []alerting.Transport{ + alerting.TransportEmail, + alerting.TransportPagerDuty, + alerting.TransportSlack, + alerting.TransportTeams, + } { + if dispatchers[transport] == nil { + t.Fatalf("dispatcher for %s is nil", transport) + } + } + + destination, err := json.Marshal(map[string]string{"address": "ops@example.com"}) + if err != nil { + t.Fatalf("Marshal destination: %v", err) + } + + status, response, err := dispatchers[alerting.TransportEmail].Send( + context.Background(), + destination, + alerting.Notification{ + SiteID: 123, + SiteURL: "https://example.com", + EventID: 456, + EventType: "alert.opened", + SeverityName: "Down", + Timestamp: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), + }, + ) + if err != nil { + t.Fatalf("stub email dispatcher Send() error = %v", err) + } + // 250 mirrors the SMTP "Requested mail action okay, completed" reply + // code so the audit row reads the same shape regardless of which email + // transport actually fired. + if status != 250 { + t.Fatalf("stub email dispatcher status = %d, want 250", status) + } + if response != "delivered" { + t.Fatalf("stub email dispatcher response = %q, want delivered", response) + } +} + +func TestBuildAlertDispatchersSelectsConfiguredEmailSenders(t *testing.T) { + tests := []struct { + name string + transport string + wantType string + }{ + {name: "smtp", transport: "smtp", wantType: "*alerting.emailDispatcher"}, + {name: "wpcom", transport: "wpcom", wantType: "*alerting.emailDispatcher"}, + {name: "unknown falls back", transport: "sendmail", wantType: "*alerting.emailDispatcher"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dispatchers := deliverer.BuildAlertDispatchers(&config.Config{ + EmailTransport: tt.transport, + EmailFrom: "jetmon@example.com", + WPCOMEmailEndpoint: "https://wpcom.example/send", + SMTPHost: "smtp.example", + SMTPPort: 25, + }) + got := fmt.Sprintf("%T", dispatchers[alerting.TransportEmail]) + if got != tt.wantType { + t.Fatalf("email dispatcher type = %s, want %s", got, tt.wantType) + } + }) + } +} diff --git a/cmd/jetmon2/rollout.go b/cmd/jetmon2/rollout.go new file mode 100644 index 00000000..683274ed --- /dev/null +++ b/cmd/jetmon2/rollout.go @@ -0,0 +1,422 @@ +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "io" + "os" + "sort" + "strings" + "time" + + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/db" +) + +type pinnedRolloutCheckDeps struct { + Hostname func() string + HostRowExists func(context.Context, string) (bool, error) + CountActiveSitesForBucketRange func(context.Context, int, int) (int, error) + CountLegacyProjectionDrift func(context.Context, int, int) (int, error) +} + +type dynamicRolloutCheckDeps struct { + Now func() time.Time + GetAllHosts func() ([]db.HostRow, error) + CountActiveSitesForBucketRange func(context.Context, int, int) (int, error) + CountLegacyProjectionDrift func(context.Context, int, int) (int, error) +} + +type projectionDriftDeps struct { + CountLegacyProjectionDrift func(context.Context, int, int) (int, error) + ListLegacyProjectionDrift func(context.Context, int, int, int) ([]db.ProjectionDriftRow, error) +} + +func cmdRollout(args []string) { + if len(args) == 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 rollout [args]") + os.Exit(1) + } + + switch args[0] { + case "pinned-check": + cmdRolloutPinnedCheck(args[1:]) + case 
"dynamic-check": + cmdRolloutDynamicCheck(args[1:]) + case "projection-drift": + cmdRolloutProjectionDrift(args[1:]) + default: + fmt.Fprintf(os.Stderr, "unknown rollout subcommand %q (want: pinned-check, dynamic-check, projection-drift)\n", args[0]) + os.Exit(1) + } +} + +func cmdRolloutPinnedCheck(args []string) { + fs := flag.NewFlagSet("rollout pinned-check", flag.ExitOnError) + host := fs.String("host", "", "host id to check (default current hostname)") + _ = fs.Parse(args) + if fs.NArg() != 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 rollout pinned-check [--host=]") + os.Exit(1) + } + + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS config parse") + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS db connect") + + deps := pinnedRolloutCheckDeps{ + Hostname: db.Hostname, + HostRowExists: db.HostRowExists, + CountActiveSitesForBucketRange: db.CountActiveSitesForBucketRange, + CountLegacyProjectionDrift: db.CountLegacyProjectionDrift, + } + if err := runPinnedRolloutCheck(context.Background(), os.Stdout, config.Get(), *host, deps); err != nil { + fmt.Fprintf(os.Stderr, "FAIL %v\n", err) + os.Exit(1) + } +} + +func cmdRolloutDynamicCheck(args []string) { + fs := flag.NewFlagSet("rollout dynamic-check", flag.ExitOnError) + _ = fs.Parse(args) + if fs.NArg() != 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 rollout dynamic-check") + os.Exit(1) + } + + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS config parse") + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS db connect") + + deps := dynamicRolloutCheckDeps{ + Now: time.Now, + GetAllHosts: db.GetAllHosts, + CountActiveSitesForBucketRange: db.CountActiveSitesForBucketRange, + CountLegacyProjectionDrift: db.CountLegacyProjectionDrift, + } + if err := runDynamicRolloutCheck(context.Background(), os.Stdout, config.Get(), deps); err != nil { + fmt.Fprintf(os.Stderr, "FAIL %v\n", err) + os.Exit(1) + } +} + +func cmdRolloutProjectionDrift(args []string) { + fs := flag.NewFlagSet("rollout projection-drift", flag.ExitOnError) + bucketMin := fs.Int("bucket-min", -1, "inclusive bucket minimum (default pinned range or 0)") + bucketMax := fs.Int("bucket-max", -1, "inclusive bucket maximum (default pinned range or BUCKET_TOTAL-1)") + limit := fs.Int("limit", 50, "maximum drift rows to print") + _ = fs.Parse(args) + if fs.NArg() != 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 rollout projection-drift [--bucket-min=N --bucket-max=N] [--limit=N]") + os.Exit(1) + } + + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS config parse") + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS db connect") + + deps := projectionDriftDeps{ + CountLegacyProjectionDrift: db.CountLegacyProjectionDrift, + ListLegacyProjectionDrift: db.ListLegacyProjectionDrift, + } + 
if err := runProjectionDriftReport(context.Background(), os.Stdout, config.Get(), *bucketMin, *bucketMax, *limit, deps); err != nil { + fmt.Fprintf(os.Stderr, "FAIL %v\n", err) + os.Exit(1) + } +} + +func runPinnedRolloutCheck(ctx context.Context, out io.Writer, cfg *config.Config, hostOverride string, deps pinnedRolloutCheckDeps) error { + if cfg == nil { + return errors.New("config is not loaded") + } + minBucket, maxBucket, ok := cfg.PinnedBucketRange() + if !ok { + return errors.New("pinned bucket range is not configured; set PINNED_BUCKET_MIN/PINNED_BUCKET_MAX or BUCKET_NO_MIN/BUCKET_NO_MAX") + } + fmt.Fprintf(out, "PASS pinned_range=%d-%d\n", minBucket, maxBucket) + + if !cfg.LegacyStatusProjectionEnable { + return errors.New("LEGACY_STATUS_PROJECTION_ENABLE must be true during pinned v1-to-v2 rollout") + } + fmt.Fprintln(out, "PASS legacy_status_projection=enabled") + + if cfg.APIPort > 0 { + fmt.Fprintf(out, "WARN api_port=%d; confirm the API/delivery ownership plan before monitor cutover\n", cfg.APIPort) + } else { + fmt.Fprintln(out, "PASS api_port=disabled") + } + + hostID := strings.TrimSpace(hostOverride) + if hostID == "" { + if deps.Hostname == nil { + return errors.New("hostname resolver is not configured") + } + hostID = strings.TrimSpace(deps.Hostname()) + } + if hostID == "" { + return errors.New("host id is empty") + } + + if deps.HostRowExists == nil { + return errors.New("host row checker is not configured") + } + hostRowExists, err := deps.HostRowExists(ctx, hostID) + if err != nil { + return fmt.Errorf("check jetmon_hosts row for %q: %w", hostID, err) + } + if hostRowExists { + return fmt.Errorf("host %q still has a jetmon_hosts row; pinned hosts must not participate in dynamic bucket ownership", hostID) + } + fmt.Fprintf(out, "PASS jetmon_hosts row absent host=%q\n", hostID) + + if deps.CountActiveSitesForBucketRange == nil { + return errors.New("active site counter is not configured") + } + activeSites, err := deps.CountActiveSitesForBucketRange(ctx, minBucket, maxBucket) + if err != nil { + return fmt.Errorf("count active sites in pinned range %d-%d: %w", minBucket, maxBucket, err) + } + fmt.Fprintf(out, "INFO active_sites_in_pinned_range=%d\n", activeSites) + + if deps.CountLegacyProjectionDrift == nil { + return errors.New("projection drift counter is not configured") + } + drift, err := deps.CountLegacyProjectionDrift(ctx, minBucket, maxBucket) + if err != nil { + return fmt.Errorf("count legacy projection drift in pinned range %d-%d: %w", minBucket, maxBucket, err) + } + if drift > 0 { + return fmt.Errorf("legacy projection drift=%d in pinned range %d-%d", drift, minBucket, maxBucket) + } + fmt.Fprintln(out, "PASS legacy_projection_drift=0") + fmt.Fprintln(out, "pinned rollout check passed") + return nil +} + +func runDynamicRolloutCheck(ctx context.Context, out io.Writer, cfg *config.Config, deps dynamicRolloutCheckDeps) error { + if cfg == nil { + return errors.New("config is not loaded") + } + if minBucket, maxBucket, ok := cfg.PinnedBucketRange(); ok { + return fmt.Errorf("pinned bucket range %d-%d is still configured; remove PINNED_BUCKET_*/BUCKET_NO_* before dynamic ownership cutover", minBucket, maxBucket) + } + fmt.Fprintln(out, "PASS bucket_ownership=dynamic") + + if !cfg.LegacyStatusProjectionEnable { + return errors.New("LEGACY_STATUS_PROJECTION_ENABLE must remain true until legacy readers have migrated") + } + fmt.Fprintln(out, "PASS legacy_status_projection=enabled") + + if deps.GetAllHosts == nil { + return errors.New("host list query is not 
configured") + } + hosts, err := deps.GetAllHosts() + if err != nil { + return fmt.Errorf("query jetmon_hosts: %w", err) + } + fmt.Fprintf(out, "INFO jetmon_hosts_rows=%d\n", len(hosts)) + + now := time.Now() + if deps.Now != nil { + now = deps.Now() + } + if err := validateDynamicBucketCoverage(hosts, cfg.BucketTotal, time.Duration(cfg.BucketHeartbeatGraceSec)*time.Second, now); err != nil { + return err + } + fmt.Fprintf(out, "PASS dynamic_bucket_coverage=0-%d hosts=%d\n", cfg.BucketTotal-1, len(hosts)) + + if deps.CountActiveSitesForBucketRange == nil { + return errors.New("active site counter is not configured") + } + activeSites, err := deps.CountActiveSitesForBucketRange(ctx, 0, cfg.BucketTotal-1) + if err != nil { + return fmt.Errorf("count active sites in dynamic range 0-%d: %w", cfg.BucketTotal-1, err) + } + fmt.Fprintf(out, "INFO active_sites_dynamic_range=%d\n", activeSites) + + if deps.CountLegacyProjectionDrift == nil { + return errors.New("projection drift counter is not configured") + } + drift, err := deps.CountLegacyProjectionDrift(ctx, 0, cfg.BucketTotal-1) + if err != nil { + return fmt.Errorf("count legacy projection drift in dynamic range 0-%d: %w", cfg.BucketTotal-1, err) + } + if drift > 0 { + return fmt.Errorf("legacy projection drift=%d in dynamic range 0-%d", drift, cfg.BucketTotal-1) + } + fmt.Fprintln(out, "PASS legacy_projection_drift=0") + fmt.Fprintln(out, "dynamic rollout check passed") + return nil +} + +func validateDynamicBucketCoverage(hosts []db.HostRow, bucketTotal int, heartbeatGrace time.Duration, now time.Time) error { + if bucketTotal <= 0 { + return errors.New("BUCKET_TOTAL must be > 0") + } + if heartbeatGrace <= 0 { + return errors.New("BUCKET_HEARTBEAT_GRACE_SEC must be > 0") + } + if len(hosts) == 0 { + return errors.New("jetmon_hosts has no rows; dynamic ownership is not established") + } + + sortedHosts := append([]db.HostRow(nil), hosts...) 
+ sort.Slice(sortedHosts, func(i, j int) bool { + if sortedHosts[i].BucketMin == sortedHosts[j].BucketMin { + return sortedHosts[i].HostID < sortedHosts[j].HostID + } + return sortedHosts[i].BucketMin < sortedHosts[j].BucketMin + }) + + expectedMin := 0 + for _, host := range sortedHosts { + if host.Status != "active" { + return fmt.Errorf("host %q has status=%q; all dynamic ownership rows must be active", host.HostID, host.Status) + } + if age := now.Sub(host.LastHeartbeat); age > heartbeatGrace { + return fmt.Errorf("host %q heartbeat is stale age=%s grace=%s", host.HostID, age.Round(time.Second), heartbeatGrace) + } + if host.BucketMin < 0 || host.BucketMax < host.BucketMin || host.BucketMax >= bucketTotal { + return fmt.Errorf("host %q has invalid bucket range %d-%d for BUCKET_TOTAL=%d", host.HostID, host.BucketMin, host.BucketMax, bucketTotal) + } + if host.BucketMin > expectedMin { + return fmt.Errorf("dynamic bucket coverage has gap %d-%d before host %q", expectedMin, host.BucketMin-1, host.HostID) + } + if host.BucketMin < expectedMin { + return fmt.Errorf("dynamic bucket coverage overlaps before host %q at bucket %d", host.HostID, host.BucketMin) + } + expectedMin = host.BucketMax + 1 + } + + if expectedMin < bucketTotal { + return fmt.Errorf("dynamic bucket coverage has trailing gap %d-%d", expectedMin, bucketTotal-1) + } + return nil +} + +func runProjectionDriftReport(ctx context.Context, out io.Writer, cfg *config.Config, bucketMin, bucketMax, limit int, deps projectionDriftDeps) error { + if cfg == nil { + return errors.New("config is not loaded") + } + if limit <= 0 { + return errors.New("limit must be > 0") + } + minBucket, maxBucket, err := resolveProjectionDriftRange(cfg, bucketMin, bucketMax) + if err != nil { + return err + } + + if deps.CountLegacyProjectionDrift == nil { + return errors.New("projection drift counter is not configured") + } + count, err := deps.CountLegacyProjectionDrift(ctx, minBucket, maxBucket) + if err != nil { + return fmt.Errorf("count legacy projection drift in range %d-%d: %w", minBucket, maxBucket, err) + } + fmt.Fprintf(out, "INFO projection_drift_range=%d-%d\n", minBucket, maxBucket) + fmt.Fprintf(out, "INFO legacy_projection_drift=%d\n", count) + + if count == 0 { + fmt.Fprintln(out, "PASS legacy_projection_drift=0") + return nil + } + + if deps.ListLegacyProjectionDrift == nil { + return errors.New("projection drift lister is not configured") + } + rows, err := deps.ListLegacyProjectionDrift(ctx, minBucket, maxBucket, limit) + if err != nil { + return fmt.Errorf("list legacy projection drift in range %d-%d: %w", minBucket, maxBucket, err) + } + printProjectionDriftRows(out, rows) + if count > len(rows) { + fmt.Fprintf(out, "INFO projection_drift_rows_truncated=%d\n", count-len(rows)) + } + return fmt.Errorf("legacy projection drift=%d in range %d-%d", count, minBucket, maxBucket) +} + +func resolveProjectionDriftRange(cfg *config.Config, bucketMin, bucketMax int) (int, int, error) { + if bucketMin < -1 || bucketMax < -1 { + return 0, 0, errors.New("bucket-min and bucket-max must be >= 0") + } + if (bucketMin == -1) != (bucketMax == -1) { + return 0, 0, errors.New("bucket-min and bucket-max must be set together") + } + if bucketMin >= 0 && bucketMax >= 0 { + if bucketMax < bucketMin { + return 0, 0, errors.New("bucket-max must be >= bucket-min") + } + if bucketMax >= cfg.BucketTotal { + return 0, 0, fmt.Errorf("bucket-max must be < BUCKET_TOTAL (%d)", cfg.BucketTotal) + } + return bucketMin, bucketMax, nil + } + if minBucket, maxBucket, ok 
:= cfg.PinnedBucketRange(); ok { + return minBucket, maxBucket, nil + } + if cfg.BucketTotal <= 0 { + return 0, 0, errors.New("BUCKET_TOTAL must be > 0") + } + return 0, cfg.BucketTotal - 1, nil +} + +func printProjectionDriftRows(out io.Writer, rows []db.ProjectionDriftRow) { + fmt.Fprintf(out, "%-12s %-8s %-11s %-9s %-10s %s\n", + "BLOG_ID", "BUCKET", "SITE_STATUS", "EXPECTED", "EVENT_ID", "EVENT_STATE") + for _, row := range rows { + fmt.Fprintf(out, "%-12d %-8d %-11d %-9d %-10s %s\n", + row.BlogID, + row.BucketNo, + row.SiteStatus, + row.ExpectedStatus, + formatOptionalInt(row.EventID), + formatOptionalString(row.EventState), + ) + } +} + +func formatOptionalInt(v *int64) string { + if v == nil { + return "-" + } + return fmt.Sprintf("%d", *v) +} + +func formatOptionalString(v *string) string { + if v == nil || *v == "" { + return "-" + } + return *v +} diff --git a/cmd/jetmon2/rollout_test.go b/cmd/jetmon2/rollout_test.go new file mode 100644 index 00000000..b66ce09e --- /dev/null +++ b/cmd/jetmon2/rollout_test.go @@ -0,0 +1,606 @@ +package main + +import ( + "bytes" + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/db" +) + +func TestRunPinnedRolloutCheckSuccess(t *testing.T) { + minBucket, maxBucket := 12, 34 + cfg := &config.Config{ + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + LegacyStatusProjectionEnable: true, + } + + var gotHost string + var gotMin, gotMax int + deps := pinnedRolloutCheckDeps{ + Hostname: func() string { return "host-a" }, + HostRowExists: func(_ context.Context, hostID string) (bool, error) { + gotHost = hostID + return false, nil + }, + CountActiveSitesForBucketRange: func(_ context.Context, min, max int) (int, error) { + gotMin, gotMax = min, max + return 37, nil + }, + CountLegacyProjectionDrift: func(_ context.Context, min, max int) (int, error) { + if min != minBucket || max != maxBucket { + t.Fatalf("CountLegacyProjectionDrift range = %d-%d, want %d-%d", min, max, minBucket, maxBucket) + } + return 0, nil + }, + } + + var out bytes.Buffer + if err := runPinnedRolloutCheck(context.Background(), &out, cfg, "", deps); err != nil { + t.Fatalf("runPinnedRolloutCheck: %v", err) + } + if gotHost != "host-a" { + t.Fatalf("host = %q, want host-a", gotHost) + } + if gotMin != minBucket || gotMax != maxBucket { + t.Fatalf("active site range = %d-%d, want %d-%d", gotMin, gotMax, minBucket, maxBucket) + } + for _, want := range []string{ + "PASS pinned_range=12-34", + "PASS legacy_status_projection=enabled", + "PASS api_port=disabled", + "PASS jetmon_hosts row absent host=\"host-a\"", + "INFO active_sites_in_pinned_range=37", + "PASS legacy_projection_drift=0", + "pinned rollout check passed", + } { + if !strings.Contains(out.String(), want) { + t.Fatalf("output missing %q:\n%s", want, out.String()) + } + } +} + +func TestRunPinnedRolloutCheckUsesHostOverride(t *testing.T) { + minBucket, maxBucket := 1, 2 + cfg := &config.Config{ + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + LegacyStatusProjectionEnable: true, + } + + var gotHost string + deps := pinnedRolloutCheckDeps{ + Hostname: func() string { return "wrong-host" }, + HostRowExists: func(_ context.Context, hostID string) (bool, error) { + gotHost = hostID + return false, nil + }, + CountActiveSitesForBucketRange: func(context.Context, int, int) (int, error) { + return 1, nil + }, + CountLegacyProjectionDrift: func(context.Context, int, int) (int, error) { + return 0, nil + }, + } + 
+ var out bytes.Buffer + if err := runPinnedRolloutCheck(context.Background(), &out, cfg, " override-host ", deps); err != nil { + t.Fatalf("runPinnedRolloutCheck: %v", err) + } + if gotHost != "override-host" { + t.Fatalf("host = %q, want override-host", gotHost) + } +} + +func TestRunPinnedRolloutCheckWarnsWhenAPIEnabled(t *testing.T) { + minBucket, maxBucket := 1, 2 + cfg := &config.Config{ + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + LegacyStatusProjectionEnable: true, + APIPort: 8090, + } + deps := successfulPinnedRolloutDeps() + + var out bytes.Buffer + if err := runPinnedRolloutCheck(context.Background(), &out, cfg, "", deps); err != nil { + t.Fatalf("runPinnedRolloutCheck: %v", err) + } + if !strings.Contains(out.String(), "WARN api_port=8090") { + t.Fatalf("output missing API warning:\n%s", out.String()) + } +} + +func TestRunPinnedRolloutCheckFailures(t *testing.T) { + minBucket, maxBucket := 1, 2 + tests := []struct { + name string + cfg *config.Config + deps pinnedRolloutCheckDeps + want string + }{ + { + name: "missing pinned range", + cfg: &config.Config{LegacyStatusProjectionEnable: true}, + deps: successfulPinnedRolloutDeps(), + want: "pinned bucket range is not configured", + }, + { + name: "legacy projection disabled", + cfg: &config.Config{ + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + }, + deps: successfulPinnedRolloutDeps(), + want: "LEGACY_STATUS_PROJECTION_ENABLE must be true", + }, + { + name: "host row exists", + cfg: pinnedRolloutTestConfig(minBucket, maxBucket), + deps: pinnedRolloutCheckDeps{ + Hostname: func() string { return "host-a" }, + HostRowExists: func(context.Context, string) (bool, error) { + return true, nil + }, + }, + want: "still has a jetmon_hosts row", + }, + { + name: "host row query error", + cfg: pinnedRolloutTestConfig(minBucket, maxBucket), + deps: pinnedRolloutCheckDeps{ + Hostname: func() string { return "host-a" }, + HostRowExists: func(context.Context, string) (bool, error) { + return false, errors.New("db unavailable") + }, + }, + want: "db unavailable", + }, + { + name: "projection drift", + cfg: pinnedRolloutTestConfig(minBucket, maxBucket), + deps: pinnedRolloutCheckDeps{ + Hostname: func() string { return "host-a" }, + HostRowExists: func(context.Context, string) (bool, error) { + return false, nil + }, + CountActiveSitesForBucketRange: func(context.Context, int, int) (int, error) { + return 10, nil + }, + CountLegacyProjectionDrift: func(context.Context, int, int) (int, error) { + return 2, nil + }, + }, + want: "legacy projection drift=2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var out bytes.Buffer + err := runPinnedRolloutCheck(context.Background(), &out, tt.cfg, "", tt.deps) + if err == nil { + t.Fatal("runPinnedRolloutCheck succeeded") + } + if !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error = %q, want substring %q", err.Error(), tt.want) + } + }) + } +} + +func pinnedRolloutTestConfig(minBucket, maxBucket int) *config.Config { + return &config.Config{ + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + LegacyStatusProjectionEnable: true, + } +} + +func successfulPinnedRolloutDeps() pinnedRolloutCheckDeps { + return pinnedRolloutCheckDeps{ + Hostname: func() string { return "host-a" }, + HostRowExists: func(context.Context, string) (bool, error) { + return false, nil + }, + CountActiveSitesForBucketRange: func(context.Context, int, int) (int, error) { + return 1, nil + }, + CountLegacyProjectionDrift: func(context.Context, int, 
int) (int, error) { + return 0, nil + }, + } +} + +func TestRunDynamicRolloutCheckSuccess(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + cfg := &config.Config{ + BucketTotal: 10, + BucketHeartbeatGraceSec: 60, + LegacyStatusProjectionEnable: true, + } + + var gotMin, gotMax int + deps := dynamicRolloutCheckDeps{ + Now: func() time.Time { return now }, + GetAllHosts: func() ([]db.HostRow, error) { + return []db.HostRow{ + {HostID: "host-b", BucketMin: 5, BucketMax: 9, LastHeartbeat: now.Add(-10 * time.Second), Status: "active"}, + {HostID: "host-a", BucketMin: 0, BucketMax: 4, LastHeartbeat: now.Add(-10 * time.Second), Status: "active"}, + }, nil + }, + CountActiveSitesForBucketRange: func(_ context.Context, min, max int) (int, error) { + gotMin, gotMax = min, max + return 123, nil + }, + CountLegacyProjectionDrift: func(_ context.Context, min, max int) (int, error) { + if min != 0 || max != 9 { + t.Fatalf("drift range = %d-%d, want 0-9", min, max) + } + return 0, nil + }, + } + + var out bytes.Buffer + if err := runDynamicRolloutCheck(context.Background(), &out, cfg, deps); err != nil { + t.Fatalf("runDynamicRolloutCheck: %v", err) + } + if gotMin != 0 || gotMax != 9 { + t.Fatalf("active site range = %d-%d, want 0-9", gotMin, gotMax) + } + for _, want := range []string{ + "PASS bucket_ownership=dynamic", + "PASS legacy_status_projection=enabled", + "INFO jetmon_hosts_rows=2", + "PASS dynamic_bucket_coverage=0-9 hosts=2", + "INFO active_sites_dynamic_range=123", + "PASS legacy_projection_drift=0", + "dynamic rollout check passed", + } { + if !strings.Contains(out.String(), want) { + t.Fatalf("output missing %q:\n%s", want, out.String()) + } + } +} + +func TestRunDynamicRolloutCheckFailures(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + minBucket, maxBucket := 1, 2 + + tests := []struct { + name string + cfg *config.Config + deps dynamicRolloutCheckDeps + want string + }{ + { + name: "pinned range still configured", + cfg: &config.Config{ + BucketTotal: 10, + BucketHeartbeatGraceSec: 60, + LegacyStatusProjectionEnable: true, + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + }, + deps: successfulDynamicRolloutDeps(now), + want: "pinned bucket range 1-2 is still configured", + }, + { + name: "legacy projection disabled", + cfg: &config.Config{ + BucketTotal: 10, + BucketHeartbeatGraceSec: 60, + }, + deps: successfulDynamicRolloutDeps(now), + want: "LEGACY_STATUS_PROJECTION_ENABLE must remain true", + }, + { + name: "host query error", + cfg: dynamicRolloutTestConfig(), + deps: dynamicRolloutCheckDeps{ + GetAllHosts: func() ([]db.HostRow, error) { + return nil, errors.New("db unavailable") + }, + }, + want: "db unavailable", + }, + { + name: "projection drift", + cfg: dynamicRolloutTestConfig(), + deps: dynamicRolloutCheckDeps{ + Now: func() time.Time { return now }, + GetAllHosts: func() ([]db.HostRow, error) { + return dynamicRolloutHosts(now), nil + }, + CountActiveSitesForBucketRange: func(context.Context, int, int) (int, error) { + return 10, nil + }, + CountLegacyProjectionDrift: func(context.Context, int, int) (int, error) { + return 3, nil + }, + }, + want: "legacy projection drift=3", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var out bytes.Buffer + err := runDynamicRolloutCheck(context.Background(), &out, tt.cfg, tt.deps) + if err == nil { + t.Fatal("runDynamicRolloutCheck succeeded") + } + if !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error = %q, want substring %q", 
err.Error(), tt.want) + } + }) + } +} + +func TestValidateDynamicBucketCoverageFailures(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + tests := []struct { + name string + hosts []db.HostRow + want string + }{ + { + name: "no hosts", + hosts: nil, + want: "jetmon_hosts has no rows", + }, + { + name: "inactive host", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 9, LastHeartbeat: now, Status: "draining"}, + }, + want: "status=\"draining\"", + }, + { + name: "stale heartbeat", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 9, LastHeartbeat: now.Add(-2 * time.Minute), Status: "active"}, + }, + want: "heartbeat is stale", + }, + { + name: "invalid range", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 10, LastHeartbeat: now, Status: "active"}, + }, + want: "invalid bucket range", + }, + { + name: "leading gap", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 1, BucketMax: 9, LastHeartbeat: now, Status: "active"}, + }, + want: "gap 0-0", + }, + { + name: "middle gap", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 3, LastHeartbeat: now, Status: "active"}, + {HostID: "host-b", BucketMin: 5, BucketMax: 9, LastHeartbeat: now, Status: "active"}, + }, + want: "gap 4-4", + }, + { + name: "overlap", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 5, LastHeartbeat: now, Status: "active"}, + {HostID: "host-b", BucketMin: 5, BucketMax: 9, LastHeartbeat: now, Status: "active"}, + }, + want: "overlaps", + }, + { + name: "trailing gap", + hosts: []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 8, LastHeartbeat: now, Status: "active"}, + }, + want: "trailing gap 9-9", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateDynamicBucketCoverage(tt.hosts, 10, time.Minute, now) + if err == nil { + t.Fatal("validateDynamicBucketCoverage succeeded") + } + if !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error = %q, want substring %q", err.Error(), tt.want) + } + }) + } +} + +func TestRunProjectionDriftReportNoDrift(t *testing.T) { + cfg := dynamicRolloutTestConfig() + deps := projectionDriftDeps{ + CountLegacyProjectionDrift: func(_ context.Context, min, max int) (int, error) { + if min != 0 || max != 9 { + t.Fatalf("count range = %d-%d, want 0-9", min, max) + } + return 0, nil + }, + } + + var out bytes.Buffer + if err := runProjectionDriftReport(context.Background(), &out, cfg, -1, -1, 50, deps); err != nil { + t.Fatalf("runProjectionDriftReport: %v", err) + } + for _, want := range []string{ + "INFO projection_drift_range=0-9", + "INFO legacy_projection_drift=0", + "PASS legacy_projection_drift=0", + } { + if !strings.Contains(out.String(), want) { + t.Fatalf("output missing %q:\n%s", want, out.String()) + } + } +} + +func TestRunProjectionDriftReportListsRowsAndFails(t *testing.T) { + cfg := dynamicRolloutTestConfig() + eventID := int64(123) + eventState := "Down" + deps := projectionDriftDeps{ + CountLegacyProjectionDrift: func(context.Context, int, int) (int, error) { + return 2, nil + }, + ListLegacyProjectionDrift: func(_ context.Context, min, max, limit int) ([]db.ProjectionDriftRow, error) { + if min != 2 || max != 4 || limit != 1 { + t.Fatalf("list args = %d-%d limit=%d, want 2-4 limit=1", min, max, limit) + } + return []db.ProjectionDriftRow{ + {BlogID: 42, BucketNo: 3, SiteStatus: 1, ExpectedStatus: 2, EventID: &eventID, EventState: &eventState}, + }, nil + }, + } + + var out bytes.Buffer + err := 
runProjectionDriftReport(context.Background(), &out, cfg, 2, 4, 1, deps) + if err == nil { + t.Fatal("runProjectionDriftReport succeeded") + } + if !strings.Contains(err.Error(), "legacy projection drift=2") { + t.Fatalf("error = %q, want drift count", err.Error()) + } + for _, want := range []string{ + "BLOG_ID", + "42", + "Down", + "INFO projection_drift_rows_truncated=1", + } { + if !strings.Contains(out.String(), want) { + t.Fatalf("output missing %q:\n%s", want, out.String()) + } + } +} + +func TestResolveProjectionDriftRange(t *testing.T) { + minBucket, maxBucket := 2, 4 + tests := []struct { + name string + cfg *config.Config + inMin int + inMax int + wantMin int + wantMax int + wantErr string + }{ + { + name: "dynamic default", + cfg: dynamicRolloutTestConfig(), + inMin: -1, + inMax: -1, + wantMin: 0, + wantMax: 9, + }, + { + name: "pinned default", + cfg: &config.Config{ + BucketTotal: 10, + PinnedBucketMin: &minBucket, + PinnedBucketMax: &maxBucket, + }, + inMin: -1, + inMax: -1, + wantMin: 2, + wantMax: 4, + }, + { + name: "explicit range", + cfg: dynamicRolloutTestConfig(), + inMin: 3, + inMax: 5, + wantMin: 3, + wantMax: 5, + }, + { + name: "one sided range", + cfg: dynamicRolloutTestConfig(), + inMin: 3, + inMax: -1, + wantErr: "must be set together", + }, + { + name: "negative range", + cfg: dynamicRolloutTestConfig(), + inMin: -2, + inMax: -2, + wantErr: "must be >= 0", + }, + { + name: "inverted range", + cfg: dynamicRolloutTestConfig(), + inMin: 7, + inMax: 3, + wantErr: "bucket-max must be >= bucket-min", + }, + { + name: "range outside total", + cfg: dynamicRolloutTestConfig(), + inMin: 0, + inMax: 10, + wantErr: "bucket-max must be < BUCKET_TOTAL", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotMin, gotMax, err := resolveProjectionDriftRange(tt.cfg, tt.inMin, tt.inMax) + if tt.wantErr != "" { + if err == nil { + t.Fatal("resolveProjectionDriftRange succeeded") + } + if !strings.Contains(err.Error(), tt.wantErr) { + t.Fatalf("error = %q, want substring %q", err.Error(), tt.wantErr) + } + return + } + if err != nil { + t.Fatalf("resolveProjectionDriftRange: %v", err) + } + if gotMin != tt.wantMin || gotMax != tt.wantMax { + t.Fatalf("range = %d-%d, want %d-%d", gotMin, gotMax, tt.wantMin, tt.wantMax) + } + }) + } +} + +func dynamicRolloutTestConfig() *config.Config { + return &config.Config{ + BucketTotal: 10, + BucketHeartbeatGraceSec: 60, + LegacyStatusProjectionEnable: true, + } +} + +func dynamicRolloutHosts(now time.Time) []db.HostRow { + return []db.HostRow{ + {HostID: "host-a", BucketMin: 0, BucketMax: 4, LastHeartbeat: now, Status: "active"}, + {HostID: "host-b", BucketMin: 5, BucketMax: 9, LastHeartbeat: now, Status: "active"}, + } +} + +func successfulDynamicRolloutDeps(now time.Time) dynamicRolloutCheckDeps { + return dynamicRolloutCheckDeps{ + Now: func() time.Time { return now }, + GetAllHosts: func() ([]db.HostRow, error) { + return dynamicRolloutHosts(now), nil + }, + CountActiveSitesForBucketRange: func(context.Context, int, int) (int, error) { + return 1, nil + }, + CountLegacyProjectionDrift: func(context.Context, int, int) (int, error) { + return 0, nil + }, + } +} diff --git a/cmd/jetmon2/site_tenants.go b/cmd/jetmon2/site_tenants.go new file mode 100644 index 00000000..c3e590b3 --- /dev/null +++ b/cmd/jetmon2/site_tenants.go @@ -0,0 +1,163 @@ +package main + +import ( + "context" + "encoding/csv" + "errors" + "flag" + "fmt" + "io" + "log" + "os" + "strconv" + "strings" + + 
"github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/db" +) + +type siteTenantImport struct { + Mappings []db.SiteTenantMapping + SkippedDuplicate int +} + +func cmdSiteTenants(args []string) { + if len(args) == 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 site-tenants [args]") + os.Exit(1) + } + + switch args[0] { + case "import": + cmdSiteTenantsImport(args[1:]) + default: + fmt.Fprintf(os.Stderr, "unknown site-tenants subcommand %q (want: import)\n", args[0]) + os.Exit(1) + } +} + +func cmdSiteTenantsImport(args []string) { + fs := flag.NewFlagSet("site-tenants import", flag.ExitOnError) + path := fs.String("file", "", "CSV file with tenant_id,blog_id rows; use - for stdin") + source := fs.String("source", "gateway", "mapping source label") + dryRun := fs.Bool("dry-run", false, "parse and validate input without writing") + _ = fs.Parse(args) + + if strings.TrimSpace(*path) == "" { + fmt.Fprintln(os.Stderr, "usage: jetmon2 site-tenants import --file [--source=gateway] [--dry-run]") + os.Exit(1) + } + + rc, err := openSiteTenantImport(*path) + if err != nil { + log.Fatalf("open import file: %v", err) + } + defer rc.Close() + + in, err := parseSiteTenantMappings(rc) + if err != nil { + log.Fatalf("parse import file: %v", err) + } + + if *dryRun { + fmt.Printf("Validated %d site tenant mappings", len(in.Mappings)) + if in.SkippedDuplicate > 0 { + fmt.Printf(" (%d duplicate rows skipped)", in.SkippedDuplicate) + } + fmt.Println() + return + } + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + log.Fatalf("db: %v", err) + } + affected, err := db.UpsertSiteTenantMappings(context.Background(), db.DB(), in.Mappings, *source) + if err != nil { + log.Fatalf("import: %v", err) + } + + fmt.Printf("Imported %d site tenant mappings", len(in.Mappings)) + if in.SkippedDuplicate > 0 { + fmt.Printf(" (%d duplicate rows skipped)", in.SkippedDuplicate) + } + fmt.Printf("; database rows affected=%d\n", affected) +} + +func openSiteTenantImport(path string) (io.ReadCloser, error) { + if strings.TrimSpace(path) == "-" { + return io.NopCloser(os.Stdin), nil + } + return os.Open(path) +} + +func parseSiteTenantMappings(r io.Reader) (siteTenantImport, error) { + reader := csv.NewReader(r) + reader.TrimLeadingSpace = true + reader.FieldsPerRecord = -1 + + out := siteTenantImport{} + seen := make(map[db.SiteTenantMapping]struct{}) + line := 0 + sawData := false + for { + record, err := reader.Read() + if errors.Is(err, io.EOF) { + break + } + line++ + if err != nil { + return out, err + } + if emptyCSVRecord(record) { + continue + } + if !sawData && isSiteTenantHeader(record) { + sawData = true + continue + } + sawData = true + if len(record) != 2 { + return out, fmt.Errorf("line %d: expected 2 columns tenant_id,blog_id; got %d", line, len(record)) + } + + tenantID := strings.TrimSpace(record[0]) + if tenantID == "" { + return out, fmt.Errorf("line %d: tenant_id is required", line) + } + blogID, err := strconv.ParseInt(strings.TrimSpace(record[1]), 10, 64) + if err != nil || blogID <= 0 { + return out, fmt.Errorf("line %d: blog_id must be a positive integer", line) + } + + mapping := db.SiteTenantMapping{TenantID: tenantID, BlogID: blogID} + if _, ok := seen[mapping]; ok { + out.SkippedDuplicate++ + continue + } + seen[mapping] = struct{}{} + out.Mappings = append(out.Mappings, mapping) + } + + if len(out.Mappings) == 0 { + return out, errors.New("no site tenant mappings found") + } + return out, nil +} + +func isSiteTenantHeader(record []string) bool { + if 
len(record) != 2 { + return false + } + return strings.EqualFold(strings.TrimSpace(record[0]), "tenant_id") && + strings.EqualFold(strings.TrimSpace(record[1]), "blog_id") +} + +func emptyCSVRecord(record []string) bool { + for _, field := range record { + if strings.TrimSpace(field) != "" { + return false + } + } + return true +} diff --git a/cmd/jetmon2/site_tenants_test.go b/cmd/jetmon2/site_tenants_test.go new file mode 100644 index 00000000..c69df0ed --- /dev/null +++ b/cmd/jetmon2/site_tenants_test.go @@ -0,0 +1,70 @@ +package main + +import ( + "strings" + "testing" + + "github.com/Automattic/jetmon/internal/db" +) + +func TestParseSiteTenantMappingsHeaderDedupesAndSkipsBlanks(t *testing.T) { + in, err := parseSiteTenantMappings(strings.NewReader(` +tenant_id,blog_id +tenant-a,42 + +tenant-a,42 +tenant-b,43 +`)) + if err != nil { + t.Fatalf("parseSiteTenantMappings: %v", err) + } + if in.SkippedDuplicate != 1 { + t.Fatalf("SkippedDuplicate = %d, want 1", in.SkippedDuplicate) + } + want := []db.SiteTenantMapping{ + {TenantID: "tenant-a", BlogID: 42}, + {TenantID: "tenant-b", BlogID: 43}, + } + if len(in.Mappings) != len(want) { + t.Fatalf("Mappings len = %d, want %d", len(in.Mappings), len(want)) + } + for i := range want { + if in.Mappings[i] != want[i] { + t.Fatalf("Mappings[%d] = %+v, want %+v", i, in.Mappings[i], want[i]) + } + } +} + +func TestParseSiteTenantMappingsRejectsInvalidRows(t *testing.T) { + tests := []struct { + name string + csv string + want string + }{ + {name: "empty", csv: "\n", want: "no site tenant mappings"}, + {name: "missing tenant", csv: ",42\n", want: "tenant_id is required"}, + {name: "bad blog id", csv: "tenant-a,nope\n", want: "blog_id must be a positive integer"}, + {name: "too many columns", csv: "tenant-a,42,extra\n", want: "expected 2 columns"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := parseSiteTenantMappings(strings.NewReader(tt.csv)) + if err == nil { + t.Fatal("parseSiteTenantMappings succeeded") + } + if !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error = %q, want substring %q", err.Error(), tt.want) + } + }) + } +} + +func TestIsSiteTenantHeader(t *testing.T) { + if !isSiteTenantHeader([]string{" tenant_id ", " blog_id "}) { + t.Fatal("isSiteTenantHeader did not accept canonical header") + } + if isSiteTenantHeader([]string{"tenant", "blog"}) { + t.Fatal("isSiteTenantHeader accepted non-canonical header") + } +} diff --git a/config/config-sample.json b/config/config-sample.json index fc09687a..7c38e873 100644 --- a/config/config-sample.json +++ b/config/config-sample.json @@ -5,7 +5,7 @@ "DATASET_SIZE" : 100, "WORKER_MAX_MEM_MB" : 53, - "DB_UPDATES_ENABLE" : false, + "LEGACY_STATUS_PROJECTION_ENABLE" : true, "BUCKET_TOTAL" : 1000, "BUCKET_TARGET" : 500, @@ -33,13 +33,25 @@ "LOG_FORMAT" : "text", "DASHBOARD_PORT" : 8080, + "API_PORT" : 0, + "DELIVERY_OWNER_HOST": "", "DEBUG_PORT" : 6060, + "EMAIL_TRANSPORT" : "stub", + "EMAIL_FROM" : "jetmon@noreply.invalid", + "WPCOM_EMAIL_ENDPOINT" : "", + "WPCOM_EMAIL_AUTH_TOKEN": "", + "SMTP_HOST" : "", + "SMTP_PORT" : 0, + "SMTP_USERNAME" : "", + "SMTP_PASSWORD" : "", + "SMTP_USE_TLS" : false, + "VERIFIERS": [ { "name" : "Veriflier 1", "host" : "veriflier", - "grpc_port" : "", + "port" : "", "auth_token" : "" } ] diff --git a/config/config.readme b/config/config.readme index 9a9505e4..9702c206 100644 --- a/config/config.readme +++ b/config/config.readme @@ -10,10 +10,11 @@ Number of sites to dispatch per round. Default: 40. 
DATASET_SIZE Maximum number of sites to fetch from the database per batch. Default: 100. +LEGACY_STATUS_PROJECTION_ENABLE +Set to true while Jetmon v2 is running in shadow-v2-state migration mode. When enabled, v2 writes its authoritative incident state to jetmon_events / jetmon_event_transitions and also projects v1-compatible site_status + last_status_change values back into jetpack_monitor_sites for legacy consumers. Set to false only after downstream readers have moved to the v2 event/API surface. Default: true. + DB_UPDATES_ENABLE -WARNING: Do not enable on production hosts. -Set to true to allow Jetmon to update the jetpack_monitor_sites table. Only useful in local Docker test environments to observe status-change behaviour. -Enabling this also requires the environment variable JETMON_UNSAFE_DB_UPDATES=1 to be set, as a second confirmation gate against accidental production use. +Deprecated alias for LEGACY_STATUS_PROJECTION_ENABLE. If both keys are present, LEGACY_STATUS_PROJECTION_ENABLE wins. BUCKET_TOTAL Total number of buckets in the system across all hosts. Must match the range of bucket_no values in the jetpack_monitor_sites table. Default: 1000. @@ -24,6 +25,12 @@ Number of buckets this host should claim on startup. Used for initial distributi BUCKET_HEARTBEAT_GRACE_SEC Seconds after a host's last heartbeat before its buckets are considered available for reclaiming by another host. Default: 600. +PINNED_BUCKET_MIN / PINNED_BUCKET_MAX +Migration-only static bucket range for replacing one v1 host with one v2 host during the initial v1-to-v2 rollout. When both are set, jetmon2 checks only that inclusive bucket range and does not claim, heartbeat, or release rows in jetmon_hosts. Disable after the whole fleet is on v2 so dynamic bucket ownership can take over. Must satisfy 0 <= min <= max < BUCKET_TOTAL. Default: unset. + +BUCKET_NO_MIN / BUCKET_NO_MAX +Deprecated v1 names accepted as aliases for PINNED_BUCKET_MIN / PINNED_BUCKET_MAX. They must be set together and must match PINNED_BUCKET_* if both forms are present. + BATCH_SIZE Number of buckets fetched per database query when loading sites. Default: 32. @@ -67,15 +74,49 @@ LOG_FORMAT Log output format. Set to "json" for structured logging (e.g. for log aggregators), or "text" for human-readable output. Default: "text". DASHBOARD_PORT -Port for the operator dashboard and internal API. Set to 0 to disable. Default: 8080. +Port for the operator dashboard. Set to 0 to disable. Default: 8080. + +API_PORT +Port for the internal REST API. Set to 0 to disable. In the embedded v2 deployment, API_PORT also controls whether webhook and alert-contact delivery workers are eligible to run inside jetmon2. The standalone jetmon-deliverer binary does not start the API and does not require API_PORT. Default: 0. + +DELIVERY_OWNER_HOST +Optional hostname that is allowed to run webhook and alert-contact delivery workers. Delivery rows are claimed transactionally, so multiple active workers do not claim the same pending row; use this setting when you want an explicit single-owner rollout while moving from embedded jetmon2 delivery to standalone jetmon-deliverer. If empty and embedded delivery is eligible, the current jetmon2 host starts delivery workers for backward compatibility and startup / validate-config emit a warning. If empty for jetmon-deliverer, that process starts delivery workers and logs the same warning. Default: empty. DEBUG_PORT Port for the pprof debug server. Only binds to 127.0.0.1 (localhost) — never accessible remotely. 
Set to 0 to disable. Default: 6060. Access via: curl http://localhost:6060/debug/pprof/ +EMAIL_TRANSPORT +Email sender used by alert contacts with transport "email". Set to "stub" to log rendered email without sending, "smtp" to send directly through SMTP, or "wpcom" to POST to a WPCOM-owned email API endpoint. Empty is treated like "stub" for compatibility. Startup and validate-config warn when this resolves to "stub" because email alert contacts will not deliver mail in that mode. Default: "stub". + +EMAIL_FROM +From address used when rendering alert-contact emails. Default: "jetmon@noreply.invalid". + +WPCOM_EMAIL_ENDPOINT +Required when EMAIL_TRANSPORT is "wpcom". HTTP endpoint that receives rendered email payloads. + +WPCOM_EMAIL_AUTH_TOKEN +Optional Bearer token sent to WPCOM_EMAIL_ENDPOINT when EMAIL_TRANSPORT is "wpcom". + +SMTP_HOST +Required when EMAIL_TRANSPORT is "smtp". SMTP server hostname. + +SMTP_PORT +Required when EMAIL_TRANSPORT is "smtp". SMTP server port. + +SMTP_USERNAME +Optional SMTP username used when EMAIL_TRANSPORT is "smtp". + +SMTP_PASSWORD +Optional SMTP password used when EMAIL_TRANSPORT is "smtp". + +SMTP_USE_TLS +Set to true to connect to SMTP_HOST with TLS from the start. Default: false. + VERIFIERS Array of veriflier configuration objects. Each entry requires: name - display name host - hostname or IP of the veriflier - grpc_port - gRPC/HTTP port (default 7803) + port - Veriflier JSON-over-HTTP transport port (default 7803) auth_token - shared secret for veriflier authentication +The legacy grpc_port key is still accepted as a compatibility alias. diff --git a/docker/.env-sample b/docker/.env-sample index c8f52fd9..e071cc79 100644 --- a/docker/.env-sample +++ b/docker/.env-sample @@ -1,22 +1,56 @@ -# MySQL — MYSQLDB_USER connects as root for local dev. -# In staging/production use a dedicated user with only the permissions jetmon needs. -MYSQLDB_USER=root -MYSQLDB_ROOT_PASSWORD=123456 -MYSQLDB_DATABASE=jetmon_db -MYSQLDB_LOCAL_PORT=3307 -MYSQLDB_DOCKER_PORT=3306 +# Docker Compose reads this file for local development only. +# *_HOST_PORT variables publish hardcoded container ports to your host. -WPCOM_JETMON_AUTH_TOKEN=change_me +# Host interface used for non-API published development ports. +BIND_ADDR=127.0.0.1 +# API bind address. Default exposes the API to other systems on your network; +# set to 127.0.0.1 when you only want local API access. +API_BIND_ADDR=0.0.0.0 + +# MySQL container bootstrap plus Jetmon's app-level DB connection. +# MYSQL_ROOT_PASSWORD is only used by the local MySQL container and the +# one-shot mysql-user setup service. Jetmon connects with MYSQL_USER and +# MYSQL_PASSWORD instead of root. +MYSQL_USER=jetmon +MYSQL_PASSWORD=jetmon_dev_password +MYSQL_ROOT_PASSWORD=123456 +MYSQL_DATABASE=jetmon_db +MYSQL_HOST_PORT=3307 + +# Token used by Jetmon when generating local config/config.json from the sample. +WPCOM_AUTH_TOKEN=change_me + +# Monitor-to-Veriflier auth plus the host-published Veriflier port. VERIFLIER_AUTH_TOKEN=veriflier_1_auth_token -VERIFLIER_GRPC_LOCAL_PORT=7803 -VERIFLIER_GRPC_DOCKER_PORT=7803 +VERIFLIER_HOST_PORT=7803 + +# Host-published ports for Jetmon's dashboard and REST API. +DASHBOARD_HOST_PORT=8080 +API_HOST_PORT=8090 + +# Host-published port for the local Mailpit web UI. Jetmon sends SMTP to the +# internal mailpit:1025 address; the SMTP port is not published to the host. +MAILPIT_HOST_PORT=8025 + +# Docker-generated config uses Mailpit for local alert-contact email delivery. 
+EMAIL_TRANSPORT=smtp +EMAIL_FROM=jetmon@noreply.invalid +SMTP_HOST=mailpit +SMTP_PORT=1025 +SMTP_USERNAME= +SMTP_PASSWORD= +SMTP_USE_TLS=false -DASHBOARD_LOCAL_PORT=8080 -DASHBOARD_DOCKER_PORT=8080 +# Host-published ports for local Graphite and StatsD access. +GRAPHITE_HOST_PORT=8088 +STATSD_HOST_PORT=8125 -JETMON_UID=1000 -JETMON_GID=1000 +# Container user/group ids. Match these to your host user so bind-mounted files +# in config/, logs/, and stats/ stay writable without root-owned output. +UID=1000 +GID=1000 -# Uncomment to allow DB_UPDATES_ENABLE in config.json (local dev only — never in production). -# JETMON_UNSAFE_DB_UPDATES=1 +# Local escape hatch for legacy config files that still contain +# DB_UPDATES_ENABLE. Do not enable this in staging or production. +# UNSAFE_DB_UPDATES=1 diff --git a/docker/Dockerfile_jetmon b/docker/Dockerfile_jetmon index 784d42c2..5a7d49ab 100644 --- a/docker/Dockerfile_jetmon +++ b/docker/Dockerfile_jetmon @@ -11,7 +11,9 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o jetmon2 ./cmd/jetmon2/ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ + bash \ ca-certificates \ + curl \ && rm -rf /var/lib/apt/lists/* RUN groupadd -r jetmon && useradd --no-log-init -r -g jetmon jetmon @@ -26,7 +28,7 @@ RUN chmod +x entrypoint.sh \ && chown -R jetmon:jetmon /jetmon \ && chmod 777 logs stats certs -EXPOSE 8080/tcp +EXPOSE 8080/tcp 8090/tcp USER jetmon diff --git a/docker/Dockerfile_veriflier b/docker/Dockerfile_veriflier index ac559a8d..1865878c 100644 --- a/docker/Dockerfile_veriflier +++ b/docker/Dockerfile_veriflier @@ -11,7 +11,9 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o veriflier2-bin ./veriflier2/cmd/ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ + bash \ ca-certificates \ + curl \ && rm -rf /var/lib/apt/lists/* RUN groupadd -r veriflier && useradd --no-log-init -r -g veriflier veriflier diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 3322602b..a5dcafc2 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,72 +1,139 @@ services: - mysqldb: - image: mysql:8.0 - restart: unless-stopped - env_file: - - .env - environment: - - MYSQL_ROOT_PASSWORD=$MYSQLDB_ROOT_PASSWORD - - MYSQL_DATABASE=$MYSQLDB_DATABASE - ports: - - $MYSQLDB_LOCAL_PORT:$MYSQLDB_DOCKER_PORT - volumes: - - db:/var/lib/mysql - healthcheck: - test: ["CMD-SHELL", "MYSQL_PWD=$$MYSQLDB_ROOT_PASSWORD mysqladmin ping -h localhost -u root --silent"] - interval: 5s - timeout: 5s - retries: 10 - start_period: 10s - jetmon: - hostname: docker.jetmon.dev.com - build: - context: ../ - dockerfile: docker/Dockerfile_jetmon - restart: unless-stopped - user: "${JETMON_UID:-1000}:${JETMON_GID:-1000}" - env_file: - - .env - volumes: - - ../config:/jetmon/config - environment: - - DB_HOST=mysqldb - - DB_USER=$MYSQLDB_USER - - DB_PASSWORD=$MYSQLDB_ROOT_PASSWORD - - DB_NAME=$MYSQLDB_DATABASE - - DB_PORT=$MYSQLDB_DOCKER_PORT - - VERIFLIER_AUTH_TOKEN=$VERIFLIER_AUTH_TOKEN - - VERIFLIER_GRPC_PORT=$VERIFLIER_GRPC_DOCKER_PORT - - WPCOM_JETMON_AUTH_TOKEN=$WPCOM_JETMON_AUTH_TOKEN - - DASHBOARD_PORT=$DASHBOARD_DOCKER_PORT - ports: - - $DASHBOARD_LOCAL_PORT:$DASHBOARD_DOCKER_PORT - depends_on: - mysqldb: - condition: service_healthy - veriflier: - build: - context: ../ - dockerfile: docker/Dockerfile_veriflier - restart: unless-stopped - volumes: - - ../veriflier2/config:/opt/veriflier/config - ports: - - $VERIFLIER_GRPC_LOCAL_PORT:$VERIFLIER_GRPC_DOCKER_PORT - environment: - - 
VERIFLIER_AUTH_TOKEN=$VERIFLIER_AUTH_TOKEN - - VERIFLIER_GRPC_PORT=$VERIFLIER_GRPC_DOCKER_PORT - statsd: - image: graphiteapp/graphite-statsd - restart: unless-stopped - ports: - - 8088:80 - - 8125:8125 - - 8125:8125/udp - volumes: - - ./volumes/statsd/graphite/conf:/opt/graphite/conf - - ./volumes/statsd/graphite/storage:/opt/graphite/storage - - ./volumes/statsd/statsd/config:/opt/statsd/config - - ./volumes/statsd/logs:/var/log + mysqldb: + image: mysql:8.0 + restart: unless-stopped + environment: + MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD:-123456} + MYSQL_DATABASE: ${MYSQL_DATABASE:-jetmon_db} + MYSQL_USER: ${MYSQL_USER:-jetmon} + MYSQL_PASSWORD: ${MYSQL_PASSWORD:-jetmon_dev_password} + ports: + - "${BIND_ADDR:-127.0.0.1}:${MYSQL_HOST_PORT:-3307}:3306" + volumes: + - db:/var/lib/mysql + healthcheck: + test: ["CMD-SHELL", "MYSQL_PWD=$$MYSQL_ROOT_PASSWORD mysqladmin ping --protocol=tcp -h 127.0.0.1 -u root --silent"] + interval: 5s + timeout: 5s + retries: 10 + start_period: 10s + + mysql-user: + image: mysql:8.0 + restart: "no" + depends_on: + mysqldb: + condition: service_healthy + environment: + MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD:-123456} + MYSQL_DATABASE: ${MYSQL_DATABASE:-jetmon_db} + MYSQL_USER: ${MYSQL_USER:-jetmon} + MYSQL_PASSWORD: ${MYSQL_PASSWORD:-jetmon_dev_password} + volumes: + - ./init-mysql-user.sh:/usr/local/bin/init-mysql-user.sh:ro + entrypoint: ["bash", "/usr/local/bin/init-mysql-user.sh"] + + jetmon: + hostname: docker.jetmon.dev.com + build: + context: ../ + dockerfile: docker/Dockerfile_jetmon + init: true + restart: unless-stopped + user: "${UID:-1000}:${GID:-1000}" + volumes: + - ../config:/jetmon/config + - ../logs:/jetmon/logs + - ../stats:/jetmon/stats + environment: + DB_HOST: mysqldb + DB_USER: ${MYSQL_USER:-jetmon} + DB_PASSWORD: ${MYSQL_PASSWORD:-jetmon_dev_password} + DB_NAME: ${MYSQL_DATABASE:-jetmon_db} + DB_PORT: "3306" + VERIFLIER_AUTH_TOKEN: ${VERIFLIER_AUTH_TOKEN:-veriflier_1_auth_token} + VERIFLIER_PORT: "7803" + WPCOM_AUTH_TOKEN: ${WPCOM_AUTH_TOKEN:-change_me} + EMAIL_TRANSPORT: ${EMAIL_TRANSPORT:-smtp} + EMAIL_FROM: ${EMAIL_FROM:-jetmon@noreply.invalid} + SMTP_HOST: ${SMTP_HOST:-mailpit} + SMTP_PORT: ${SMTP_PORT:-1025} + SMTP_USERNAME: ${SMTP_USERNAME:-} + SMTP_PASSWORD: ${SMTP_PASSWORD:-} + SMTP_USE_TLS: ${SMTP_USE_TLS:-false} + JETMON_PID_FILE: /jetmon/stats/jetmon2.pid + ports: + - "${BIND_ADDR:-127.0.0.1}:${DASHBOARD_HOST_PORT:-8080}:8080" + - "${API_BIND_ADDR:-0.0.0.0}:${API_HOST_PORT:-8090}:8090" + depends_on: + mysql-user: + condition: service_completed_successfully + mailpit: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8090/api/v1/health"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 30s + + veriflier: + build: + context: ../ + dockerfile: docker/Dockerfile_veriflier + init: true + restart: unless-stopped + volumes: + - ../veriflier2/config:/opt/veriflier/config + ports: + - "${BIND_ADDR:-127.0.0.1}:${VERIFLIER_HOST_PORT:-7803}:7803" + environment: + VERIFLIER_AUTH_TOKEN: ${VERIFLIER_AUTH_TOKEN:-veriflier_1_auth_token} + VERIFLIER_PORT: "7803" + STATSD_ADDR: statsd:8125 + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:7803/status"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 10s + + mailpit: + image: axllent/mailpit:v1.29 + restart: unless-stopped + ports: + - "${BIND_ADDR:-127.0.0.1}:${MAILPIT_HOST_PORT:-8025}:8025" + environment: + MP_DATABASE: /data/mailpit.db + MP_MAX_MESSAGES: 5000 + volumes: + - mailpit-data:/data + 
healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:8025/readyz"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 10s + + statsd: + image: graphiteapp/graphite-statsd + restart: unless-stopped + ports: + - "${BIND_ADDR:-127.0.0.1}:${GRAPHITE_HOST_PORT:-8088}:80" + - "${BIND_ADDR:-127.0.0.1}:${STATSD_HOST_PORT:-8125}:8125" + - "${BIND_ADDR:-127.0.0.1}:${STATSD_HOST_PORT:-8125}:8125/udp" + volumes: + - statsd-graphite-storage:/opt/graphite/storage + - statsd-logs:/var/log + healthcheck: + test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1/', timeout=2).close()\""] + interval: 10s + timeout: 5s + retries: 12 + start_period: 20s volumes: db: + mailpit-data: + statsd-graphite-storage: + statsd-logs: diff --git a/docker/init-mysql-user.sh b/docker/init-mysql-user.sh new file mode 100755 index 00000000..1096a2d5 --- /dev/null +++ b/docker/init-mysql-user.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${MYSQL_ROOT_PASSWORD:?MYSQL_ROOT_PASSWORD is required}" +: "${MYSQL_DATABASE:?MYSQL_DATABASE is required}" +: "${MYSQL_USER:?MYSQL_USER is required}" +: "${MYSQL_PASSWORD:?MYSQL_PASSWORD is required}" + +if [ "${MYSQL_USER}" = "root" ]; then + echo "MYSQL_USER must be a non-root application user" >&2 + exit 1 +fi + +sql_string() { + local value=$1 + value=${value//\\/\\\\} + value=${value//\'/\\\'} + printf "'%s'" "${value}" +} + +sql_identifier() { + local value=$1 + value=${value//\`/\`\`} + printf '`%s`' "${value}" +} + +db_name=$(sql_identifier "${MYSQL_DATABASE}") +app_user=$(sql_string "${MYSQL_USER}") +app_password=$(sql_string "${MYSQL_PASSWORD}") + +mysql_root() { + MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql \ + --protocol=tcp \ + --host=mysqldb \ + --user=root \ + --connect-timeout=2 \ + "$@" +} + +attempt=1 +max_attempts=${MYSQL_READY_ATTEMPTS:-60} +while ! 
mysql_root --execute="SELECT 1" >/dev/null 2>&1; do + if [ "${attempt}" -ge "${max_attempts}" ]; then + echo "mysql: could not connect to mysqldb:3306 after ${max_attempts} attempts" >&2 + exit 1 + fi + echo "mysql: waiting for mysqldb:3306 to accept TCP connections (${attempt}/${max_attempts})" >&2 + attempt=$((attempt + 1)) + sleep 2 +done + +mysql_root <|$(sed_escape "${WPCOM_AUTH_TOKEN:-change_me}")|g" \ + -e "s||$(sed_escape "${VERIFLIER_PORT}")|g" \ + -e "s||$(sed_escape "${VERIFLIER_AUTH_TOKEN:-veriflier_1_auth_token}")|g" \ + -e 's|"API_PORT" : 0|"API_PORT" : 8090|g' \ + -e "s|\"EMAIL_TRANSPORT\" : \"stub\"|\"EMAIL_TRANSPORT\" : \"$(sed_escape "${EMAIL_TRANSPORT:-smtp}")\"|g" \ + -e "s|\"EMAIL_FROM\" : \"jetmon@noreply.invalid\"|\"EMAIL_FROM\" : \"$(sed_escape "${EMAIL_FROM:-jetmon@noreply.invalid}")\"|g" \ + -e "s|\"SMTP_HOST\" : \"\"|\"SMTP_HOST\" : \"$(sed_escape "${SMTP_HOST:-mailpit}")\"|g" \ + -e "s|\"SMTP_PORT\" : 0|\"SMTP_PORT\" : ${SMTP_PORT:-1025}|g" \ + -e "s|\"SMTP_USERNAME\" : \"\"|\"SMTP_USERNAME\" : \"$(sed_escape "${SMTP_USERNAME:-}")\"|g" \ + -e "s|\"SMTP_PASSWORD\" : \"\"|\"SMTP_PASSWORD\" : \"$(sed_escape "${SMTP_PASSWORD:-}")\"|g" \ + -e "s|\"SMTP_USE_TLS\" : false|\"SMTP_USE_TLS\" : ${SMTP_USE_TLS:-false}|g" \ + config/config-sample.json > "${target}" +} + +config_target() { if [ -w config/ ]; then - sed \ - -e "s//${WPCOM_JETMON_AUTH_TOKEN}/g" \ - -e "s//${VERIFLIER_GRPC_PORT}/g" \ - -e "s//${VERIFLIER_AUTH_TOKEN}/g" \ - config/config-sample.json > config/config.json + printf '%s\n' "config/config.json" else export JETMON_CONFIG=/tmp/config.json - sed \ - -e "s//${WPCOM_JETMON_AUTH_TOKEN}/g" \ - -e "s//${VERIFLIER_GRPC_PORT}/g" \ - -e "s//${VERIFLIER_AUTH_TOKEN}/g" \ - config/config-sample.json > "${JETMON_CONFIG}" + printf '%s\n' "${JETMON_CONFIG}" fi -fi +} + +# /jetmon is owned by the jetmon user from the Dockerfile, but the container +# runs as ${UID:-1000}:${GID:-1000} via docker-compose — write to stats/ instead, which +# the Dockerfile chmods 0777 specifically so reload/drain commands work. +export JETMON_PID_FILE="${JETMON_PID_FILE:-/jetmon/stats/jetmon2.pid}" +export VERIFLIER_PORT="${VERIFLIER_PORT:-${VERIFLIER_GRPC_PORT:-7803}}" +mkdir -p logs stats +for path in logs/jetmon.log logs/status-change.log stats/sitespersec stats/sitesqueue stats/totals; do + if ! touch "$path" 2>/dev/null; then + echo "warning: could not write $path; check docker/.env UID/GID and host directory permissions" >&2 + fi +done + +if [ ! -f config/config.json ]; then + render_config "$(config_target)" +fi ./jetmon2 migrate diff --git a/docker/run-veriflier.sh b/docker/run-veriflier.sh index b6f43a42..20c25eeb 100644 --- a/docker/run-veriflier.sh +++ b/docker/run-veriflier.sh @@ -1,11 +1,33 @@ #!/usr/bin/env bash +set -euo pipefail + cd /opt/veriflier -if [ ! -f config/veriflier.json ]; then +sed_escape() { + printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g' +} + +render_config() { + local target=$1 sed \ - -e "s//${VERIFLIER_GRPC_PORT}/g" \ - -e "s//${VERIFLIER_AUTH_TOKEN}/g" \ - config/veriflier-sample.json > config/veriflier.json + -e "s||$(sed_escape "${VERIFLIER_PORT}")|g" \ + -e "s||$(sed_escape "${VERIFLIER_AUTH_TOKEN:-veriflier_1_auth_token}")|g" \ + config/veriflier-sample.json > "${target}" +} + +config_target() { + if [ -w config/ ]; then + printf '%s\n' "config/veriflier.json" + else + export VERIFLIER_CONFIG=/tmp/veriflier.json + printf '%s\n' "${VERIFLIER_CONFIG}" + fi +} + +export VERIFLIER_PORT="${VERIFLIER_PORT:-${VERIFLIER_GRPC_PORT:-7803}}" + +if [ ! 
-f config/veriflier.json ]; then + render_config "$(config_target)" fi exec ./veriflier2 diff --git a/docker/volumes/statsd/graphite/.gitignore b/docker/volumes/statsd/graphite/.gitignore deleted file mode 100644 index 5e7d2734..00000000 --- a/docker/volumes/statsd/graphite/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore diff --git a/docker/volumes/statsd/logs/.gitignore b/docker/volumes/statsd/logs/.gitignore deleted file mode 100644 index 5e7d2734..00000000 --- a/docker/volumes/statsd/logs/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore diff --git a/docker/volumes/statsd/statsd/.gitignore b/docker/volumes/statsd/statsd/.gitignore deleted file mode 100644 index 5e7d2734..00000000 --- a/docker/volumes/statsd/statsd/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..1f9e51b9 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,24 @@ +# Jetmon Docs + +This directory holds longer-form design material that does not belong in the +main README. + +## Architecture Decisions + +Accepted decisions live in [`adr/`](adr/). These records are append-only and +capture load-bearing choices that the current v2 implementation depends on. + +Start with [`adr/README.md`](adr/README.md) for the ADR format and index. + +## Planning Notes + +Planning notes capture future options and open design threads. They are not +accepted architecture decisions. + +| Document | Purpose | +|---|---| +| [`jetmon-deliverer-rollout.md`](jetmon-deliverer-rollout.md) | Operational rollout policy for moving outbound dispatch from embedded `jetmon2` workers to standalone `jetmon-deliverer`. | +| [`outbound-credential-encryption-plan.md`](outbound-credential-encryption-plan.md) | Migration plan for encrypting webhook secrets and alert-contact destination credentials after the current plaintext v2 model. | +| [`public-api-gateway-tenant-contract.md`](public-api-gateway-tenant-contract.md) | Gateway boundary contract, implemented Jetmon-side tenant ownership checks, and remaining public-exposure prerequisites. | +| [`v1-to-v2-pinned-rollout.md`](v1-to-v2-pinned-rollout.md) | Initial production migration plan for replacing v1 static-bucket hosts with v2 hosts pinned to the same ranges before enabling dynamic ownership. | +| [`v3-probe-agent-architecture-options.md`](v3-probe-agent-architecture-options.md) | Post-v2 architecture options for evolving from main servers plus Verifliers toward a probe-agent architecture. | diff --git a/docs/adr/0001-event-sourced-state-model.md b/docs/adr/0001-event-sourced-state-model.md new file mode 100644 index 00000000..6287d7fd --- /dev/null +++ b/docs/adr/0001-event-sourced-state-model.md @@ -0,0 +1,108 @@ +# 0001 — Event-sourced state model with dedicated transitions table + +**Status:** Accepted (2026-04-22) + +## Context + +Jetmon 1 stored the current site status as a column on +`jetpack_monitor_sites` (`site_status`, with a `last_status_change` +timestamp) and emitted a notification on every transition. There was +no durable history of state changes — the WPCOM API was the only +record of what happened. This made several common questions hard or +impossible to answer: + +- "Why was site X notified as down at 04:12 UTC? What were the check + results that led to that?" 
+- "How many times did site X flap between Down and SeemsDown over the + last hour?" +- "Did the verifier confirm the down at 04:12 or was it a single-host + decision?" +- "Did this row's status change because of a new check, a verifier + update, an operator close, or a maintenance window?" + +The site row was a projection — useful for "is this site up right +now?" — but it had no audit story. Every customer escalation that +touched "what happened" required digging through StatsD, application +logs, and WPCOM-side records. + +The v2 redesign needed a durable, queryable record of every state +change to support the planned events / SLA / webhooks / alert-contacts +surface. We considered three shapes during design: + +- **Option 1 — Reuse `jetmon_audit_log`.** Add `old_status` / + `new_status` columns and emit one audit row per status change. Single + table, no schema growth. Rejected because audit log was operational + ("who did what to the system") and conflating it with site state + history made both queries slower and the schema confusing — the + audit log is for actions, not state. + +- **Option 2 — Dedicated `jetmon_event_transitions` table.** One row + per transition with `severity_before` / `severity_after` / + `state_before` / `state_after` / `reason` / `source` / `metadata`. + Append-only. Pairs with a `jetmon_events` table holding the current + authoritative state of each open incident. + +- **Option 3 — Synthesize from `jetmon_check_history`.** Compute + state changes by walking the check history table. Rejected because + not every check produces a transition, the verifier's outcome can + override individual check results, and operator manual closes don't + appear in check history at all. + +## Decision + +We will store every site state change in a dedicated, append-only +`jetmon_event_transitions` table, paired with a current-state +projection in `jetmon_events`. `internal/eventstore` is the single +writer for both, writing each transition + projection update in one +transaction so they cannot disagree. + +Each transition row records: +- `event_id` (the open incident this transition belongs to) +- `severity_before`, `severity_after` (uint8 from + `internal/eventstore.Severity*`) +- `state_before`, `state_after` (string state names) +- `reason` (e.g. `opened`, `verifier_confirmed`, `manual_override`, + `superseded`) +- `source` (which jetmon2 instance or which API caller wrote it) +- `metadata` (JSON blob with check results, verifier outputs, etc.) +- `changed_at` (timestamp with millisecond precision) + +`jetmon_events` rows have a generated `dedup_key` column that is +non-NULL only while `ended_at IS NULL`, with a `UNIQUE KEY` enforcing +"one open event per (blog_id, endpoint_id, check_type, discriminator) +tuple" without requiring partial indexes (which MySQL lacks). + +## Consequences + +**Wins:** +- Every customer-facing question about site history has a single, + authoritative source. +- The webhook and alerting workers consume `jetmon_event_transitions` + via a high-water mark — no in-process pub/sub needed (see ADR-0005). +- The transition table is naturally auditable: who/what/when for every + change is on the row. +- The five-layer severity ladder (`Up < Warning < Degraded < + SeemsDown < Down`) is uniformly applied and queryable; severity + evolves independently of state. + +**Costs:** +- Two tables instead of a column. Storage cost is bounded — one row + per real state change, not one per check — but non-zero. +- Writes are now transactional across two tables. 
Mitigated by + `internal/eventstore` owning the contract. +- Migration path from Jetmon 1 is non-trivial. Acceptable because + v2 is a separate branch (PR #61) intentionally not drop-in + compatible. + +## Alternatives considered + +See Context. The audit-log overload (Option 1) was the most tempting +shortcut and is the path most projects regret later — once the audit +log mixes operational events with state-change events, every query +gets harder. + +## Related + +- `internal/eventstore/` — the single writer +- Migrations 10 (`jetmon_events`) and 11 (`jetmon_event_transitions`) +- ADR-0005 (Pull-only delivery via event transitions) diff --git a/docs/adr/0002-internal-only-api-behind-gateway.md b/docs/adr/0002-internal-only-api-behind-gateway.md new file mode 100644 index 00000000..6d12b5d6 --- /dev/null +++ b/docs/adr/0002-internal-only-api-behind-gateway.md @@ -0,0 +1,80 @@ +# 0002 — Internal-only API behind a gateway + +**Status:** Accepted (2026-04-22) + +## Context + +The v2 branch ships a versioned REST API (`/api/v1/...`) covering +sites, events, SLA stats, webhooks, and alert contacts. The API was +originally scoped as "the public API," and several Phase 1 design +decisions were drafted with public-API constraints in mind (granular +per-resource scopes, 404-on-unauthorized to avoid leaking resource +existence, sanitized error messages, per-tenant ownership on every +write surface, etc.). + +Mid-Phase-1 the scope changed: a separate gateway service will sit in +front of Jetmon and handle all customer-facing concerns (tenant +isolation, public errors, customer rate limiting, per-tenant +analytics, OAuth, billing). Jetmon's API becomes internal — every +caller is a known service (the gateway, alerting workers, the +operator dashboard, CI tooling, the uptime-bench harness). This +materially changes the appropriate trade-offs across most of the API +surface. + +## Decision + +We will treat Jetmon's API as **internal-only**. Specifically: + +- **Auth scopes are coarse:** `read` / `write` / `admin`. Granular + per-resource scopes (e.g. `webhooks:write`, `events:read`) are + unnecessary because all callers are trusted services that operate + at a single privilege level. +- **Errors are honest.** 401 vs 403 vs 404 are reported correctly + (no info-leak hiding). Error messages can include operational + detail (DB error class, the SQL stage that failed) because the + audience is operators and the gateway, not customers. +- **Webhook and alert-contact ownership is shared.** Any `write`-scope + token can manage any registration; `created_by` is recorded for + audit but does not gate access. +- **Idempotency-Key scope is `(api_key_id, key)`.** No tenant in the + scope tuple because there's no tenant abstraction. +- **Rate limits are per-key, sized for service protection** (preventing + one buggy caller from DoS-ing the rest), not for commerce or abuse. +- **Resource IDs are raw integers.** No type-prefixed IDs (`evt_`, + `whk_`); see the "Resolved design questions" section in API.md for + the full rationale. + +Each of these is the appropriate choice for an internal service and +not the appropriate choice for a public API. + +## Consequences + +**Wins:** +- The implementation is dramatically simpler than a public API. No + per-tenant isolation, no oauth surface, no analytics events on + every request, no per-customer rate limit configuration. +- Operators can debug from the API surface directly — error messages + carry the information needed to diagnose problems. 
+- Schema design is unconstrained by tenant-scoping concerns, which + keeps queries fast and indexes simple. + +**Costs:** +- If Jetmon's API is ever exposed to customers without a gateway in + front, several decisions need to be unwound. The migration path is + documented in ROADMAP.md "Path to a public API." Each change is + individually clean (add a column, filter on it, deprecate the + unscoped version) but they touch most of the surface, so it would + be a significant project rather than a flag flip. +- Documentation has to be careful not to leak the internal surface to + external readers. API.md is checked-in but is unambiguous about + internal-only scope; the gateway will re-export a sanitized subset. + +## Related + +- `API.md` — full API reference; the "Resolved design questions" + section captures the trade-offs that fall out of this decision. +- `ROADMAP.md` "Path to a public API" — what would change if this + decision is reversed. +- ADR-0003 (Plaintext credentials) — depends on this; if customers + managed their own webhooks the credential storage threat model + would shift. diff --git a/docs/adr/0003-plaintext-credentials-for-outbound-dispatch.md b/docs/adr/0003-plaintext-credentials-for-outbound-dispatch.md new file mode 100644 index 00000000..be5ebe7d --- /dev/null +++ b/docs/adr/0003-plaintext-credentials-for-outbound-dispatch.md @@ -0,0 +1,109 @@ +# 0003 — Plaintext credential storage for outbound dispatch + +**Status:** Accepted (2026-04-25) + +## Context + +Both `jetmon_webhooks.secret` (HMAC signing key) and +`jetmon_alert_contacts.destination` (transport-specific credential +JSON: PagerDuty integration key, Slack/Teams webhook URL, SMTP +password) need to be available at dispatch time so the worker can +authenticate or sign the outbound request. + +`jetmon_api_keys.token_hash` stores SHA-256 hashes — keys are +verified by hashing the inbound bearer token and comparing in +constant time. This pattern works because API keys are validated on +the **inbound** path, where having only the hash is sufficient. + +The first draft of the webhook schema (migration 13) mirrored this +pattern with `secret_hash CHAR(64)`. While building the delivery +worker we realized the analogy doesn't transfer: HMAC signing +requires the actual secret material, not its hash. There is no way +to reconstruct the original secret from a SHA-256 hash, so a hashed +secret is functionally useless to the worker. + +The same constraint applies to alert-contact credentials. To call +the PagerDuty Events API we need the integration key. To POST to a +Slack incoming-webhook URL we need the URL. To `smtp.SendMail` we +need the password. These are call-time inputs; hashing them at rest +would prevent the call. + +## Decision + +We will store outbound-dispatch credentials in **plaintext** in the +relevant tables: + +- `jetmon_webhooks.secret VARCHAR(80)` — the raw HMAC signing key, + with the `whsec_` prefix preserved (Stripe-style leak-detection + hint). +- `jetmon_alert_contacts.destination JSON` — the transport-specific + credential as supplied by the operator. + +Each table also stores a small "preview" column (`secret_preview` +for webhooks, `destination_preview` for alert contacts) holding the +last 4 characters of the credential, so the API can return a +non-sensitive identifier without ever leaking the full value. + +The full credential value is never returned through the API after +creation. `secret` is shown ONCE in the create / rotate response. 
+`destination` is supplied by the caller on create and is never echoed +back; subsequent reads expose only `destination_preview`. + +We document the threat model on the migrations and in code comments +so future readers can audit it without rediscovering it. + +## Consequences + +**Wins:** +- Outbound dispatch works correctly with no special infrastructure + (no KMS round-trip, no per-secret cache layer). +- Read-only API consumers (read-scope tokens) cannot exfiltrate + credentials — the SELECT used by handlers does not return the + credential column. The worker uses a separate `LoadSecret` / + `LoadDestination` call. +- Rotation is simple: replace the row's secret column, return the + new value once, the next dispatch picks it up. + +**Costs:** +- A read of `jetmon_webhooks` or `jetmon_alert_contacts` at the SQL + level (DBA query, MySQL replica, backup file) leaks all signing + keys and destination credentials in plaintext. For an internal + service behind a gateway with an internal-only set of consumers + (ADR-0002), this is equivalent to the existing access-to-events + threat — anyone with that level of DB access already has access to + the events themselves. The marginal cost is small. +- If Jetmon ever exposes its API directly to customers (i.e. + ADR-0002 is reversed), this trade-off changes. Customer-managed + secrets in plaintext under shared infrastructure is a stronger + threat. The mitigation path is encryption at rest with a master + key (KMS-style), which is queued in ROADMAP.md as a future + hardening step. + +## Alternatives considered + +- **Hashed credentials (the API-key pattern).** Rejected because + HMAC signing and outbound HTTPS auth need the raw key material, + not its hash. There is no inbound-validation use case for these + secrets. +- **Encryption at rest with a master key (e.g. KMS).** A real + improvement on plaintext, but adds an operational dependency + (KMS access, key rotation procedure) and a runtime cost (decrypt + on every dispatch or maintain an in-process cache). Deferred — + the right time to do this is alongside any move toward customer- + managed secrets, not before. +- **Per-row at-rest encryption with the AUTH_TOKEN as key material.** + Rejected as security theatre — the key sits next to the data on + the same host, so an attacker with DB access likely has config + access too. The complexity buys nothing. + +## Related + +- ADR-0002 (Internal-only API) — defines the threat model that + makes plaintext storage acceptable today. +- Migration 13 (`jetmon_webhooks`) — documents the rationale inline. +- Migration 16 (`jetmon_alert_contacts`) — same rationale. +- `internal/webhooks/webhooks.go` — `LoadSecret` is intentionally a + separate function (not a field on `Webhook`) to prevent leakage + through serialization. +- `internal/alerting/contacts.go` — `LoadDestination` follows the + same pattern. diff --git a/docs/adr/0004-stripe-style-hmac-webhook-signatures.md b/docs/adr/0004-stripe-style-hmac-webhook-signatures.md new file mode 100644 index 00000000..0e68e61f --- /dev/null +++ b/docs/adr/0004-stripe-style-hmac-webhook-signatures.md @@ -0,0 +1,97 @@ +# 0004 — Stripe-style HMAC-SHA256 webhook signatures + +**Status:** Accepted (2026-04-23) + +## Context + +Webhook deliveries need a way for consumers to verify that a POST +actually came from Jetmon and wasn't replayed or forged. The choice +of signing scheme is consumer-facing — once shipped, every consumer's +verification code depends on it, and changing the format is a +coordinated migration. 
+
+We surveyed the established patterns:
+
+| Scheme | Used by | Notes |
+|--------|---------|-------|
+| Stripe-style HMAC-SHA256 with versioned header | Stripe, GitHub (sig-256) | `t=<ts>,v1=<sig>` over `{ts}.{body}`. Replay-resistant via timestamp. |
+| GitHub HMAC-SHA1 (legacy) | GitHub `X-Hub-Signature` | SHA-1 is broken; only here for legacy receivers. |
+| Slack HMAC-SHA256 | Slack | Same idea as Stripe but slightly different concatenation order. |
+| JWT (signed token in header) | Some uptime services | More complex parser surface, no clear benefit for one-way notifications. |
+| RFC 9421 HTTP Message Signatures | Some IETF-leaning services | More features (covered headers), much more complex consumer code. |
+| Ed25519 asymmetric signature | Few production webhooks | Public key in metadata, no per-consumer secret to leak. |
+
+Phase 3 design needed a single choice that handled the immediate use
+case (internal API, one signing key per webhook), left a clean path
+to future algorithm rotation, and didn't impose unusual consumer
+code.
+
+## Decision
+
+We will sign every webhook delivery with HMAC-SHA256 using the
+webhook's shared secret, and surface the signature in a Stripe-style
+versioned header:
+
+```
+X-Jetmon-Signature: t=<unix-timestamp>,v1=<hex-encoded HMAC-SHA256>
+```
+
+The HMAC input is `{timestamp}.{request_body}` — concatenating the
+timestamp into the signed material lets consumers reject stale
+deliveries (replay protection) by comparing the `t=` timestamp
+against their own clock.
+
+The `v1=` prefix is **reserved space for a future algorithm
+rotation**. We do not ship multi-algorithm signing today (one secret,
+one algorithm). When rotation is needed, the transition emits both
+`v1=` and `v2=` for a window so consumers can verify whichever they
+support, then `v1=` is retired. Stripe-compatible header parsing
+already supports multiple `v=` values, so consumers don't need to
+update their parser to receive a v2-augmented signature.
+
+Secret storage is plaintext per ADR-0003. The signing key is
+generated by the server (32 random bytes, base32-encoded with the
+`whsec_` prefix) and returned to the operator once on create or
+rotate-secret.
+
+## Consequences
+
+**Wins:**
+- Familiar to anyone who has written a Stripe webhook receiver.
+  Documentation and example code in any major language exists.
+- Replay protection is built in via the timestamp. Consumers reject
+  signatures whose `t=` timestamp is more than ~5 minutes old.
+- Algorithm rotation is a clean future operation — schema column
+  additions only, no header-format churn.
+- Consumer verification is ~10 lines of code in any language with
+  an HMAC primitive.
+
+**Costs:**
+- HMAC requires the consumer to share the secret with us. If the
+  secret leaks, an attacker can mint valid deliveries until the
+  operator rotates it. The `whsec_` prefix is a leak-detection hint
+  but is not a mitigation.
+- Asymmetric signatures (Ed25519) would let us publish a public key
+  and let consumers verify without holding a secret. Considered but
+  rejected for v1 because (a) it requires consumers to handle key
+  rotation via a published JWKS-like endpoint, which adds receiver
+  complexity, and (b) HMAC is what the gateway and current internal
+  consumers already know how to verify. The `v1=` prefix leaves
+  the door open for an Ed25519 `v2=`.
+
+## Alternatives considered
+
+See the table in Context. Stripe-style HMAC was chosen for the
+combination of simplicity, familiarity, and the clean rotation path.
+The Ed25519 option remains attractive if Jetmon ever exposes its +webhooks to customer-managed receivers (per ADR-0002 reversal). + +## Related + +- API.md "Family 4 → Signing and secret rotation" +- `internal/webhooks/webhooks.go` `Sign` function and + `TestSignatureRoundTrip` in the test suite (the contract test that + every consumer's verification depends on). +- ADR-0003 (Plaintext credentials) +- ROADMAP.md "Grace-period webhook secret rotation" — the next + follow-up that builds on the `v1=` reservation. diff --git a/docs/adr/0005-pull-only-delivery-via-event-transitions.md b/docs/adr/0005-pull-only-delivery-via-event-transitions.md new file mode 100644 index 00000000..6463dbd5 --- /dev/null +++ b/docs/adr/0005-pull-only-delivery-via-event-transitions.md @@ -0,0 +1,115 @@ +# 0005 — Pull-only webhook and alerting delivery + +**Status:** Accepted (2026-04-23) + +## Context + +When an event transition happens (a site goes Down, recovers, +escalates from Degraded to SeemsDown, etc.), the webhook delivery +worker and the alerting delivery worker each need to fan that +transition out to matching subscribers. There were two viable shapes: + +- **In-process pub/sub.** The eventstore notifies subscribers + in-process via a Go channel; each worker is a subscriber. The + workers wake on every transition with no polling latency. +- **Pull from `jetmon_event_transitions`.** Workers maintain a + high-water mark in their own progress table and poll the + transitions table on a tick (default 1s). Transitions are + durable; new transitions are picked up on the next poll. + +Pub/sub is faster (no polling latency) and avoids a poll loop. Pull +is slower (up to 1s tick latency) but has several properties that +matter at the architectural scale: + +- The MySQL schema is the bus. No in-process state has to survive + a restart — the high-water mark is in the DB. A worker that + crashes resumes from where it left off. +- Multiple worker instances are trivially supported. Each instance + has its own row in the progress table and polls independently. + (Multi-instance does need row-level claim semantics on the + delivery table; see ADR-0007.) +- Workers don't have to live in the same process as the eventstore + writer. The deliverer-binary extraction (`ROADMAP.md`, + Architectural roadmap) becomes a clean cut: the worker code moves + to its own binary, points at the same MySQL, and continues + working without the eventstore writer being aware. +- "I want to replay deliveries since timestamp T" is a SELECT, not a + bus replay primitive. + +## Decision + +We will use **pull-only delivery** for both the webhook worker +(`internal/webhooks`) and the alerting worker (`internal/alerting`). +Both workers: + +- Maintain a high-water mark of the last `jetmon_event_transitions.id` + they processed, in their own per-instance progress table + (`jetmon_webhook_dispatch_progress`, + `jetmon_alert_dispatch_progress`). +- Poll on a 1-second tick by default for new transition rows after + the mark. +- For each new transition, match against active subscribers and + enqueue per-(subscriber, transition) deliveries. +- Then dispatch with retries on a shared retry ladder + (1m / 5m / 30m / 1h / 6h, then abandon). + +The MySQL schema is the bus between writers (eventstore) and readers +(webhook worker, alerting worker). + +## Consequences + +**Wins:** +- Crash-safe by design. A worker that dies mid-tick resumes + correctly when restarted; in-flight deliveries are caught by the + retry path. 
+- Multi-instance friendly with a small claim-locking addition + (ADR-0007). The basic shape doesn't change. +- Each worker can be extracted into its own binary without + modifying the eventstore. The deliverer-binary roadmap entry + builds on this. +- Replay and audit are SQL queries. +- Consumers of the events table (audit tooling, ad-hoc reporting, + the SLA endpoints) see the same source of truth as the workers. + +**Costs:** +- 1-second tick latency is acceptable for outage notifications but + not for sub-second user-interactive flows. Jetmon's notification + use case tolerates seconds; this would be wrong for, say, a chat + message delivery system. +- Tight tick + lots of subscribers + lots of transitions = noticeable + DB query rate. The per-tick SELECT is bounded by `BatchSize` (200 + by default) and uses indexed columns. Watching this at scale and + tuning the tick is in scope for future operational work. +- The dispatcher and the deliverer are two coupled poll loops in + one process. The webhook worker poll-and-enqueue tick is separate + from the poll-pending-deliveries tick. This is documented in + worker.go but is more complex than a single-loop in-process + pub/sub would be. + +## Alternatives considered + +- **In-process pub/sub.** Faster, simpler in single-process + deployment, but creates an in-process dependency between the + eventstore writer and the workers, breaks the multi-instance + story, and complicates the deliverer-binary extraction. The + latency win does not pay for those costs in our use case. +- **MySQL `LISTEN`/`NOTIFY` (PostgreSQL pattern).** MySQL has no + equivalent. Ruled out. +- **Outbox-pattern with explicit fan-out at write time.** The + eventstore writer would compute matching subscribers and write + per-(subscriber, transition) rows directly. Rejected because + matching changes when subscribers are added or removed; precomputing + at write time would mean a configuration change has to wait for + the next transition before taking effect. Pull-with-match-at-tick + picks up registry changes immediately. + +## Related + +- ADR-0001 (Event-sourced state model) — defines the + `jetmon_event_transitions` table the workers consume. +- ADR-0007 (Soft-lock claim) — the row-level locking that makes + multi-instance pull safe. +- `internal/webhooks/worker.go`, `internal/alerting/worker.go` — the + two pull-loop implementations. +- `ROADMAP.md` "Multi-repo / multi-binary split" — the deliverer + binary that builds on this decision. diff --git a/docs/adr/0006-separate-alerting-and-webhooks-packages.md b/docs/adr/0006-separate-alerting-and-webhooks-packages.md new file mode 100644 index 00000000..c598cd70 --- /dev/null +++ b/docs/adr/0006-separate-alerting-and-webhooks-packages.md @@ -0,0 +1,101 @@ +# 0006 — Separate `internal/alerting` and `internal/webhooks` packages + +**Status:** Accepted (2026-04-25) + +## Context + +Phase 3 shipped `internal/webhooks` — a webhook registry, delivery +worker, and HMAC signing flow. Phase 3.x then needed to ship alert +contacts: managed channels (email, PagerDuty, Slack, Teams) for +human destinations, with site-filter + severity-gate filtering and a +per-hour rate cap. + +The two are noticeably similar at the operational level. Both: + +- Poll `jetmon_event_transitions` on a high-water mark (per ADR-0005). +- Match new transitions against an active registry. +- Enqueue per-(subscriber, transition) deliveries with INSERT IGNORE + on a UNIQUE KEY. 
+- Have a deliver loop with a per-subscriber in-flight cap and a + shared retry ladder (1m / 5m / 30m / 1h / 6h). +- Surface delivery list / manual-retry endpoints through the API. + +The natural temptation was to extend the webhook worker to handle +both — define a `Dispatcher` interface, two concrete implementations +(HMAC-POST for webhooks, transport-rendered for alert contacts), and +share the loop / retry / claim plumbing. + +## Decision + +We will keep `internal/alerting` and `internal/webhooks` as +**separate packages with parallel-but-duplicated structure**, at +least until the deliverer-binary extraction (`ROADMAP.md`). + +The webhook worker keeps its existing shape; the alerting worker is +copy-paste-and-adapt with the alerting-specific concerns layered on +(severity gate, rate cap, transport map, Notification rendering). + +This is a deliberate choice to defer abstraction. Webhooks shipped +first; alerting hadn't been built. We didn't yet know what shape +alerting would actually take — fan-out, escalation, digest mode, +on-call routing are all real possibilities for future alert-contact +features that webhooks doesn't have. Building a shared abstraction +against one known concrete user (webhooks) and one guessed-at user +(alerting) was likely to produce an abstraction that fits neither +well. + +## Consequences + +**Wins:** +- Each package can evolve independently. Webhooks growing a v2 + signature scheme doesn't risk regressing alerting; alerting + growing per-contact escalation doesn't risk regressing the webhook + flow. +- Webhooks went to production first (verified end-to-end before + alerting was started). Coupling them to greenfield code would + have added production risk to a working feature. +- Reading either package is easy: it's all the relevant code in one + spot, no "is this branch reached for webhooks too?" cognitive + load. + +**Costs:** +- ~300 lines of duplicated code: retry schedule constants, in-flight + cap, transactional claim-and-lease pattern (ADR-0007), polling loop + shape, abandon semantics. Bug fixes have to land twice (the claim + fix did exactly that). +- Two metrics namespaces (`webhook_*` vs `alert_*`). Operators have + to remember which is which. +- Drift risk — improvements in one package don't automatically reach + the other. + +These costs are bounded and acceptable in exchange for the +flexibility, but they accrue every time we touch the workers. The +delivery-claim fix is the canary: if every fix is two-pass, the +unification is overdue. + +## Future revisit + +The deliverer-binary extraction is the natural moment to revisit +this. By then we'll have: + +- Two concrete dispatch workers in production with known operational + profiles. +- A clear picture of what alerting actually grew into vs. what + webhooks actually needed. +- WPCOM legacy notifications queued to migrate behind the same + abstraction, providing a third concrete user. + +At that point, factor a `Dispatcher` interface against three known +implementations, not one known plus one guess. The unification work +is documented in `ROADMAP.md` "Multi-repo / multi-binary split → +Revisit point: unify `internal/alerting/` and `internal/webhooks/`." + +## Related + +- ROADMAP.md "Multi-repo / multi-binary split" +- `internal/webhooks/worker.go` and `internal/alerting/worker.go` — + the parallel implementations. +- ADR-0005 (Pull-only delivery) — the shared shape both workers + follow. 
+- ADR-0007 (Soft-lock claim) — a fix that had to land in both + packages, illustrating the duplication cost. diff --git a/docs/adr/0007-soft-lock-vs-row-claim.md b/docs/adr/0007-soft-lock-vs-row-claim.md new file mode 100644 index 00000000..3fd60027 --- /dev/null +++ b/docs/adr/0007-soft-lock-vs-row-claim.md @@ -0,0 +1,124 @@ +# 0007 — Soft-lock claim vs transactional row claim + +**Status:** Accepted (2026-04-25), amended (2026-04-28) + +## Context + +The webhook and alerting deliver loops (per ADR-0005) tick every +1 second. Each tick: + +1. SELECTs up to N pending deliveries whose `next_attempt_at` has + passed. +2. For each, spawns a goroutine to dispatch (subject to a per- + subscriber in-flight cap). +3. The goroutine eventually calls `MarkDelivered` (success) or + `ScheduleRetry` (failure) to update the row's `next_attempt_at`. + +Two correctness questions arise: + +- **Within a single process**, the dispatch goroutine takes seconds + (HTTP timeout default 30s). If the next tick fires while the + dispatch is still in flight, the SELECT returns the same row + again — its status is still `pending` and its `next_attempt_at` + hasn't been updated. The goroutine hasn't finished yet. The + per-subscriber in-flight cap (default 3) bounds this, but lets + up to 3 concurrent dispatches of the same row. Each computes a + retry delay from the same `d.Attempt = N` value, all run + `attempt = attempt + 1` in SQL, and the row ends with + `attempt = N+3`. The retry ladder collapses: we go from 1m to + abandoned in roughly an hour instead of the documented 7h36m. + +- **Across multiple instances**, two jetmon2 processes hitting the + same MySQL would both see the same pending row in their SELECTs + and both spawn dispatch goroutines. We'd send each delivery N+1 + times where N is the number of instances. + +There are two well-known fixes: + +- **Soft lock by pushing `next_attempt_at` out** before the + goroutine starts. The next tick's SELECT (which gates on + `next_attempt_at <= NOW()`) won't match the row again until the + soft lock expires. The dispatch goroutine overwrites the soft + lock with its real result. +- **Transactional row claiming via `SELECT … FOR UPDATE`**. Two + concurrent claim transactions cannot claim the same row; the second + claimant waits briefly for the first transaction to commit, then sees + the updated `next_attempt_at` and skips that in-flight delivery. +- **Transactional row claiming via `SELECT … FOR UPDATE SKIP LOCKED`**. + Same correctness property, but concurrent claimers skip locked rows + rather than waiting. This is better for high delivery concurrency but + requires newer MySQL than the current 5.7+ compatibility target. + +## Decision + +`internal/webhooks/deliveries.go` and `internal/alerting/deliveries.go` +now use a transactional row claim. `ClaimReady` starts a transaction, +selects ready rows with `SELECT … FOR UPDATE`, pushes each selected +row's `next_attempt_at` to NOW + `claimLockDuration` (60 seconds), and +commits. The dispatch goroutine overwrites that in-flight lease with +its real value when it finishes. + +We intentionally use plain `FOR UPDATE` rather than `SKIP LOCKED` so +the delivery claim path remains compatible with the MySQL 5.7+ +production target. The claim transaction is short: it only scans rows, +updates their in-flight lease, and commits before any outbound network +I/O begins. A competing worker may block briefly during that claim, but +it will not duplicate the delivery. 
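+
+For illustration, a minimal sketch of that claim shape (not the actual
+`ClaimReady` in `internal/webhooks/deliveries.go`). The
+`jetmon_webhook_deliveries` table name, the `status` column, and the
+batch parameter are assumptions for the sketch; the `next_attempt_at`
+gate and the 60-second in-flight lease follow the decision above:
+
+```go
+package deliveriessketch
+
+import (
+	"context"
+	"database/sql"
+	"time"
+)
+
+// claimReady locks up to batchSize ready rows, pushes their in-flight
+// lease forward, and returns the claimed IDs. A competing worker blocks
+// briefly on the same locked rows but cannot claim them a second time.
+func claimReady(ctx context.Context, db *sql.DB, batchSize int) ([]int64, error) {
+	tx, err := db.BeginTx(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	defer tx.Rollback() // no-op once Commit has succeeded
+
+	// Select ready pending rows and lock them for the duration of the claim.
+	rows, err := tx.QueryContext(ctx, `
+		SELECT id FROM jetmon_webhook_deliveries
+		WHERE status = 'pending' AND next_attempt_at <= NOW()
+		ORDER BY next_attempt_at
+		LIMIT ?
+		FOR UPDATE`, batchSize)
+	if err != nil {
+		return nil, err
+	}
+	var ids []int64
+	for rows.Next() {
+		var id int64
+		if err := rows.Scan(&id); err != nil {
+			rows.Close()
+			return nil, err
+		}
+		ids = append(ids, id)
+	}
+	rows.Close()
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	// Push the in-flight lease out so later ticks (and other workers) skip
+	// these rows until the dispatch goroutine records a real result.
+	lease := time.Now().Add(60 * time.Second)
+	for _, id := range ids {
+		if _, err := tx.ExecContext(ctx,
+			`UPDATE jetmon_webhook_deliveries SET next_attempt_at = ? WHERE id = ?`,
+			lease, id); err != nil {
+			return nil, err
+		}
+	}
+	return ids, tx.Commit()
+}
+```
+
+A failure anywhere before `Commit` rolls the whole claim back, so no
+row is left half-claimed.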
+ +A crashed goroutine that never updates the row recovers naturally +when the in-flight lease expires after 60s — the row becomes claimable +again. This is intentional rollback behavior. + +## Consequences + +**Wins:** +- The retry ladder behaves as documented; the visible regression that + motivated the original soft lock (~1h-then-abandon instead of 7h36m) + stays fixed. +- Active-active delivery workers no longer duplicate the same pending + delivery row. +- The implementation remains MySQL 5.7+ compatible. +- Crash recovery is automatic — a process kill mid-dispatch leaves + the row recoverable. + +**Costs:** +- `FOR UPDATE` can make one worker wait briefly behind another worker's + claim transaction. This is acceptable while the transaction is kept + short and contains no network I/O. +- `SKIP LOCKED` would use high-concurrency workers more efficiently, but + it is deferred until the production database compatibility target + allows it. +- The in-flight lease duration is a tuning parameter. Too short and a + slow dispatch can race with the next tick; too long and a crashed + goroutine takes longer to recover. 60s is a comfortable margin + for the default 30s + 5s dispatch timeout. + +## Alternatives considered + +- **`SELECT … FOR UPDATE SKIP LOCKED`.** Correct for multi-instance and + avoids blocking behind already-claimed rows, but would raise the MySQL + requirement beyond the current compatibility target. +- **Keep the soft lock only.** Simple and MySQL-compatible, but two + workers can both read the same pending row before either moves + `next_attempt_at`, so active-active delivery still duplicates work. +- **Reduce the per-subscriber in-flight cap to 1.** Doesn't fix + the bug; the second tick still sees the same row, the cap just + prevents the second goroutine from starting. The row stays pending + with stale `next_attempt_at` and the dispatch is delayed by the + cap rather than re-attempted concurrently. Slightly better + observable behavior, same underlying issue. +- **A separate "claim ID" column with CAS semantics.** Similar + correctness with more schema and more code. Not worth the additional + complexity when row locks already provide the claim primitive. + +## Related + +- ADR-0005 (Pull-only delivery) — the worker shape that creates + this concurrency question. +- ADR-0006 (Separate alerting and webhooks packages) — the fix + had to land in both packages, illustrating the duplication cost. +- `internal/webhooks/deliveries.go` `ClaimReady` and the matching + `TestClaimReadyClaimsRowsTransactionally`. +- `internal/alerting/deliveries.go` `ClaimReady` and matching test. +- ROADMAP.md post-v2 platform refinement items for the deliverer split + and active-active delivery. diff --git a/docs/adr/0008-shadow-v2-state-migration.md b/docs/adr/0008-shadow-v2-state-migration.md new file mode 100644 index 00000000..c924e802 --- /dev/null +++ b/docs/adr/0008-shadow-v2-state-migration.md @@ -0,0 +1,79 @@ +# 0008 — Shadow-v2-state migration with legacy status projection + +**Status:** Accepted (2026-04-27) + +## Context + +Jetmon 2 replaces mutable v1 status handling with event-sourced incident +state (`jetmon_events` + `jetmon_event_transitions`). Production consumers, +however, still read the legacy `jetpack_monitor_sites.site_status` and +`last_status_change` fields. A hard cutover would require every consumer +to migrate at the same time as the monitor binary, which is operationally +fragile. 
+ +We considered creating a completely separate v2 sites table, but that +would immediately introduce bidirectional config sync, backfill, and +reconciliation problems. The site/config row is not the hardest part of +the migration; incident state is. + +## Decision + +Jetmon 2 will use a **shadow-v2-state** migration model: + +- `jetmon_events` and `jetmon_event_transitions` are the authoritative + incident state. +- `jetpack_monitor_sites` remains the legacy site/config table during + migration. +- While `LEGACY_STATUS_PROJECTION_ENABLE` is true, event mutations also + update the v1-compatible `site_status` / `last_status_change` + projection in the same transaction. +- The internal API derives current state from active v2 events first. It + falls back to legacy `site_status` only while the legacy projection is + enabled; after disabling projection, "no active v2 event" means `Up` + regardless of stale legacy status values. +- After downstream readers move to the v2 API/event tables, + `LEGACY_STATUS_PROJECTION_ENABLE` can be disabled. V2 incident writes + continue unchanged. + +`DB_UPDATES_ENABLE` remains as a deprecated config alias for older local +configs, but `LEGACY_STATUS_PROJECTION_ENABLE` is the real switch. + +## Consequences + +**Wins:** +- We can deploy v2 without requiring a simultaneous consumer migration. +- Rollback is straightforward: legacy readers still see familiar status + values while projection is enabled. +- The v2 event model becomes the source of truth immediately, so new API, + webhook, alerting, and SLA work does not depend on the legacy status + column. +- Disabling legacy status writes later is a config change, not a schema + rewrite. + +**Costs:** +- During migration, there are two readable state surfaces. The event tables + are authoritative; the legacy status fields are only a projection. +- Projection drift must be treated as a bug while + `LEGACY_STATUS_PROJECTION_ENABLE` is true. +- `jetpack_monitor_sites` still carries site configuration and some v2 + additive bookkeeping columns (`last_checked_at`, `ssl_expiry_date`, + cooldown fields). Disabling legacy status projection does not remove the + table from the system. + +## Alternatives considered + +- **Full v2 sites table now.** Cleaner isolation, but much more migration + machinery: config sync, ownership rules, backfill, reconciliation, and + dual-write failure handling. Deferred until legacy schema constraints + actually block v2 feature work. +- **Only additive migrations on the legacy table.** Simpler schema, but it + keeps incident state conceptually tied to `site_status` and makes the + eventual cutover harder to reason about. +- **Hard cutover to v2 event tables.** Cleanest end state, highest rollout + risk. + +## Related + +- ADR-0001 — Event-sourced state model. +- `EVENTS.md` — event lifecycle and projection invariants. +- `internal/eventstore` — sole writer for event rows and transitions. diff --git a/docs/adr/README.md b/docs/adr/README.md new file mode 100644 index 00000000..db0e60aa --- /dev/null +++ b/docs/adr/README.md @@ -0,0 +1,51 @@ +# Architecture Decision Records + +Short, immutable records of load-bearing decisions in Jetmon 2 — the kind +of "why is it like this" question that has been answered more than once +in code review, on Slack, or in a PR description. + +## Format + +Each ADR is a numbered Markdown file: `NNNN-short-slug.md`. Numbers are +allocated sequentially and never reused. 
The body has four sections: + +- **Status** — Proposed / Accepted / Superseded by ADR-NNNN / Deprecated. +- **Context** — what problem we're solving and the constraints that + shaped the choice. Capture the world as it was when the decision was + made. +- **Decision** — what we chose, in active voice ("We will…"). +- **Consequences** — what falls out of the decision, both the wins and + the costs we accept. Future readers should be able to evaluate + whether the consequences are still acceptable. + +Optional fifth section: **Alternatives considered** when the rejected +options carry useful information for a future revisit. + +## Conventions + +- **ADRs are append-only.** Once accepted, the body is not edited. + Status changes (e.g. "Superseded by ADR-NNNN") are added at the top + with a date. +- **Each ADR captures one decision.** If a topic produces several + decisions, write several ADRs that cross-reference. +- **Write what was true at the time.** If a column has been renamed + since, the ADR keeps the old name with a footnote rather than being + silently updated. Otherwise the historical thread is lost. +- **Cross-link generously.** ADRs frequently depend on each other; + always link to the related decisions. +- **Don't backfill speculatively.** ADRs document decisions that have + actually been made and shipped. Open questions belong in + `ROADMAP.md` until they're resolved. + +## Index + +| # | Title | Status | +|---|-------|--------| +| [0001](0001-event-sourced-state-model.md) | Event-sourced state model with dedicated transitions table | Accepted | +| [0002](0002-internal-only-api-behind-gateway.md) | Internal-only API behind a gateway | Accepted | +| [0003](0003-plaintext-credentials-for-outbound-dispatch.md) | Plaintext credential storage for outbound dispatch | Accepted | +| [0004](0004-stripe-style-hmac-webhook-signatures.md) | Stripe-style HMAC-SHA256 webhook signatures | Accepted | +| [0005](0005-pull-only-delivery-via-event-transitions.md) | Pull-only webhook and alerting delivery | Accepted | +| [0006](0006-separate-alerting-and-webhooks-packages.md) | Separate `internal/alerting` and `internal/webhooks` packages | Accepted | +| [0007](0007-soft-lock-vs-row-claim.md) | Soft-lock claim vs transactional row claim | Accepted | +| [0008](0008-shadow-v2-state-migration.md) | Shadow-v2-state migration with legacy status projection | Accepted | diff --git a/docs/jetmon-deliverer-rollout.md b/docs/jetmon-deliverer-rollout.md new file mode 100644 index 00000000..0d52b0ab --- /dev/null +++ b/docs/jetmon-deliverer-rollout.md @@ -0,0 +1,142 @@ +# Jetmon Deliverer Rollout + +**Status:** Operational runbook for the existing v2 implementation. + +`jetmon-deliverer` is the first standalone process boundary for outbound +delivery. It runs the webhook and alert-contact workers without starting the +monitor round loop, REST API, dashboard, Veriflier server, or bucket ownership. + +The code path is shared with embedded `jetmon2` delivery through +`internal/deliverer`. Delivery rows are claimed with short transactional +`SELECT ... FOR UPDATE` leases, so multiple active delivery workers cannot +claim the same pending delivery row. `DELIVERY_OWNER_HOST` remains useful as a +rollout guard when operators want a deliberately single-owner cutover. 
+ +## Process Responsibilities + +| Process | Owns | Does not own | +|---|---|---| +| `jetmon2` with `API_PORT = 0` | monitor rounds, bucket ownership, checks, WPCOM legacy notifications | REST API, webhook delivery, alert-contact delivery | +| `jetmon2` with `API_PORT > 0` | REST API and, when allowed by `DELIVERY_OWNER_HOST`, embedded delivery | standalone process isolation for delivery | +| `jetmon-deliverer` | webhook delivery and alert-contact delivery | REST API, monitor rounds, bucket ownership, dashboard | + +The production target for the split is: + +- monitor hosts run `jetmon2` with monitor responsibilities only; +- API hosts run `jetmon2` for `/api/v1` traffic but do not own delivery; +- deliverer hosts run `jetmon-deliverer` for outbound dispatch. + +## Package Contents + +A production package for the deliverer should include: + +- `bin/jetmon-deliverer` +- `systemd/jetmon-deliverer.service` or the equivalent deployment-system unit +- the same `config/config.json` schema used by `jetmon2` +- database config via the same `DB_*` environment variables used by `jetmon2` +- alert transport credentials required by the selected `EMAIL_TRANSPORT` +- log routing equivalent to the existing `jetmon2` service + +The binary uses `JETMON_CONFIG` when set, otherwise it reads +`config/config.json`. Use a separate config file per process class when API +hosts and deliverer hosts need different `DELIVERY_OWNER_HOST` values. + +The sample systemd unit expects: + +- `ExecStart=/opt/jetmon2/bin/jetmon-deliverer` +- `EnvironmentFile=-/opt/jetmon2/config/jetmon2.env` +- `JETMON_CONFIG=/opt/jetmon2/config/deliverer.json` + +Keep `deliverer.json` process-specific. Sharing a config file with API-enabled +`jetmon2` hosts is only safe when `DELIVERY_OWNER_HOST` is intentionally set for +all process classes that read it. + +## Single-Owner Cutover + +This is the conservative migration path from embedded delivery to standalone +delivery. + +1. Build and package `bin/jetmon-deliverer`. +2. Install and enable `systemd/jetmon-deliverer.service` or the equivalent + deployment-system unit. +3. Pick one deliverer host and set `DELIVERY_OWNER_HOST` to that host's + hostname in the deliverer config. +4. Keep embedded API hosts from delivering by giving their `jetmon2` process a + config where `DELIVERY_OWNER_HOST` does not match the API hostnames. The + most common pattern is a process-specific config file via `JETMON_CONFIG`. +5. Start `jetmon-deliverer` on the owner host. +6. Confirm logs show `delivery_owner_host="" matched; delivery workers + enabled on this host`. +7. Confirm API-host logs show delivery workers are skipped or idle. +8. Watch `jetmon_webhook_deliveries` and `jetmon_alert_deliveries` for pending + backlog, abandon rate, and retry volume. +9. Stop embedded delivery after the standalone owner has been stable for at + least one normal alerting window. + +Rollback is simple: stop `jetmon-deliverer` and restore the previous embedded +delivery config so one API-enabled `jetmon2` host matches +`DELIVERY_OWNER_HOST` or uses the legacy empty-owner behavior. + +## Active-Active Delivery + +Transactional row claims make active-active delivery safe at the delivery-row +level. The remaining rollout question is process selection: + +- If `DELIVERY_OWNER_HOST` is set, only the exact matching hostname runs + delivery workers. +- If `DELIVERY_OWNER_HOST` is empty, every eligible `jetmon2` process with + `API_PORT > 0` and every `jetmon-deliverer` process runs delivery workers. 
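A minimal sketch of that selection rule; the package, function, and argument names are illustrative, not the actual `internal/deliverer` API:

```go
package deliverer

// shouldRunDeliveryWorkers sketches the process-selection rule described
// above: only API-enabled jetmon2 processes and jetmon-deliverer are
// eligible, and a non-empty DELIVERY_OWNER_HOST restricts delivery to the
// exact matching hostname.
func shouldRunDeliveryWorkers(ownerHost, hostname string, apiPort int, standaloneDeliverer bool) bool {
	if !standaloneDeliverer && apiPort <= 0 {
		return false // monitor-only jetmon2 never runs delivery workers
	}
	if ownerHost == "" {
		return true // empty guard: every eligible process delivers (active-active)
	}
	return ownerHost == hostname // guard set: single-owner cutover
}
```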
+ +Therefore, active-active standalone delivery should use process-specific +configs: + +- API hosts: set `DELIVERY_OWNER_HOST` to a non-matching guard value so they + serve API traffic without dispatching outbound delivery. +- Deliverer hosts: leave `DELIVERY_OWNER_HOST` empty, or run one config per + deliverer host while keeping the guard disabled only for that process class. + +Do not clear `DELIVERY_OWNER_HOST` in a shared config that is also used by +API-enabled `jetmon2` hosts unless the intended state is active-active delivery +from both API hosts and standalone deliverer hosts. + +## Rollout Checks + +Before enabling standalone delivery: + +- `bin/jetmon-deliverer version` reports the expected build. +- `JETMON_CONFIG=/opt/jetmon2/config/deliverer.json bin/jetmon-deliverer + validate-config` passes for the deliverer-specific config while running with + the same `DB_*` environment the service will use. +- `systemd-analyze verify systemd/jetmon-deliverer.service` passes, or the + deployment-system equivalent validates the service definition. +- The process can connect to MySQL using the same schema as `jetmon2`. +- `EMAIL_TRANSPORT` is set to `wpcom` or `smtp` in any environment where real + alert-contact emails should be delivered; `stub` is safe for dry runs. +- `DELIVERY_OWNER_HOST` behavior is validated with one start on each process + class before production traffic. + +During rollout: + +- No sustained growth in `status = 'pending'` rows. +- No unexpected increase in `status = 'abandoned'` rows. +- Logs show only the intended process class running workers. +- Webhook and alert-contact manual retry endpoints still work. + +After rollout: + +- Keep embedded delivery disabled on API hosts unless intentionally testing + active-active behavior. +- Revisit `internal/webhooks` and `internal/alerting` duplication only after + standalone delivery has run long enough to expose real operational drift. +- Plan WPCOM legacy notification migration into this process once alert-contact + parity and recipient inventory are known. + +## Failure Modes + +| Failure | Expected behavior | Operator action | +|---|---|---| +| Deliverer process exits | In-flight leases expire after the claim lock duration; rows become claimable again | Restart deliverer or roll back to embedded delivery | +| Wrong owner hostname | Deliverer starts but idles | Fix `DELIVERY_OWNER_HOST` or process hostname/config | +| Shared config accidentally clears owner guard | API hosts and deliverer hosts may all dispatch | Restore per-process configs; row claims prevent duplicate row claims but extra processes add load | +| Email transport left as `stub` | Email alerts are logged but not sent | Set `EMAIL_TRANSPORT` and transport credentials, then restart | +| Third-party outage | Rows retry on the documented ladder and eventually abandon | Fix destination or provider issue, then use manual retry endpoints | diff --git a/docs/outbound-credential-encryption-plan.md b/docs/outbound-credential-encryption-plan.md new file mode 100644 index 00000000..d50e6c31 --- /dev/null +++ b/docs/outbound-credential-encryption-plan.md @@ -0,0 +1,139 @@ +# Outbound Credential Encryption Plan + +**Status:** Planning note, not an accepted architecture decision. + +ADR-0003 accepts plaintext storage for outbound-dispatch credentials under the +current internal-only v2 threat model. 
This note captures the migration path +for the next hardening step: application-level encryption at rest for webhook +signing secrets and alert-contact destination credentials. + +## Current State + +Two columns contain raw outbound credentials because dispatch needs the +original value at send time: + +- `jetmon_webhooks.secret`: HMAC signing secret used to sign webhook delivery + bodies. +- `jetmon_alert_contacts.destination`: transport-specific JSON containing an + email address, PagerDuty integration key, Slack/Teams webhook URL, or SMTP + password. + +Handlers never return these values after creation or rotation. Normal reads +return only `secret_preview` or `destination_preview`; dispatch workers load the +raw value through separate helper functions. + +## Goals + +- Protect credentials from database-only compromise, read replicas, SQL dumps, + and backup exposure. +- Keep dispatch fast enough that decrypting credentials does not become the + bottleneck during event storms. +- Preserve the existing API contract: create/rotate still return a one-time + secret where applicable, and reads still expose only previews. +- Allow rollback during migration without losing the ability to dispatch + existing webhooks and alert contacts. + +## Non-Goals + +- This does not protect against a fully compromised application host. The + dispatcher must hold decrypt-capable key material in memory to send alerts. +- This does not replace webhook HMAC signing with asymmetric signatures. +- This does not define the public/customer tenant model; that remains a public + API design item. +- This does not encrypt delivery payload history. Payloads contain event data, + not destination credentials. + +## Target Design + +Use envelope-style application encryption with a versioned service data key: + +1. A production key manager exposes the active credential-encryption key and + key id to Jetmon at startup. +2. Jetmon keeps the plaintext data key only in memory. +3. Each credential value is encrypted locally with AES-256-GCM before storage. +4. Each encrypted row stores the ciphertext, nonce, key id, and algorithm. +5. Load helpers decrypt locally using the in-memory key matching the stored key + id. + +This avoids a KMS round trip on every delivery while still protecting database +contents and backups from credential disclosure. If the deployment environment +requires KMS unwrap per key version, do that once at process startup or reload, +not inside the per-delivery hot path. + +Recommended config shape: + +- `CREDENTIAL_ENCRYPTION_MODE`: `plaintext`, `dual_write`, or + `encrypted_required`. +- `CREDENTIAL_ENCRYPTION_KEY_ID`: current key version identifier. +- `CREDENTIAL_ENCRYPTION_KEY_SOURCE`: local dev key, environment-provided key, + or production KMS-backed provider. + +## Schema Path + +Add encrypted columns alongside the existing plaintext columns: + +- `jetmon_webhooks.secret_ciphertext` +- `jetmon_webhooks.secret_nonce` +- `jetmon_webhooks.secret_key_id` +- `jetmon_webhooks.secret_alg` +- `jetmon_alert_contacts.destination_ciphertext` +- `jetmon_alert_contacts.destination_nonce` +- `jetmon_alert_contacts.destination_key_id` +- `jetmon_alert_contacts.destination_alg` + +Keep `secret_preview` and `destination_preview` unchanged. Previews are not +credentials and stay useful for operator display. + +After backfill and one stable release, make the encrypted columns required for +new rows. 
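For reference, a minimal sketch of the local encrypt step that would fill these columns, using AES-256-GCM from the Go standard library. The package name, struct, and helper are hypothetical; the real layout is decided in migration phase 1:

```go
package credcrypto

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"io"
)

// Encrypted mirrors the proposed *_ciphertext / *_nonce / *_key_id / *_alg columns.
type Encrypted struct {
	Ciphertext []byte
	Nonce      []byte
	KeyID      string
	Alg        string
}

// Encrypt seals a credential with the in-memory data key. dataKey must be
// 32 bytes for AES-256; aes.NewCipher rejects other lengths.
func Encrypt(dataKey []byte, keyID string, plaintext []byte) (Encrypted, error) {
	block, err := aes.NewCipher(dataKey)
	if err != nil {
		return Encrypted{}, err
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return Encrypted{}, err
	}
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		return Encrypted{}, err
	}
	return Encrypted{
		Ciphertext: gcm.Seal(nil, nonce, plaintext, nil),
		Nonce:      nonce,
		KeyID:      keyID,
		Alg:        "aes-256-gcm",
	}, nil
}
```

Decryption is the inverse `gcm.Open` call using the stored nonce and the in-memory key matching the stored key id.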
Dropping or nulling the plaintext columns should be a separate +deployment step after production has run in `encrypted_required` mode long +enough to prove there is no fallback traffic. + +## Migration Phases + +1. **Introduce encryption helpers.** Add a small internal package for encrypt + and decrypt operations, with test vectors and explicit key id handling. +2. **Add nullable encrypted columns.** Existing plaintext rows continue to + dispatch without behavior change. +3. **Dual-write new credentials.** Create, update, and rotate paths write both + plaintext and encrypted values. Load helpers prefer encrypted values and + fall back to plaintext. +4. **Backfill existing rows.** A CLI or migration command encrypts existing + plaintext values in batches. It should be idempotent and safe to resume. +5. **Require encrypted reads.** Flip production to `encrypted_required` once + every row has encrypted material. Fallback to plaintext becomes an error and + a metric. +6. **Remove plaintext storage.** In a later release, null or drop the plaintext + columns after backup retention and rollback windows make that safe. + +## Operational Requirements + +- Metrics for encrypt failures, decrypt failures, plaintext fallback count, and + unknown key id count. +- A startup check that fails fast in `dual_write` or `encrypted_required` when + the configured key source is unavailable. +- A key rotation runbook: add new key id, dual-write new data with it, rewrap + old rows, then retire the old key after the rollback window. +- A break-glass procedure for restoring dispatch if the key source is + unavailable. + +## Test Requirements + +- Unit tests for encryption round trips, wrong-key failures, nonce uniqueness, + and malformed ciphertext. +- Repository tests proving create/update/rotate paths write encrypted values in + `dual_write` and `encrypted_required`. +- Dispatch tests proving load helpers prefer encrypted columns and emit errors + instead of silently using plaintext when `encrypted_required` is active. +- Migration/backfill tests proving the backfill is resumable and leaves previews + unchanged. + +## Open Questions + +- Which production key manager should be the first provider? +- Should local development use a generated throwaway key, a config-provided key, + or stay in `plaintext` mode by default? +- What is the minimum stable period in `encrypted_required` before plaintext + columns can be removed? +- Do backups or replica access policies require encrypted columns before public + API work starts, or only before customer-managed secrets are exposed directly? diff --git a/docs/public-api-gateway-tenant-contract.md b/docs/public-api-gateway-tenant-contract.md new file mode 100644 index 00000000..077ee852 --- /dev/null +++ b/docs/public-api-gateway-tenant-contract.md @@ -0,0 +1,136 @@ +# Public API Gateway Tenant Contract + +**Status:** Gateway tenant context and Jetmon-side ownership checks are +implemented for internal gateway-routed requests. Native public exposure remains +deferred. + +This document defines the expected boundary between a customer-facing gateway +and Jetmon if the internal API is exposed through that gateway. It captures the +implemented ownership-enforcement shape and the remaining public-API +prerequisites before Jetmon could be exposed without that gateway. + +ADR-0002 remains the current implementation decision: Jetmon's API is internal +only, every caller is a trusted service, and tenant isolation lives outside +Jetmon. 
This contract describes the next shape if a gateway turns Jetmon into a +customer-facing product surface. + +## Boundary Summary + +The gateway owns customer identity. Jetmon owns monitoring correctness. + +| Concern | Gateway responsibility | Jetmon responsibility | +|---|---|---| +| Customer authentication | Authenticate the customer, user, team, app, or service token. | Accept only trusted internal service credentials. | +| Tenant identity | Derive a stable tenant id from the authenticated customer context. Never accept tenant ids from the public request body. | Accept gateway-derived tenant context only from the trusted gateway consumer and use it for ownership checks. | +| Public authorization | Enforce customer plan, feature flags, public scopes, and role membership. | Enforce internal `read` / `write` / `admin` service scopes and resource relationship invariants. | +| Resource ownership | Decide whether the public caller may see or mutate a site, webhook, alert contact, or delivery. | Enforce site mappings and owner columns for gateway-routed resources while preserving unscoped internal-operator behavior. | +| Error vocabulary | Collapse or sanitize 403/404 and internal errors for customers. | Return operator-accurate internal errors to the gateway. | +| Rate limits | Apply customer fairness, abuse, plan, and route-specific limits. | Keep per-service-key rate limits for internal service protection. | +| Auditing | Record public actor, tenant, OAuth/client app, and gateway decision details. | Record internal consumer, Jetmon request id, and any gateway-derived tenant context that reaches Jetmon. | + +## Request Context + +When the gateway calls Jetmon on behalf of a customer, it should authenticate +with its normal internal Bearer token and attach public request context as +headers. These headers are not trusted customer input; they are assertions from +the gateway service. + +| Header | Required | Meaning | +|---|---|---| +| `X-Jetmon-Tenant-ID` | Yes for customer-routed requests | Stable opaque tenant id derived by the gateway. | +| `X-Jetmon-Actor-ID` | Yes when a human or customer app initiated the request | Stable opaque actor id for audit correlation. | +| `X-Jetmon-Public-Scopes` | Yes for public API calls | Space-separated public scopes that the gateway has already granted, such as `sites:read events:read`. | +| `X-Jetmon-Gateway-Request-ID` | Yes | Gateway request id to correlate public support tickets with Jetmon logs. | +| `X-Jetmon-Plan` | Optional | Plan/tier snapshot useful for audit and abuse investigations. | + +Jetmon should only honor these headers from the configured gateway consumer +identity. A non-gateway API key sending public-context headers should be +rejected. Jetmon currently treats `consumer_name = "gateway"` as that trusted +gateway identity, requires tenant id, public scopes, and gateway request id +when any public-context header is present, and records accepted gateway context +in API audit metadata. + +## Tenant Checks + +The gateway should remain the first and strongest tenant boundary. Jetmon-side +tenant enforcement is still useful as defense in depth and becomes required if +Jetmon ever serves customers without a gateway in front. + +| Route family | Gateway checks | Jetmon checks before public exposure | +|---|---|---| +| Sites list/detail | Caller can access each `blog_id`; plan allows monitoring data. | Implemented through `jetmon_site_tenants` when gateway context is present. 
| +| Event/history/SLA reads | Caller can access the parent site; requested time range and filters are allowed. | Implemented through the parent site's `jetmon_site_tenants` mapping. | +| Site/check writes | Caller can manage the parent site; plan permits monitor mutation and trigger-now. | Implemented through the parent site's `jetmon_site_tenants` mapping; orchestrator/eventstore invariants remain unchanged. | +| Webhook CRUD/deliveries | Caller can manage tenant-owned webhooks; endpoint URL policy is satisfied. | Implemented with `owner_tenant_id`; delivery visibility and manual retry are derived through the owned webhook. | +| Alert contact CRUD/deliveries | Caller can manage tenant-owned alert contacts; transport is allowed by plan. | Implemented with `owner_tenant_id`; delivery visibility, manual retry, and send-test are derived through the owned contact. | +| Manual retries/tests | Caller owns the parent webhook/contact and route-specific abuse limits allow the operation. | Implemented by verifying parent ownership before enqueueing, retrying, or dispatching. | +| Health, `/me`, OpenAPI | Gateway decides whether to expose them at all. | No tenant filtering; these remain service introspection routes unless a public variant is designed. | + +## Ownership Model + +The tenant id should be opaque to Jetmon. It should not encode a WPCOM user id, +blog id, plan, or account type. If those concepts change, the gateway can keep +the same tenant id stable. + +For customer-owned resources created in Jetmon, prefer explicit ownership: + +- `jetmon_site_tenants(tenant_id, blog_id)` for monitored-site visibility +- `jetmon_webhooks.owner_tenant_id` +- `jetmon_alert_contacts.owner_tenant_id` +- delivery visibility derived from the owned webhook/contact +- idempotency cache scoped by `(tenant_id, api_key_id, idempotency_key)` if the + cache is made durable or shared across public tenants + +For monitored sites, do not assume ownership is always one-to-one with +`blog_id`. Jetmon now enforces site visibility for gateway-routed requests with +the `jetmon_site_tenants(tenant_id, blog_id)` mapping table, which preserves +room for shared ownership or gateway-derived delegation. + +Do not use `created_by` as ownership. It records the internal API key consumer +that created a row and is audit-only. + +## Public Error Shape + +Jetmon can keep returning honest internal errors to the gateway. The gateway is +responsible for public-safe behavior: + +- return 404 instead of 403 when a customer tries to access a resource outside + their tenant +- redact DB stages, verifier names, hostnames, SQL messages, and internal + delivery errors +- keep Jetmon's `request_id` or gateway request id available for support + escalation + +If Jetmon later implements a native public mode, that mode should have its own +error rendering path instead of weakening the internal API's operator-friendly +errors. + +## Migration Path + +1. Keep the v2 internal API unchanged while the gateway is the only public + entry point. +2. Request-context parsing for the headers above is implemented in the API + middleware and restricted to the gateway API key. Accepted context is logged + in audit metadata; non-gateway keys asserting it are rejected. +3. Gateway-routed webhook and alert-contact CRUD now set/filter + `owner_tenant_id`. Delivery history and manual retry visibility are derived + through the owned webhook/contact, and alert-contact send-test verifies the + contact owner before loading the destination credential. +4. 
Gateway-routed site, event/history, SLA/stat, and trigger-now routes now use + `jetmon_site_tenants` for defense-in-depth ownership checks. +5. Backfill/reconcile `jetmon_site_tenants` from the gateway's source of truth + before any customer traffic depends on direct Jetmon enforcement. The initial + operator path is `jetmon2 site-tenants import --file `, where the CSV is + `tenant_id,blog_id`; pruning stale mappings still depends on an agreed + gateway export/reconciliation policy. +6. Add public-scope and redaction tests route family by route family. +7. Only after those checks exist, consider exposing Jetmon without a gateway. + +## Non-Goals + +- This does not add customer authentication to Jetmon. +- This does not change the current internal `read` / `write` / `admin` API key + scopes. +- This does not decide the customer-facing OAuth, app-token, or WordPress.com + auth model. +- This does not require tenant columns before the v2 production rollout. diff --git a/docs/v1-to-v2-pinned-rollout.md b/docs/v1-to-v2-pinned-rollout.md new file mode 100644 index 00000000..d87245eb --- /dev/null +++ b/docs/v1-to-v2-pinned-rollout.md @@ -0,0 +1,169 @@ +# v1 to v2 Pinned Bucket Rollout + +**Status:** Production migration runbook for the first v1-to-v2 cutover. + +This rollout replaces one v1 static-bucket host with one v2 host pinned to the +same inclusive bucket range. It avoids mixed ownership between v1 static config +and v2 `jetmon_hosts` dynamic ownership during the riskiest part of the +migration. + +## Why Pinned Mode Exists + +v1 and v2 do not share a bucket ownership protocol: + +- v1 uses static `BUCKET_NO_MIN` / `BUCKET_NO_MAX` config per host. +- v2 normally uses the `jetmon_hosts` table with heartbeat and reclaim. + +During a mixed fleet rollout, dynamic v2 ownership cannot know which buckets are +still covered by v1. Pinned mode keeps each replacement host on the exact range +its v1 predecessor owned and disables `jetmon_hosts` ownership for that v2 host. + +## Configuration + +Prefer explicit pinned keys in v2 config: + +```json +{ + "PINNED_BUCKET_MIN": 0, + "PINNED_BUCKET_MAX": 99, + "LEGACY_STATUS_PROJECTION_ENABLE": true, + "API_PORT": 0 +} +``` + +The legacy v1 names `BUCKET_NO_MIN` and `BUCKET_NO_MAX` are accepted as aliases +for pinned mode. If both forms are present, they must describe the same range. + +While pinned: + +- the host checks only `PINNED_BUCKET_MIN <= bucket_no <= PINNED_BUCKET_MAX` +- the host does not claim or heartbeat `jetmon_hosts` +- shutdown does not release a `jetmon_hosts` row +- `BUCKET_TOTAL`, `BUCKET_TARGET`, and `BUCKET_HEARTBEAT_GRACE_SEC` still + validate, but dynamic ownership does not use them on that host + +## Preflight + +1. Confirm the v1 fleet's static bucket ranges are complete and non-overlapping. +2. Build all v2 binaries and run `make test`, `make test-race`, and `make all`. +3. Apply additive migrations before the cutover: + + ```bash + ./jetmon2 migrate + ``` + +4. Keep `LEGACY_STATUS_PROJECTION_ENABLE=true` so legacy readers continue to see + `jetpack_monitor_sites.site_status` and `last_status_change`. +5. Keep `API_PORT=0` on monitor hosts during initial replacement unless the API + and delivery owner plan has been explicitly approved. +6. Run `./jetmon2 validate-config` with the prepared v2 config and confirm it + prints the pinned rollout preflight command plus the projection-drift command. +7. Verify Veriflier endpoints, WPCOM auth, StatsD, log paths, and config reload + behavior in staging. 
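Before walking through the cutover, a minimal sketch of the pinned ownership rule from the Configuration section above (inclusive bounds; the type and function names are illustrative, not the production code):

```go
package orchestrator

// pinnedRange mirrors PINNED_BUCKET_MIN / PINNED_BUCKET_MAX.
type pinnedRange struct {
	Min, Max int // inclusive bounds
}

// owns reports whether a pinned host should check a bucket:
// PINNED_BUCKET_MIN <= bucket_no <= PINNED_BUCKET_MAX.
func (r pinnedRange) owns(bucketNo int) bool {
	return bucketNo >= r.Min && bucketNo <= r.Max
}

// sameRange checks the alias rule: when both the explicit pinned keys and
// the legacy BUCKET_NO_MIN / BUCKET_NO_MAX form are present, they must
// describe the same range.
func sameRange(pinned, legacy pinnedRange) bool {
	return pinned.Min == legacy.Min && pinned.Max == legacy.Max
}
```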
+ +## Per-Host Cutover + +For each v1 host: + +1. Record the host name and v1 bucket range. +2. Prepare the v2 config with the same pinned range. +3. Before stopping v1, run `./jetmon2 validate-config` and confirm it reports: + - `legacy_status_projection=enabled` + - `bucket_ownership=pinned range=-` + - `rollout_preflight=./jetmon2 rollout pinned-check` + - `rollout_drift_report=./jetmon2 rollout projection-drift` +4. Stop the v1 process for that host. +5. Start the v2 process. +6. Run the pinned rollout preflight: + + ```bash + ./jetmon2 rollout pinned-check + ``` + + This check fails if the host is not in pinned mode, legacy projection writes + are disabled, the current host still has a `jetmon_hosts` ownership row, or + the active sites in the pinned range have projection drift. It also prints the + active site count for the range. If projection drift is reported, list the + mismatched rows before continuing: + + ```bash + ./jetmon2 rollout projection-drift + ``` + + If checking a config before running on the final hostname, pass the expected + host id explicitly: + + ```bash + ./jetmon2 rollout pinned-check --host= + ``` + +7. Verify the process logs: + - `legacy_status_projection=enabled` + - `bucket_ownership=pinned range=-` + - `orchestrator: using pinned buckets -` +8. If `DASHBOARD_PORT` is enabled, open the operator dashboard and confirm: + - rollout ownership shows the pinned range + - legacy projection is enabled + - delivery workers are disabled unless the delivery owner plan explicitly + enables them on this host + - dependency health is green for MySQL, configured Verifliers, log/stats + directory writes, and StatsD initialization; WPCOM must not show an open + circuit +9. Watch one full check round for that bucket range. +10. Confirm: + - checks are running only for the pinned range + - Veriflier confirmation works + - WPCOM notifications retain the v1 payload shape + - `jetmon_events` and `jetmon_event_transitions` receive event mutations + - `jetpack_monitor_sites.site_status` projection updates when enabled + - no unexpected rows are claimed in `jetmon_hosts` by the pinned host + +## Rollback + +Rollback is host-local: + +1. Stop the v2 process. +2. Restart the original v1 process with the same `BUCKET_NO_MIN` / + `BUCKET_NO_MAX` config. +3. Verify v1 checks the range again. + +The v2 migrations are additive, and legacy projection writes keep the old status +fields meaningful while `LEGACY_STATUS_PROJECTION_ENABLE=true`, so rollback does +not require schema rollback. + +## Transition to Dynamic v2 Ownership + +After every monitor host is on v2 and stable in pinned mode: + +1. Confirm no v1 monitor hosts remain active. +2. Plan a coordinated dynamic-ownership cutover. Pinned hosts do not write + `jetmon_hosts`, so avoid leaving a long-lived mixed fleet where some v2 + hosts are pinned and others use dynamic ownership. +3. Remove `PINNED_BUCKET_MIN` / `PINNED_BUCKET_MAX` (and any legacy + `BUCKET_NO_MIN` / `BUCKET_NO_MAX` aliases) from the v2 monitor configs. +4. Restart the v2 monitor hosts in the approved deployment window. +5. Run `./jetmon2 validate-config` and confirm it reports + `rollout_preflight=./jetmon2 rollout dynamic-check`. +6. Run the dynamic ownership preflight: + + ```bash + ./jetmon2 rollout dynamic-check + ``` + + This check fails if pinned mode is still configured, legacy projection writes + are disabled, `jetmon_hosts` rows are missing, stale, inactive, overlapping, + or gapped, or the legacy projection has drifted. 
+ + To inspect projection drift details across the dynamic range: + + ```bash + ./jetmon2 rollout projection-drift --limit=100 + ``` + +7. Continue using the normal v2 rolling-update process from `README.md`. + +Do not run a mixed configuration where some v1 hosts still own static ranges +while unpinned v2 hosts use dynamic `jetmon_hosts` ownership. Also avoid a +long-lived pinned-v2/dynamic-v2 mix: dynamic hosts cannot see pinned hosts in +`jetmon_hosts`, so the fleet can overlap checks even though it should not create +coverage gaps. diff --git a/docs/v3-probe-agent-architecture-options.md b/docs/v3-probe-agent-architecture-options.md new file mode 100644 index 00000000..cdeb7c46 --- /dev/null +++ b/docs/v3-probe-agent-architecture-options.md @@ -0,0 +1,402 @@ +# Jetmon v3 Probe-Agent Architecture Options + +## Status + +Planning note. This is not an accepted architecture decision and should not +block the v2 production migration. + +The intended migration order is: + +```text +v1 production + -> v2 compatibility rewrite + -> v2 production hardening and measurement + -> v3 probe-agent architecture in shadow mode + -> v3 gradual production cutover +``` + +The v3 architecture should be revisited only after v2 has been deployed to +production and has enough operating data to make the tradeoffs concrete. + +## Why Revisit This After v2? + +The currently implemented v2 shape keeps Jetmon close to the existing mental +model: main monitor servers own bucketed primary checks, and Verifliers provide +independent confirmation before a site moves from `Seems Down` to `Down`. + +That is the right near-term migration target because it limits product and +operational change while the Go rewrite, eventstore, API, alerting, and +delivery workers stabilize. + +After v2 is stable, the main question is whether Jetmon should keep the +separate "main monitor" and "Veriflier" roles or evolve into a more general +probe platform where regional agents execute both routine checks and +confirmation jobs while a central decision layer owns incident state. + +## Data To Gather During v2 + +The v3 decision should be based on production data from v2, especially: + +- Time from first local failure to `Seems Down`. +- Time from `Seems Down` to confirmed `Down`. +- False alarm rate by failure class. +- Veriflier agreement and disagreement rates. +- Veriflier latency and timeout rates by region/provider. +- Number of incidents where local failure was not confirmed remotely. +- Number of incidents where remote confirmation was mixed by region. +- Number of monitor-side failures that should be modeled as `Unknown`. +- Cost and capacity profile for primary checks versus confirmation checks. +- Operator pain points around explaining why an incident was or was not + confirmed. +- Customer-impacting notification parity against the legacy WPCOM path. + +Without this data, v3 risks optimizing for hypothetical problems instead of +the production failure modes that actually matter. + +The v2 monitor emits the first production evidence slice through StatsD: +`detection.*` timing metrics cover the local-failure to lifecycle-state path, +class-specific `detection.*..count` counters split confirmed, +false-alarm, and probe-cleared outcomes, and `verifier.host..*` counters +split RPC health and confirm/disagree votes by configured Veriflier host. Use +the host naming convention to preserve region/provider information in those +series. 
Legacy WPCOM notification parity is tracked through +`wpcom.notification.*` counters for attempts, deliveries, retries, errors, and +final failures, with status-specific splits for `down`, `running`, and +`confirmed_down`. + +## Current v2 Baseline + +The v2 flow is: + +```text +Up + -> Seems Down local probe failed, retry/confirmation in progress + -> Down enough independent Verifliers confirmed + -> Resolved local or confirmed recovery +``` + +The v2 deployment shape is: + +- Main `jetmon2` servers claim site buckets and perform primary checks. +- Failed local checks open or update eventstore incidents. +- After enough local failures, the orchestrator asks Verifliers to confirm. +- Veriflier agreement promotes the same event from `Seems Down` to `Down`. +- Veriflier disagreement closes the event as a false alarm. +- Legacy WPCOM notification behavior remains preserved around the confirmed + `Down` and recovery transitions. + +This is intentionally conservative and remains the correct v2 production +target. + +## Question 1: Is There A Better Flow Than Seems Down To Confirmed Down? + +Externally, the `Seems Down -> Down -> Resolved` lifecycle is still a good +operator and customer-facing model. It is simple, useful, and maps well to the +current false-positive reduction goal. + +Internally, v3 may need a richer decision model: + +| Internal state | Meaning | +|---|---| +| `Suspected` | First failure observed, not enough evidence yet | +| `Confirming` | Confirmation probes are in flight | +| `ConfirmedGlobalDown` | Enough independent regions agree the site is down | +| `RegionalFailure` | Some regions fail while others succeed | +| `Unknown` | Monitor/probe infrastructure cannot produce trustworthy evidence | +| `FalseAlarm` | The original failure was not confirmed | + +Those internal states do not need to leak directly to every consumer. They can +still project to the v2 public states where compatibility matters: + +```text +Suspected / Confirming -> Seems Down +ConfirmedGlobalDown -> Down +RegionalFailure -> Degraded or Regional Failure, depending on taxonomy +Unknown -> Unknown, not downtime +FalseAlarm -> Resolved with reason=false_alarm +``` + +## Question 2: Should Main Servers And Verifliers Remain Separate? + +For v2, yes. It keeps the migration safe. + +For v3, probably not as a permanent distinction. A better long-term shape is +likely: + +- **Decision layer:** owns scheduling, quorum rules, eventstore writes, and + notification decisions. +- **Probe agents:** execute check jobs from one or more regions/providers. +- **Durable job bus:** stores check jobs, claims, results, retries, and agent + heartbeats. + +In that model, "primary check" and "confirmation check" are job types, not +separate binary roles. + +## Question 3: What Does The Current Shape Leave On The Table? + +Compared with a probe-agent architecture, the current v2 shape gives up or +delays: + +- Continuous regional baseline data. +- First-class regional or partial-outage classification. +- Durable confirmation jobs independent of orchestrator memory. +- Cleaner backpressure and retry accounting for probe work. +- Easier addition of new probe types, such as synthetic flows or TCP checks. +- Per-vantage-point latency and SLA reporting. +- Better explanations for mixed outcomes. +- More flexible capacity planning, because every probe agent can execute any + supported check job. + +These are good v3 motivations, but they should not be bundled into the v2 +production cutover. 
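For concreteness, the internal-to-public projection from Question 1 could be as small as a lookup. The string forms below are illustrative only, not a settled v3 taxonomy:

```go
package decision

// projectPublicState sketches the Question 1 mapping from richer v3
// internal states onto the v2-compatible public lifecycle.
func projectPublicState(internal string) string {
	switch internal {
	case "Suspected", "Confirming":
		return "Seems Down"
	case "ConfirmedGlobalDown":
		return "Down"
	case "RegionalFailure":
		return "Degraded" // or "Regional Failure", depending on taxonomy
	case "Unknown":
		return "Unknown" // explicitly not downtime
	case "FalseAlarm":
		return "Resolved" // with reason=false_alarm
	default:
		return internal
	}
}
```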
+ +## Candidate Architectures To Revisit + +### Candidate 1: v2 Plus Stronger Probe Metadata + +Keep the main-server-plus-Veriflier structure, but record richer evidence for +every vote: probe identity, region, provider, timing, failure class, and +decision inputs. + +Flow: + +```text +main check fails -> Seems Down +local retries fail -> Veriflier confirmation +event transition stores each vote and decision input +quorum -> Down, disagreement -> false_alarm +``` + +Pros: + +- Lowest risk after v2. +- Improves support and operator explainability quickly. +- Produces better data for future v3 decisions. +- Minimal deployment changes. + +Cons: + +- Keeps the main/Veriflier split. +- Remote perspective is still mostly gathered after suspicion. +- Does not fully support regional baseline or synthetic-check expansion. + +When to choose: + +- v2 works well, but operators mainly need better evidence and dashboards. + +### Candidate 2: Peer Probe Mesh + +Every monitor host can perform both primary and confirmation probes. A host +that detects a failure asks peer monitor hosts in other regions/providers for +confirmation. + +Flow: + +```text +bucket owner detects failure +bucket owner requests peer probes +peer votes return directly to owner +owner writes event transition and notifications +``` + +Pros: + +- Removes a separate Veriflier fleet. +- Uses monitor capacity more evenly. +- Simpler than introducing a full scheduler and job bus. +- Can become region-aware if monitor hosts are deployed across regions. + +Cons: + +- Monitor hosts become more coupled. +- A monitor-host incident can affect both primary and confirmation capacity. +- Harder to enforce anti-correlation rules unless host metadata is rigorous. +- Still centers decisions on the bucket owner. + +When to choose: + +- The Veriflier fleet is operationally awkward, but a full scheduler is too + large a step. + +### Candidate 3: Central Scheduler Plus Regional Probe Agents + +This is the leading v3 candidate. + +A scheduler/decision service owns check plans and durable jobs. Regional probe +agents claim jobs, execute checks, and write results. The decision layer +evaluates evidence and writes eventstore transitions. + +Flow: + +```text +scheduler creates routine probe jobs +regional probe agents claim and execute jobs +decision layer evaluates results +first failure opens Suspected/Seems Down +confirmation jobs are scheduled to independent agents +quorum/classifier promotes to Down, RegionalFailure, Unknown, or false_alarm +eventstore writes remain the source of truth +delivery workers notify from event transitions +``` + +Pros: + +- Best long-term separation of concerns. +- Durable jobs replace in-memory confirmation state. +- Probe agents are simple and horizontally scalable. +- Primary and confirmation checks use the same execution path. +- Supports regional status, confidence scoring, per-vantage SLA, synthetic + checks, and richer diagnostics. +- Lets Jetmon add new probe types without reshaping the decision layer. + +Cons: + +- Largest implementation effort. +- Requires durable job claiming and result deduplication. +- Requires careful shadow-mode comparison before becoming authoritative. +- More operational components than the v2 single-binary shape. + +When to choose: + +- v2 production data shows confirmation latency, regional ambiguity, or + operator explainability are material problems. +- Jetmon needs regional SLAs, synthetic checks, or more probe types. 
+- The team is ready to invest in a platform-shaped monitoring architecture. + +### Candidate 4: Always-On Multi-Region Quorum + +Every monitored site is checked from multiple regions continuously or +near-continuously. Incidents are classified from live quorum rather than a +second-stage confirmation request. + +Flow: + +```text +regional agents check every site on schedule +decision layer continuously evaluates current regional evidence +multi-region failure -> Down +single-region failure -> RegionalFailure or Degraded +probe infrastructure failure -> Unknown +``` + +Pros: + +- Fastest confirmation. +- Best regional visibility. +- Strong latency and SLA data by vantage point. +- Removes most of the "wait for retries, then confirm" gap. + +Cons: + +- Much higher check volume. +- More customer-site load. +- Higher cost. +- Needs careful aggregation to avoid noisy partial failures. +- Probably too expensive for every site unless tiers or sampling are added. + +When to choose: + +- Product requirements demand regional SLA visibility or very fast + confirmation, and the cost profile is acceptable. + +### Candidate 5: External Probes Plus Site/WPCOM Signals + +Combine external probe evidence with internal or site-side signals such as +Jetpack heartbeat, wp-admin reachability, cron heartbeat, or WPCOM-side +activity. + +Flow: + +```text +external probe failure opens Suspected/Seems Down +decision layer checks corroborating Jetpack/WPCOM/site signals +external + internal evidence agree -> Down +external failure only -> Confirming, RegionalFailure, or Unknown +internal signal missing only -> agent/heartbeat problem, not customer downtime +``` + +Pros: + +- Better distinction between site downtime, regional network issues, and + monitor-side failures. +- Better support diagnostics. +- Can reduce false positives. +- Complements any probe-agent architecture. + +Cons: + +- Depends on signal quality from Jetpack/WPCOM/site-side systems. +- Heartbeats can be delayed for reasons other than downtime. +- More data contracts outside Jetmon. +- Not a replacement for external probing. + +When to choose: + +- v2 data shows many false positives that external probes alone cannot + classify confidently, or support needs better causal diagnostics. + +## Current Recommendation + +Do not change the v2 production target. + +The recommended path is: + +1. Finish and deploy v2 with the current main-server-plus-Veriflier shape. +2. Stabilize v2 in production. +3. Gather the data listed above. +4. Revisit these candidates with real evidence. +5. If the evidence supports it, evolve toward Candidate 3. + +Candidate 3 is the current best long-term option because it turns Jetmon into a +durable probe platform instead of a monitor-plus-confirmers system. It offers +the best path to regional status, richer classification, synthetic checks, and +more predictable scaling. + +Candidate 1 is the likely first step regardless of final v3 choice because +better probe metadata makes every other option easier to evaluate. + +## Candidate 3 Migration Sketch After v2 Stabilizes + +The v2-to-v3 migration should be incremental: + +1. **Add probe metadata to v2 results.** + Record region, provider, probe identity, timing, failure class, and vote + details for local and Veriflier checks. + +2. **Introduce durable confirmation jobs.** + Keep primary checks in v2, but replace direct Veriflier fanout with jobs in + MySQL. Existing Verifliers or new probe agents claim jobs and write results. + +3. 
**Generalize Veriflier into probe-agent.** + Make confirmation an execution mode of a generic agent rather than a + special-purpose service. + +4. **Run primary probe jobs in shadow mode.** + Schedule routine check jobs for a small cohort but do not let them affect + customer-visible state. + +5. **Compare v2 decisions to v3 decisions.** + Measure detection latency, confirmation latency, false positives, missed + incidents, regional disagreement, and WPCOM notification parity. + +6. **Cut over confirmation decisions.** + Let the job-based confirmation path become authoritative for + `Seems Down -> Down` after it matches or beats v2 behavior in shadow mode. + +7. **Cut over primary checks gradually.** + Move bucket ranges or site cohorts from direct v2 primary checks to scheduled + probe jobs. + +8. **Retire the main/Veriflier distinction.** + The central decision layer owns scheduling and state; probe agents execute + jobs from any supported check type. + +## Non-Goals Until After v2 Is Stable + +- Do not skip directly from v1 to v3. +- Do not change customer-visible notification semantics during the v2 cutover. +- Do not replace eventstore as the source of truth. +- Do not require a new queueing system before MySQL-backed job claiming has + been evaluated. +- Do not make regional classifications customer-visible until the taxonomy and + support story are ready. diff --git a/go.mod b/go.mod index 7fa0009f..bab269db 100644 --- a/go.mod +++ b/go.mod @@ -3,3 +3,5 @@ module github.com/Automattic/jetmon go 1.22 require github.com/go-sql-driver/mysql v1.7.1 + +require github.com/DATA-DOG/go-sqlmock v1.5.2 diff --git a/go.sum b/go.sum index fd7ae076..fd205b6d 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,5 @@ +github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= +github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= +github.com/kisielk/sqlstruct v0.0.0-20201105191214-5f3e10d3ab46/go.mod h1:yyMNCyc/Ib3bDTKd379tNMpB/7/H5TjM2Y9QJ5THLbE= diff --git a/internal/alerting/alerting.go b/internal/alerting/alerting.go new file mode 100644 index 00000000..9e8fd3b5 --- /dev/null +++ b/internal/alerting/alerting.go @@ -0,0 +1,275 @@ +// Package alerting manages outbound alert contact subscriptions and the +// delivery worker that fans transitions out to managed transports. +// +// An alert contact is a registration that says "send a Jetmon-rendered +// notification through this transport when matching transitions fire." +// A delivery is one alert contact firing — created when an event +// transition matches the contact's site_filter and severity gate, then +// dispatched by the background worker through the configured transport. +// +// Where webhooks (internal/webhooks) deliver a raw signed event stream +// for the consumer to render, alert contacts deliver a Jetmon-rendered +// notification through a transport Jetmon owns end-to-end (subject lines, +// PagerDuty severity mapping, Slack Block Kit rendering, etc.). +// +// See API.md "Family 5" for the public design and ROADMAP.md for deferred +// items (SMS, OpsGenie, alert grouping, WPCOM-flow migration). 
+package alerting + +import ( + "context" + "encoding/json" + "errors" + "time" + + "github.com/Automattic/jetmon/internal/eventstore" +) + +// Storage note: destination credentials are stored in plaintext in +// jetmon_alert_contacts.destination. Same rationale as +// jetmon_webhooks.secret — outbound dispatch needs the raw value at +// every send. A hash is useless because we'd have to recover the +// original to call the transport. Encryption at rest with a master +// key is on ROADMAP.md as a future hardening step. + +// Status enumerates the lifecycle states of a delivery row. +type Status string + +const ( + StatusPending Status = "pending" + StatusDelivered Status = "delivered" + StatusFailed Status = "failed" + StatusAbandoned Status = "abandoned" +) + +// Transport identifies which managed channel a contact delivers through. +// New transports are added (never renamed) so existing contact configs +// don't break — the ENUM in the migration mirrors this set. +type Transport string + +const ( + TransportEmail Transport = "email" + TransportPagerDuty Transport = "pagerduty" + TransportSlack Transport = "slack" + TransportTeams Transport = "teams" +) + +// AllTransports returns the canonical set of transport identifiers. +// Used by validators (a contact's transport must be one of these) and +// by docs/listings. +func AllTransports() []Transport { + return []Transport{TransportEmail, TransportPagerDuty, TransportSlack, TransportTeams} +} + +// IsValidTransport reports whether s is one of the known transports. +func IsValidTransport(s string) bool { + for _, t := range AllTransports() { + if string(t) == s { + return true + } + } + return false +} + +// Sentinel errors returned by package functions. +var ( + ErrContactNotFound = errors.New("alerting: alert contact not found") + ErrDeliveryNotFound = errors.New("alerting: alert delivery not found") + ErrInvalidTransport = errors.New("alerting: unknown transport") + ErrInvalidSeverity = errors.New("alerting: unknown severity") +) + +// AlertContact is the in-memory shape of a jetmon_alert_contacts row. +// The raw destination credential is never stored here — it's loaded +// separately by the worker via LoadDestination so it can't leak through +// serialization of the AlertContact struct. +type AlertContact struct { + ID int64 + Label string + Active bool + OwnerTenantID *string + Transport Transport + DestinationPreview string // last 4 chars of the credential, for display + SiteFilter SiteFilter // empty = match all sites + MinSeverity uint8 // matches eventstore.Severity* (0=Up..4=Down) + MaxPerHour int // 0 = unlimited + CreatedBy string + CreatedAt time.Time + UpdatedAt time.Time +} + +// SiteFilter restricts deliveries to a fixed list of sites. Empty +// SiteIDs (or a nil filter) means "match all sites." Same shape as +// webhooks.SiteFilter — kept as a separate type so alerting can evolve +// independently of the webhooks package. +type SiteFilter struct { + SiteIDs []int64 `json:"site_ids,omitempty"` +} + +// Matches reports whether this contact should fire for a given +// transition. The filter rule is: +// +// site_id ∈ site_filter.site_ids (or site_filter empty → all sites) +// AND ( +// new_severity >= min_severity // escalation / sustained +// OR (prev_severity >= min_severity // recovery from a +// AND new_severity == SeverityUp) // previously-paging state +// ) +// +// Within-band changes (e.g. Down → SeemsDown when min_severity=Warning) +// fire as flickers. The per-contact max_per_hour cap absorbs the noise. 
+// +// Recovery firing requires both prev and new severity because Matches +// doesn't see the transition reason — it can't distinguish "resolved" +// from "transitioned through Up by accident." Practically, transitions +// to Up only happen on real recoveries. +func (c *AlertContact) Matches(prevSeverity, newSeverity uint8, siteID int64) bool { + if !c.Active { + return false + } + if len(c.SiteFilter.SiteIDs) > 0 && !containsInt64(c.SiteFilter.SiteIDs, siteID) { + return false + } + if newSeverity >= c.MinSeverity { + return true + } + if prevSeverity >= c.MinSeverity && newSeverity == eventstore.SeverityUp { + return true + } + return false +} + +// CreateInput is the data needed to insert a new alert contact. +// Label, Transport, and Destination are required; everything else has +// sensible defaults (Active=true, SiteFilter empty=match-all, +// MinSeverity=SeverityDown, MaxPerHour=60). +type CreateInput struct { + Label string + Active *bool // nil → true + OwnerTenantID *string + Transport Transport + Destination json.RawMessage // transport-specific shape; validated per transport + SiteFilter SiteFilter + MinSeverity *uint8 // nil → SeverityDown + MaxPerHour *int // nil → 60 + CreatedBy string +} + +// UpdateInput is a sparse patch. nil fields are unchanged. An explicit +// empty SiteFilter clears the filter (restores match-all). Transport +// and Destination cannot be updated together via PATCH — change of +// transport requires creating a new contact (the destination shape +// is transport-specific and validating cross-transport changes is +// more brittle than just deleting+recreating). +type UpdateInput struct { + Label *string + Active *bool + Destination json.RawMessage // transport-specific; nil = unchanged + SiteFilter *SiteFilter + MinSeverity *uint8 + MaxPerHour *int +} + +// Notification is the rendered shape passed to a Transport.Send +// implementation. The worker builds this once per delivery from the +// frozen-at-fire-time payload; transports translate it into their +// channel-specific representation. +// +// IsTest=true is used by the send-test endpoint to flag synthetic +// notifications. Transports may use this to add a banner ("This is a +// Jetmon test notification") or to choose dedup keys that won't +// collide with real alerts. +type Notification struct { + SiteID int64 + SiteURL string + EventID int64 + EventType string + Severity uint8 + SeverityName string + State string + Reason string + Timestamp time.Time + DedupKey string + Recovery bool + IsTest bool +} + +// Dispatcher defines the contract every concrete transport +// (email/pagerduty/slack/teams) implements. Send is responsible for +// translating Notification into the channel-specific request and +// reporting the outcome. +// +// statusCode is the channel's idiomatic status (HTTP code for +// HTTP-based transports, SMTP reply class for email — e.g. 250 +// becomes 250). responseBody is a truncated summary suitable for +// storing in jetmon_alert_deliveries.last_response (max 2048 chars; +// the worker truncates if needed). +// +// Returning err != nil means the dispatch failed in a way the worker +// should retry on the standard ladder. Returning err == nil with a +// non-2xx-equivalent status also schedules a retry; the worker +// treats both as failures for retry purposes but distinguishes them +// for diagnostics. 
+type Dispatcher interface { + Send(ctx context.Context, destination json.RawMessage, n Notification) (statusCode int, responseBody string, err error) +} + +// SeverityName returns the canonical string form of a severity uint8, +// matching the constants in internal/eventstore. Used by the API +// layer (which returns severity names in JSON) and by transport +// renderers (PagerDuty severity field, email subjects, Slack message +// bodies). +// +// Returns "" for unknown values rather than panicking — some callers +// pass user-supplied input that hasn't been validated yet. +func SeverityName(s uint8) string { + switch s { + case eventstore.SeverityUp: + return "Up" + case eventstore.SeverityWarning: + return "Warning" + case eventstore.SeverityDegraded: + return "Degraded" + case eventstore.SeveritySeemsDown: + return "SeemsDown" + case eventstore.SeverityDown: + return "Down" + default: + return "" + } +} + +// SeverityFromName parses a severity string back into the eventstore +// uint8 constant. Used by the API layer to validate min_severity +// inputs from JSON. Returns ErrInvalidSeverity on unknown names. +func SeverityFromName(s string) (uint8, error) { + switch s { + case "Up": + return eventstore.SeverityUp, nil + case "Warning": + return eventstore.SeverityWarning, nil + case "Degraded": + return eventstore.SeverityDegraded, nil + case "SeemsDown": + return eventstore.SeveritySeemsDown, nil + case "Down": + return eventstore.SeverityDown, nil + default: + return 0, ErrInvalidSeverity + } +} + +// AllSeverityNames returns the full ordered list of severity names, +// least-to-most severe. Used by docs and validators. +func AllSeverityNames() []string { + return []string{"Up", "Warning", "Degraded", "SeemsDown", "Down"} +} + +func containsInt64(haystack []int64, needle int64) bool { + for _, v := range haystack { + if v == needle { + return true + } + } + return false +} diff --git a/internal/alerting/alerting_test.go b/internal/alerting/alerting_test.go new file mode 100644 index 00000000..9fd29e97 --- /dev/null +++ b/internal/alerting/alerting_test.go @@ -0,0 +1,159 @@ +package alerting + +import ( + "testing" + + "github.com/Automattic/jetmon/internal/eventstore" +) + +func TestSeverityNameRoundTrip(t *testing.T) { + for _, name := range AllSeverityNames() { + s, err := SeverityFromName(name) + if err != nil { + t.Errorf("SeverityFromName(%q) returned error: %v", name, err) + continue + } + if got := SeverityName(s); got != name { + t.Errorf("round-trip %q → %d → %q failed", name, s, got) + } + } +} + +func TestSeverityNameUnknown(t *testing.T) { + if got := SeverityName(99); got != "" { + t.Errorf("SeverityName(99) = %q, want empty string", got) + } + if _, err := SeverityFromName("Bogus"); err == nil { + t.Error("SeverityFromName(\"Bogus\") should error") + } +} + +func TestIsValidTransport(t *testing.T) { + for _, valid := range []string{"email", "pagerduty", "slack", "teams"} { + if !IsValidTransport(valid) { + t.Errorf("IsValidTransport(%q) = false, want true", valid) + } + } + for _, bad := range []string{"", "Email", "sms", "opsgenie", "EMAIL"} { + if IsValidTransport(bad) { + t.Errorf("IsValidTransport(%q) = true, want false", bad) + } + } +} + +// TestMatchesInactive verifies an inactive contact never fires regardless +// of severity — a deactivated contact should be invisible to the worker. 
+func TestMatchesInactive(t *testing.T) { + c := &AlertContact{ + Active: false, + MinSeverity: eventstore.SeverityWarning, + } + if c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, 1) { + t.Error("inactive contact should not match") + } +} + +// TestMatchesEmptySiteFilter verifies an empty site filter matches all sites +// — the documented "empty = match all" semantic. +func TestMatchesEmptySiteFilter(t *testing.T) { + c := &AlertContact{ + Active: true, + MinSeverity: eventstore.SeverityDown, + // SiteFilter is zero value → empty SiteIDs → match all. + } + for _, siteID := range []int64{1, 42, 99999} { + if !c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, siteID) { + t.Errorf("empty site filter should match site %d", siteID) + } + } +} + +func TestMatchesSiteFilterWhitelist(t *testing.T) { + c := &AlertContact{ + Active: true, + SiteFilter: SiteFilter{SiteIDs: []int64{42, 99}}, + MinSeverity: eventstore.SeverityDown, + } + if !c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, 42) { + t.Error("site 42 should match") + } + if !c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, 99) { + t.Error("site 99 should match") + } + if c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, 7) { + t.Error("site 7 should not match (not in whitelist)") + } +} + +// TestMatchesSeverityGate covers the escalation half of the gate: +// new_severity >= min_severity fires, regardless of prev_severity. +func TestMatchesSeverityGate(t *testing.T) { + c := &AlertContact{ + Active: true, + MinSeverity: eventstore.SeverityDegraded, // 2 + } + cases := []struct { + prev, next uint8 + want bool + desc string + }{ + {eventstore.SeverityUp, eventstore.SeverityWarning, false, "Up→Warning, both below gate"}, + {eventstore.SeverityUp, eventstore.SeverityDegraded, true, "Up→Degraded, crosses gate"}, + {eventstore.SeverityWarning, eventstore.SeverityDegraded, true, "Warning→Degraded, crosses gate"}, + {eventstore.SeverityDegraded, eventstore.SeveritySeemsDown, true, "Degraded→SeemsDown, within gated band"}, + {eventstore.SeveritySeemsDown, eventstore.SeverityDown, true, "SeemsDown→Down, within gated band"}, + } + for _, tc := range cases { + got := c.Matches(tc.prev, tc.next, 0) + if got != tc.want { + t.Errorf("%s: Matches(%d,%d) = %v, want %v", tc.desc, tc.prev, tc.next, got, tc.want) + } + } +} + +// TestMatchesRecovery covers the recovery half: a transition back to Up +// fires only if prev_severity was at or above the gate. +func TestMatchesRecovery(t *testing.T) { + c := &AlertContact{ + Active: true, + MinSeverity: eventstore.SeverityDegraded, // 2 + } + cases := []struct { + prev, next uint8 + want bool + desc string + }{ + {eventstore.SeverityDown, eventstore.SeverityUp, true, "Down→Up: previously paged, now recovered"}, + {eventstore.SeverityDegraded, eventstore.SeverityUp, true, "Degraded→Up: at-gate recovery fires"}, + {eventstore.SeverityWarning, eventstore.SeverityUp, false, "Warning→Up: never paged, no recovery to send"}, + {eventstore.SeverityUp, eventstore.SeverityUp, false, "Up→Up: no transition meaning"}, + } + for _, tc := range cases { + got := c.Matches(tc.prev, tc.next, 0) + if got != tc.want { + t.Errorf("%s: Matches(%d,%d) = %v, want %v", tc.desc, tc.prev, tc.next, got, tc.want) + } + } +} + +// TestMatchesAllDimensions verifies the AND across all dimensions: +// a contact must satisfy active, site_filter, and severity gate. 
+func TestMatchesAllDimensions(t *testing.T) { + c := &AlertContact{ + Active: true, + SiteFilter: SiteFilter{SiteIDs: []int64{42}}, + MinSeverity: eventstore.SeverityDown, // 4 + } + // All dimensions match. + if !c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, 42) { + t.Error("all dimensions matching should fire") + } + // Wrong site, severity matches. + if c.Matches(eventstore.SeverityUp, eventstore.SeverityDown, 7) { + t.Error("wrong site should not fire") + } + // Right site, severity below gate (and no recovery: prev was below gate too). + if c.Matches(eventstore.SeverityUp, eventstore.SeverityWarning, 42) { + t.Error("severity below gate should not fire when prev also below") + } +} diff --git a/internal/alerting/contacts.go b/internal/alerting/contacts.go new file mode 100644 index 00000000..bf7d213a --- /dev/null +++ b/internal/alerting/contacts.go @@ -0,0 +1,459 @@ +package alerting + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "strings" +) + +// Create inserts a new alert contact and returns the persisted record. +// Unlike webhooks.Create (which returns the one-time raw secret), the +// destination is supplied by the caller — they already know the +// credential, so there's nothing to return-once. Subsequent reads +// expose only DestinationPreview. +func Create(ctx context.Context, db *sql.DB, in CreateInput) (*AlertContact, error) { + if err := validateCreateInput(in); err != nil { + return nil, err + } + active := true + if in.Active != nil { + active = *in.Active + } + minSev := uint8(4) // SeverityDown + if in.MinSeverity != nil { + minSev = *in.MinSeverity + } + maxPerHour := 60 + if in.MaxPerHour != nil { + maxPerHour = *in.MaxPerHour + } + preview := destinationPreview(in.Transport, in.Destination) + siteFilterJSON, _ := json.Marshal(in.SiteFilter) + + res, err := db.ExecContext(ctx, ` + INSERT INTO jetmon_alert_contacts + (label, active, owner_tenant_id, transport, destination, destination_preview, + site_filter, min_severity, max_per_hour, created_by) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + in.Label, boolToTinyint(active), nullableString(in.OwnerTenantID), string(in.Transport), []byte(in.Destination), preview, + siteFilterJSON, minSev, maxPerHour, in.CreatedBy, + ) + if err != nil { + return nil, fmt.Errorf("alerting: insert contact: %w", err) + } + id, err := res.LastInsertId() + if err != nil { + return nil, fmt.Errorf("alerting: last insert id: %w", err) + } + return Get(ctx, db, id) +} + +// Get returns a single contact by id, or ErrContactNotFound. Does not +// load the destination credential — use LoadDestination for that. +func Get(ctx context.Context, db *sql.DB, id int64) (*AlertContact, error) { + return get(ctx, db, id, "") +} + +// GetForTenant returns a single contact owned by ownerTenantID. It hides +// cross-tenant rows behind ErrContactNotFound. +func GetForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (*AlertContact, error) { + if ownerTenantID == "" { + return nil, errors.New("alerting: owner tenant id is required") + } + return get(ctx, db, id, ownerTenantID) +} + +func get(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (*AlertContact, error) { + q := selectContactSQL + " WHERE id = ?" + args := []any{id} + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + row := db.QueryRowContext(ctx, q, args...) 
+ c, err := scanContactRow(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrContactNotFound + } + return nil, err + } + return c, nil +} + +// List returns all contacts ordered by id ASC. +func List(ctx context.Context, db *sql.DB) ([]AlertContact, error) { + return list(ctx, db, "") +} + +// ListForTenant returns only contacts owned by ownerTenantID. +func ListForTenant(ctx context.Context, db *sql.DB, ownerTenantID string) ([]AlertContact, error) { + if ownerTenantID == "" { + return nil, errors.New("alerting: owner tenant id is required") + } + return list(ctx, db, ownerTenantID) +} + +func list(ctx context.Context, db *sql.DB, ownerTenantID string) ([]AlertContact, error) { + q := selectContactSQL + args := []any{} + if ownerTenantID != "" { + q += " WHERE owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + q += " ORDER BY id ASC" + rows, err := db.QueryContext(ctx, q, args...) + if err != nil { + return nil, fmt.Errorf("alerting: list contacts: %w", err) + } + defer rows.Close() + var out []AlertContact + for rows.Next() { + c, err := scanContactRow(rows) + if err != nil { + return nil, err + } + out = append(out, *c) + } + return out, rows.Err() +} + +// ListActive returns only contacts with active=1. Used by the delivery +// dispatcher; inactive contacts don't get matched against new +// transitions. +func ListActive(ctx context.Context, db *sql.DB) ([]AlertContact, error) { + rows, err := db.QueryContext(ctx, selectContactSQL+" WHERE active = 1 ORDER BY id ASC") + if err != nil { + return nil, fmt.Errorf("alerting: list active contacts: %w", err) + } + defer rows.Close() + var out []AlertContact + for rows.Next() { + c, err := scanContactRow(rows) + if err != nil { + return nil, err + } + out = append(out, *c) + } + return out, rows.Err() +} + +// Update applies a partial patch and returns the updated contact. The +// transport itself cannot be changed via PATCH (the destination shape +// is transport-specific and validating cross-transport changes is +// brittle); callers who want to switch transport delete and re-create. +func Update(ctx context.Context, db *sql.DB, id int64, in UpdateInput) (*AlertContact, error) { + return update(ctx, db, id, "", in) +} + +// UpdateForTenant updates a contact only when it is owned by ownerTenantID. +func UpdateForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string, in UpdateInput) (*AlertContact, error) { + if ownerTenantID == "" { + return nil, errors.New("alerting: owner tenant id is required") + } + return update(ctx, db, id, ownerTenantID, in) +} + +func update(ctx context.Context, db *sql.DB, id int64, ownerTenantID string, in UpdateInput) (*AlertContact, error) { + // Validate input fields that don't depend on the existing row first + // (fail fast — no DB hit on obviously bad PATCH bodies). + if in.Label != nil && *in.Label == "" { + return nil, errors.New("alerting: label must not be empty") + } + if in.MinSeverity != nil { + if err := validateSeverity(*in.MinSeverity); err != nil { + return nil, err + } + } + if in.MaxPerHour != nil && *in.MaxPerHour < 0 { + return nil, errors.New("alerting: max_per_hour must be >= 0") + } + + // The destination shape is transport-specific, so we need the + // existing row to know what to validate against. 
+ current, err := get(ctx, db, id, ownerTenantID) + if err != nil { + return nil, err + } + if in.Destination != nil { + if err := validateDestination(current.Transport, in.Destination); err != nil { + return nil, err + } + } + + clauses := []string{} + args := []any{} + if in.Label != nil { + clauses = append(clauses, "label = ?") + args = append(args, *in.Label) + } + if in.Active != nil { + clauses = append(clauses, "active = ?") + args = append(args, boolToTinyint(*in.Active)) + } + if in.Destination != nil { + clauses = append(clauses, "destination = ?", "destination_preview = ?") + args = append(args, []byte(in.Destination), destinationPreview(current.Transport, in.Destination)) + } + if in.SiteFilter != nil { + b, _ := json.Marshal(*in.SiteFilter) + clauses = append(clauses, "site_filter = ?") + args = append(args, b) + } + if in.MinSeverity != nil { + clauses = append(clauses, "min_severity = ?") + args = append(args, *in.MinSeverity) + } + if in.MaxPerHour != nil { + clauses = append(clauses, "max_per_hour = ?") + args = append(args, *in.MaxPerHour) + } + + if len(clauses) == 0 { + return current, nil + } + + args = append(args, id) + q := "UPDATE jetmon_alert_contacts SET " + strings.Join(clauses, ", ") + " WHERE id = ?" + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + if _, err := db.ExecContext(ctx, q, args...); err != nil { + return nil, fmt.Errorf("alerting: update contact: %w", err) + } + return get(ctx, db, id, ownerTenantID) +} + +// Delete removes an alert contact. Existing rows in +// jetmon_alert_deliveries are intentionally NOT cascaded — they +// remain for audit and manual retry, mirroring webhooks.Delete. +func Delete(ctx context.Context, db *sql.DB, id int64) error { + return deleteContact(ctx, db, id, "") +} + +// DeleteForTenant removes a contact only when it is owned by ownerTenantID. +func DeleteForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) error { + if ownerTenantID == "" { + return errors.New("alerting: owner tenant id is required") + } + return deleteContact(ctx, db, id, ownerTenantID) +} + +func deleteContact(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) error { + q := "DELETE FROM jetmon_alert_contacts WHERE id = ?" + args := []any{id} + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + res, err := db.ExecContext(ctx, q, args...) + if err != nil { + return fmt.Errorf("alerting: delete contact: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return ErrContactNotFound + } + return nil +} + +// LoadDestination returns the raw destination JSON for a contact, +// used by the worker to call the configured Dispatcher. Kept as a +// separate function (not a field on AlertContact) so the credential +// can't leak through serialization of the AlertContact struct. +func LoadDestination(ctx context.Context, db *sql.DB, id int64) (json.RawMessage, error) { + return loadDestination(ctx, db, id, "") +} + +// LoadDestinationForTenant loads a contact credential only when it is owned +// by ownerTenantID. 
+func LoadDestinationForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (json.RawMessage, error) { + if ownerTenantID == "" { + return nil, errors.New("alerting: owner tenant id is required") + } + return loadDestination(ctx, db, id, ownerTenantID) +} + +func loadDestination(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (json.RawMessage, error) { + var raw []byte + q := `SELECT destination FROM jetmon_alert_contacts WHERE id = ?` + args := []any{id} + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + err := db.QueryRowContext(ctx, + q, args..., + ).Scan(&raw) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrContactNotFound + } + return nil, fmt.Errorf("alerting: load destination: %w", err) + } + return raw, nil +} + +// validateCreateInput enforces the required-fields contract for Create. +func validateCreateInput(in CreateInput) error { + if in.Label == "" { + return errors.New("alerting: label is required") + } + if !IsValidTransport(string(in.Transport)) { + return fmt.Errorf("%w: %q", ErrInvalidTransport, in.Transport) + } + if err := validateDestination(in.Transport, in.Destination); err != nil { + return err + } + if in.MinSeverity != nil { + if err := validateSeverity(*in.MinSeverity); err != nil { + return err + } + } + if in.MaxPerHour != nil && *in.MaxPerHour < 0 { + return errors.New("alerting: max_per_hour must be >= 0") + } + return nil +} + +// validateDestination checks that the destination JSON has the shape +// the transport requires. Validates field presence, not field +// well-formedness — a malformed Slack webhook URL surfaces as a +// transport error at delivery time, which is fine because operators +// can use the send-test endpoint to catch it before real alerts fire. +func validateDestination(t Transport, dest json.RawMessage) error { + if len(dest) == 0 { + return errors.New("alerting: destination is required") + } + switch t { + case TransportEmail: + var d emailDestination + if err := json.Unmarshal(dest, &d); err != nil { + return fmt.Errorf("alerting: destination not valid JSON: %w", err) + } + if d.Address == "" { + return errors.New("alerting: email destination requires an address") + } + case TransportPagerDuty: + var d pagerDutyDestination + if err := json.Unmarshal(dest, &d); err != nil { + return fmt.Errorf("alerting: destination not valid JSON: %w", err) + } + if d.IntegrationKey == "" { + return errors.New("alerting: pagerduty destination requires an integration_key") + } + case TransportSlack: + var d slackDestination + if err := json.Unmarshal(dest, &d); err != nil { + return fmt.Errorf("alerting: destination not valid JSON: %w", err) + } + if d.WebhookURL == "" { + return errors.New("alerting: slack destination requires a webhook_url") + } + case TransportTeams: + var d teamsDestination + if err := json.Unmarshal(dest, &d); err != nil { + return fmt.Errorf("alerting: destination not valid JSON: %w", err) + } + if d.WebhookURL == "" { + return errors.New("alerting: teams destination requires a webhook_url") + } + default: + return fmt.Errorf("%w: %q", ErrInvalidTransport, t) + } + return nil +} + +// validateSeverity rejects severity values outside the eventstore range. +// Anything 0..4 is accepted; 5+ is reserved per the eventstore comment +// for future "worse than down" signals but isn't usable as a gate yet. 
+func validateSeverity(s uint8) error { + if s > 4 { + return fmt.Errorf("%w: %d (allowed 0-4)", ErrInvalidSeverity, s) + } + return nil +} + +// destinationPreview returns the last 4 chars of the credential field +// for the given transport. Used as a UI hint so operators can identify +// a contact without exposing the full credential. +func destinationPreview(t Transport, dest json.RawMessage) string { + var s string + switch t { + case TransportEmail: + var d emailDestination + _ = json.Unmarshal(dest, &d) + s = d.Address + case TransportPagerDuty: + var d pagerDutyDestination + _ = json.Unmarshal(dest, &d) + s = d.IntegrationKey + case TransportSlack: + var d slackDestination + _ = json.Unmarshal(dest, &d) + s = d.WebhookURL + case TransportTeams: + var d teamsDestination + _ = json.Unmarshal(dest, &d) + s = d.WebhookURL + } + if len(s) <= 4 { + return s + } + return s[len(s)-4:] +} + +// boolToTinyint mirrors the helper in internal/webhooks/webhooks.go. +func boolToTinyint(b bool) int { + if b { + return 1 + } + return 0 +} + +const selectContactSQL = ` + SELECT id, label, active, owner_tenant_id, transport, destination_preview, + site_filter, min_severity, max_per_hour, + created_by, created_at, updated_at + FROM jetmon_alert_contacts` + +type rowScanner interface { + Scan(...any) error +} + +func scanContactRow(s rowScanner) (*AlertContact, error) { + var ( + c AlertContact + active uint8 + ownerTenantID sql.NullString + transport string + siteFilterJSON sql.NullString + ) + if err := s.Scan( + &c.ID, &c.Label, &active, &ownerTenantID, &transport, &c.DestinationPreview, + &siteFilterJSON, &c.MinSeverity, &c.MaxPerHour, + &c.CreatedBy, &c.CreatedAt, &c.UpdatedAt, + ); err != nil { + return nil, err + } + c.Active = active == 1 + if ownerTenantID.Valid { + c.OwnerTenantID = &ownerTenantID.String + } + c.Transport = Transport(transport) + if siteFilterJSON.Valid && siteFilterJSON.String != "" { + _ = json.Unmarshal([]byte(siteFilterJSON.String), &c.SiteFilter) + } + return &c, nil +} + +func nullableString(s *string) any { + if s == nil { + return nil + } + return *s +} diff --git a/internal/alerting/deliveries.go b/internal/alerting/deliveries.go new file mode 100644 index 00000000..7ee560ca --- /dev/null +++ b/internal/alerting/deliveries.go @@ -0,0 +1,359 @@ +package alerting + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "time" +) + +// Delivery is the in-memory shape of a jetmon_alert_deliveries row. +type Delivery struct { + ID int64 + AlertContactID int64 + TransitionID int64 + EventID int64 + EventType string + Severity uint8 + Payload json.RawMessage + Status Status + Attempt int + NextAttemptAt *time.Time + LastStatusCode *int + LastResponse *string + LastAttemptAt *time.Time + DeliveredAt *time.Time + CreatedAt time.Time +} + +// EnqueueInput carries everything needed to insert a delivery row. +type EnqueueInput struct { + AlertContactID int64 + TransitionID int64 + EventID int64 + EventType string + Severity uint8 + Payload json.RawMessage +} + +// Enqueue inserts a pending delivery with attempt=0 and +// next_attempt_at=now. Uses INSERT IGNORE against the +// (alert_contact_id, transition_id) UNIQUE KEY so concurrent +// dispatchers don't create duplicate deliveries. Returns the new id, +// or 0 if the row was a duplicate. 
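+//
+// Callers can treat a duplicate as a silent no-op rather than an
+// error; illustratively:
+//
+//	id, err := Enqueue(ctx, db, in)
+//	if err == nil && id == 0 {
+//		// another dispatcher already enqueued this (contact, transition) pair
+//	}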
+func Enqueue(ctx context.Context, db *sql.DB, in EnqueueInput) (int64, error) { + res, err := db.ExecContext(ctx, ` + INSERT IGNORE INTO jetmon_alert_deliveries + (alert_contact_id, transition_id, event_id, event_type, severity, + payload, status, attempt, next_attempt_at) + VALUES (?, ?, ?, ?, ?, ?, 'pending', 0, CURRENT_TIMESTAMP)`, + in.AlertContactID, in.TransitionID, in.EventID, in.EventType, in.Severity, + []byte(in.Payload), + ) + if err != nil { + return 0, fmt.Errorf("alerting: enqueue delivery: %w", err) + } + id, err := res.LastInsertId() + if err != nil { + return 0, fmt.Errorf("alerting: last insert id: %w", err) + } + if affected, _ := res.RowsAffected(); affected == 0 { + return 0, nil + } + return id, nil +} + +// claimLockDuration is how far ClaimReady pushes next_attempt_at out +// when it claims a row. Must outlast the worker's per-delivery wall +// clock so an in-flight goroutine has time to write its real result +// before the in-flight lease expires. The default DispatchTimeout is +// 30s with a 5s buffer; 60s gives comfortable headroom. A crashed +// goroutine that never updates the row recovers naturally when the +// lease expires. +const claimLockDuration = 60 * time.Second + +// ClaimReady returns up to limit pending deliveries whose +// next_attempt_at is in the past. It claims rows with SELECT ... FOR UPDATE +// inside a transaction so active-active delivery workers cannot claim the same +// row. Each claimed row then gets an in-flight lease by pushing next_attempt_at +// to NOW + claimLockDuration before the transaction commits, so subsequent +// ticks don't re-claim a row whose dispatch is still in-flight. The dispatch +// goroutine overwrites next_attempt_at with its real value when it finishes. +// +// Without the in-flight lease, the deliver loop's 1-second tick re-claims +// any in-flight row up to the per-contact cap, producing concurrent +// dispatches that inflate the attempt counter and effectively skip +// retry-schedule steps. The lease prevents that after the transaction commits. +func ClaimReady(ctx context.Context, db *sql.DB, limit int) ([]Delivery, error) { + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return nil, fmt.Errorf("alerting: begin claim: %w", err) + } + committed := false + defer func() { + if !committed { + _ = tx.Rollback() + } + }() + + rows, err := tx.QueryContext(ctx, ` + SELECT id, alert_contact_id, transition_id, event_id, event_type, severity, payload, + status, attempt, next_attempt_at, last_status_code, last_response, + last_attempt_at, delivered_at, created_at + FROM jetmon_alert_deliveries + WHERE status = 'pending' + AND (next_attempt_at IS NULL OR next_attempt_at <= CURRENT_TIMESTAMP) + ORDER BY next_attempt_at ASC + LIMIT ? + FOR UPDATE`, limit) + if err != nil { + return nil, fmt.Errorf("alerting: claim ready: %w", err) + } + var claimed []Delivery + for rows.Next() { + d, err := scanDeliveryRow(rows) + if err != nil { + rows.Close() + return nil, err + } + claimed = append(claimed, *d) + } + if err := rows.Err(); err != nil { + rows.Close() + return nil, err + } + if err := rows.Close(); err != nil { + return nil, fmt.Errorf("alerting: close claim rows: %w", err) + } + + lockUntil := time.Now().Add(claimLockDuration).UTC() + for i := range claimed { + res, err := tx.ExecContext(ctx, ` + UPDATE jetmon_alert_deliveries + SET next_attempt_at = ? + WHERE id = ? 
+ AND status = 'pending'`, + lockUntil, claimed[i].ID) + if err != nil { + return nil, fmt.Errorf("alerting: claim row %d: %w", claimed[i].ID, err) + } + affected, err := res.RowsAffected() + if err != nil { + return nil, fmt.Errorf("alerting: claim row %d rows affected: %w", claimed[i].ID, err) + } + if affected != 1 { + return nil, fmt.Errorf("alerting: claim row %d affected %d rows, want 1", claimed[i].ID, affected) + } + } + if err := tx.Commit(); err != nil { + return nil, fmt.Errorf("alerting: commit claim: %w", err) + } + committed = true + return claimed, nil +} + +// MarkDelivered records a successful delivery. +func MarkDelivered(ctx context.Context, db *sql.DB, id int64, statusCode int, responseBody string) error { + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_alert_deliveries + SET status = 'delivered', + last_status_code = ?, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + delivered_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = NULL + WHERE id = ?`, + statusCode, truncate(responseBody, 2048), id) + if err != nil { + return fmt.Errorf("alerting: mark delivered: %w", err) + } + return nil +} + +// MarkSuppressed records a delivery that was dropped by the per-contact +// rate cap. The delivery never went out and is terminal — there's no +// useful retry because by the time the cap re-opens, the alert is +// stale. Status='abandoned' with a distinguishing last_response so +// operators can see why. +func MarkSuppressed(ctx context.Context, db *sql.DB, id int64, reason string) error { + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_alert_deliveries + SET status = 'abandoned', + last_status_code = 429, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = NULL + WHERE id = ?`, truncate(reason, 2048), id) + if err != nil { + return fmt.Errorf("alerting: mark suppressed: %w", err) + } + return nil +} + +// ScheduleRetry bumps the attempt counter and sets next_attempt_at +// per the retry schedule. abandon=true marks the row terminal instead. +func ScheduleRetry(ctx context.Context, db *sql.DB, id int64, statusCode int, responseBody string, nextAttempt time.Time, abandon bool) error { + if abandon { + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_alert_deliveries + SET status = 'abandoned', + last_status_code = ?, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = NULL + WHERE id = ?`, + statusCode, truncate(responseBody, 2048), id) + if err != nil { + return fmt.Errorf("alerting: abandon: %w", err) + } + return nil + } + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_alert_deliveries + SET last_status_code = ?, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = ? + WHERE id = ?`, + statusCode, truncate(responseBody, 2048), nextAttempt.UTC(), id) + if err != nil { + return fmt.Errorf("alerting: schedule retry: %w", err) + } + return nil +} + +// GetDelivery returns a single delivery row by id. 
+func GetDelivery(ctx context.Context, db *sql.DB, id int64) (*Delivery, error) { + row := db.QueryRowContext(ctx, ` + SELECT id, alert_contact_id, transition_id, event_id, event_type, severity, payload, + status, attempt, next_attempt_at, last_status_code, last_response, + last_attempt_at, delivered_at, created_at + FROM jetmon_alert_deliveries + WHERE id = ?`, id) + d, err := scanDeliveryRow(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrDeliveryNotFound + } + return nil, err + } + return d, nil +} + +// ListDeliveries returns deliveries for a contact, optionally filtered +// by status, ordered by id DESC. Cursor-paginated on id. +func ListDeliveries(ctx context.Context, db *sql.DB, contactID int64, status Status, cursorID int64, limit int) ([]Delivery, error) { + args := []any{contactID} + q := ` + SELECT id, alert_contact_id, transition_id, event_id, event_type, severity, payload, + status, attempt, next_attempt_at, last_status_code, last_response, + last_attempt_at, delivered_at, created_at + FROM jetmon_alert_deliveries + WHERE alert_contact_id = ?` + if status != "" { + q += " AND status = ?" + args = append(args, string(status)) + } + if cursorID > 0 { + q += " AND id < ?" + args = append(args, cursorID) + } + q += " ORDER BY id DESC LIMIT ?" + args = append(args, limit) + + rows, err := db.QueryContext(ctx, q, args...) + if err != nil { + return nil, fmt.Errorf("alerting: list deliveries: %w", err) + } + defer rows.Close() + var out []Delivery + for rows.Next() { + d, err := scanDeliveryRow(rows) + if err != nil { + return nil, err + } + out = append(out, *d) + } + return out, rows.Err() +} + +// RetryDelivery resets an abandoned delivery to pending so the worker +// picks it up on the next tick. Mirrors webhooks.RetryDelivery — only +// abandoned deliveries can be retried. +func RetryDelivery(ctx context.Context, db *sql.DB, id int64) error { + res, err := db.ExecContext(ctx, ` + UPDATE jetmon_alert_deliveries + SET status = 'pending', + attempt = 0, + next_attempt_at = CURRENT_TIMESTAMP, + last_status_code = NULL, + last_response = NULL, + last_attempt_at = NULL + WHERE id = ? 
AND status = 'abandoned'`, id) + if err != nil { + return fmt.Errorf("alerting: retry delivery: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + d, getErr := GetDelivery(ctx, db, id) + if getErr != nil { + return getErr + } + return fmt.Errorf("alerting: delivery %d is %s, only abandoned deliveries can be retried", id, d.Status) + } + return nil +} + +func scanDeliveryRow(s rowScanner) (*Delivery, error) { + var ( + d Delivery + payload sql.NullString + nextAttemptAt sql.NullTime + lastStatusCode sql.NullInt64 + lastResponse sql.NullString + lastAttemptAt sql.NullTime + deliveredAt sql.NullTime + statusStr string + ) + if err := s.Scan( + &d.ID, &d.AlertContactID, &d.TransitionID, &d.EventID, &d.EventType, &d.Severity, + &payload, &statusStr, &d.Attempt, &nextAttemptAt, &lastStatusCode, &lastResponse, + &lastAttemptAt, &deliveredAt, &d.CreatedAt, + ); err != nil { + return nil, err + } + d.Status = Status(statusStr) + if payload.Valid { + d.Payload = json.RawMessage(payload.String) + } + if nextAttemptAt.Valid { + d.NextAttemptAt = &nextAttemptAt.Time + } + if lastStatusCode.Valid { + v := int(lastStatusCode.Int64) + d.LastStatusCode = &v + } + if lastResponse.Valid { + d.LastResponse = &lastResponse.String + } + if lastAttemptAt.Valid { + d.LastAttemptAt = &lastAttemptAt.Time + } + if deliveredAt.Valid { + d.DeliveredAt = &deliveredAt.Time + } + return &d, nil +} + +func truncate(s string, max int) string { + if len(s) <= max { + return s + } + return s[:max] +} diff --git a/internal/alerting/deliveries_test.go b/internal/alerting/deliveries_test.go new file mode 100644 index 00000000..ead23fcb --- /dev/null +++ b/internal/alerting/deliveries_test.go @@ -0,0 +1,116 @@ +package alerting + +import ( + "context" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" +) + +const selectClaimReadySQL = ` SELECT id, alert_contact_id, transition_id, event_id, event_type, severity, payload, status, attempt, next_attempt_at, last_status_code, last_response, last_attempt_at, delivered_at, created_at FROM jetmon_alert_deliveries WHERE status = 'pending' AND (next_attempt_at IS NULL OR next_attempt_at <= CURRENT_TIMESTAMP) ORDER BY next_attempt_at ASC LIMIT ? FOR UPDATE` + +const leaseClaimedSQL = ` UPDATE jetmon_alert_deliveries SET next_attempt_at = ? WHERE id = ? AND status = 'pending'` + +var columnsClaimedDelivery = []string{ + "id", "alert_contact_id", "transition_id", "event_id", "event_type", "severity", + "payload", "status", "attempt", "next_attempt_at", "last_status_code", "last_response", + "last_attempt_at", "delivered_at", "created_at", +} + +// TestClaimReadyClaimsRowsTransactionally verifies that ClaimReady uses +// row-level locks and then leases each claimed row so subsequent ticks do not +// re-claim a still-in-flight delivery. +func TestClaimReadyClaimsRowsTransactionally(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + rows := sqlmock.NewRows(columnsClaimedDelivery). + AddRow(int64(1), int64(11), int64(100), int64(900), "alert.opened", uint8(4), + []byte(`{}`), "pending", 0, now, nil, nil, nil, nil, now). + AddRow(int64(2), int64(11), int64(101), int64(901), "alert.opened", uint8(4), + []byte(`{}`), "pending", 0, now, nil, nil, nil, nil, now) + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50).WillReturnRows(rows) + mock.ExpectExec(leaseClaimedSQL). + WithArgs(sqlmock.AnyArg(), int64(1)). 
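+//
+// Roughly, the lease gives a claimed row this lifecycle (durations
+// follow claimLockDuration in deliveries.go):
+//
+//	t=0s    ClaimReady claims the row, next_attempt_at := now+60s
+//	t<60s   the dispatch goroutine finishes and writes MarkDelivered /
+//	        ScheduleRetry / MarkSuppressed with the real outcome
+//	t=60s   if the goroutine crashed instead, the lease expires and the
+//	        row becomes claimable again on a later tick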
+ WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec(leaseClaimedSQL). + WithArgs(sqlmock.AnyArg(), int64(2)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + + out, err := ClaimReady(context.Background(), db, 50) + if err != nil { + t.Fatalf("ClaimReady: %v", err) + } + if len(out) != 2 { + t.Errorf("got %d claimed, want 2", len(out)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expectations: %v", err) + } +} + +func TestClaimReadyRollsBackWhenLeaseUpdateMisses(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + rows := sqlmock.NewRows(columnsClaimedDelivery). + AddRow(int64(1), int64(11), int64(100), int64(900), "alert.opened", uint8(4), + []byte(`{}`), "pending", 0, now, nil, nil, nil, nil, now) + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50).WillReturnRows(rows) + mock.ExpectExec(leaseClaimedSQL). + WithArgs(sqlmock.AnyArg(), int64(1)). + WillReturnResult(sqlmock.NewResult(0, 0)) + mock.ExpectRollback() + + out, err := ClaimReady(context.Background(), db, 50) + if err == nil { + t.Fatal("ClaimReady succeeded after lease update missed") + } + if len(out) != 0 { + t.Fatalf("got %d claimed rows with failed lease update, want 0", len(out)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expectations: %v", err) + } +} + +// TestClaimReadyNoCandidatesCommitsWithoutLeaseUpdates verifies that when the +// SELECT returns nothing, ClaimReady issues no UPDATEs (no extra DB traffic on +// idle ticks). +func TestClaimReadyNoCandidatesCommitsWithoutLeaseUpdates(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50). + WillReturnRows(sqlmock.NewRows(columnsClaimedDelivery)) + mock.ExpectCommit() + + out, err := ClaimReady(context.Background(), db, 50) + if err != nil { + t.Fatalf("ClaimReady: %v", err) + } + if len(out) != 0 { + t.Errorf("got %d claimed, want 0", len(out)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expectations: %v", err) + } +} diff --git a/internal/alerting/email.go b/internal/alerting/email.go new file mode 100644 index 00000000..619947e9 --- /dev/null +++ b/internal/alerting/email.go @@ -0,0 +1,340 @@ +package alerting + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net/http" + "net/smtp" + "strings" + "sync" + "time" +) + +// EmailMessage is the rendered email handed to a Sender. It's +// transport-agnostic — the Sender translates it into whatever the +// underlying channel needs (HTTP POST body for WPCOM, MIME for SMTP, +// log line for stub). +type EmailMessage struct { + From string + To string + Subject string + PlainBody string + HTMLBody string +} + +// Sender abstracts the actual email-sending mechanism. Concrete impls +// in this file: WPCOMSender (production), SMTPSender (dev / staging), +// StubSender (unit tests). +// +// Send returns an error if the email could not be delivered. The +// returned error string is recorded in jetmon_alert_deliveries for +// debugging — keep it short and useful, not a stack trace. 
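+//
+// One possible wiring, sketched with placeholder values (the from
+// address, endpoint, and token are hypothetical):
+//
+//	var s Sender = &StubSender{}                             // unit tests
+//	// s = &SMTPSender{Host: "localhost", Port: 1025}        // dev / staging
+//	// s = &WPCOMSender{Endpoint: endpoint, AuthToken: token} // production
+//	d := NewEmailDispatcher(s, "alerts@example.com")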
+type Sender interface { + Send(ctx context.Context, msg EmailMessage) error +} + +// emailDispatcher implements alerting.Dispatcher by translating a +// Notification into an EmailMessage and delegating to a Sender. The +// rendering lives here (not in the Sender) so swapping transports +// doesn't require re-implementing the subject/body logic. +type emailDispatcher struct { + sender Sender + from string +} + +// NewEmailDispatcher returns a Dispatcher that renders Notifications +// into emails and delivers them via the given Sender. The from address +// becomes the EmailMessage.From for every dispatched message. +func NewEmailDispatcher(sender Sender, from string) Dispatcher { + return &emailDispatcher{sender: sender, from: from} +} + +// emailDestination is the contact's destination JSON shape for email. +type emailDestination struct { + Address string `json:"address"` +} + +// Send renders the Notification into an EmailMessage and hands it to +// the configured Sender. Returns SMTP-style status codes for symmetry +// with the HTTP-based transports: 250 on success, 5xx on failure. +func (d *emailDispatcher) Send(ctx context.Context, destination json.RawMessage, n Notification) (int, string, error) { + var dest emailDestination + if err := json.Unmarshal(destination, &dest); err != nil { + return 550, "invalid destination JSON", fmt.Errorf("parse email destination: %w", err) + } + if dest.Address == "" { + return 550, "destination missing address", errors.New("alerting/email: destination missing address") + } + + msg := EmailMessage{ + From: d.from, + To: dest.Address, + Subject: renderEmailSubject(n), + PlainBody: renderEmailPlain(n), + HTMLBody: renderEmailHTML(n), + } + + if err := d.sender.Send(ctx, msg); err != nil { + // Cap the error message at last_response's column width. + summary := err.Error() + if len(summary) > 2048 { + summary = summary[:2048] + } + return 554, summary, err + } + return 250, "delivered", nil +} + +// renderEmailSubject is short enough to fit in mobile notification +// previews. Severity name and site URL are the most-relevant info at +// a glance; recovery and test prefixes are explicit. Strips CRLF +// from the URL to prevent MIME header injection — the URL is +// operator-controlled (jetpack_monitor_sites.monitor_url) but the +// column doesn't enforce CRLF-free, so defense-in-depth lives here. +func renderEmailSubject(n Notification) string { + url := stripCRLF(n.SiteURL) + switch { + case n.IsTest: + return fmt.Sprintf("[Jetmon test] %s", url) + case n.Recovery: + return fmt.Sprintf("[Recovered] %s", url) + default: + return fmt.Sprintf("[%s] %s", stripCRLF(n.SeverityName), url) + } +} + +// stripCRLF removes carriage return and newline characters. Used on +// any field that becomes part of a MIME header (Subject, From, To) +// to prevent header injection via untrusted strings. +func stripCRLF(s string) string { + r := strings.NewReplacer("\r", "", "\n", "") + return r.Replace(s) +} + +// renderEmailPlain is the plain-text body. Same fields as the HTML +// version; consumers receiving multipart see whichever their client +// prefers. The plain body is also the fallback for email clients +// that strip HTML. 
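+//
+// For the fixture in email_test.go, the rendered body comes out
+// roughly as:
+//
+//	Site: https://example.com (id 42)
+//	Severity: Down
+//	State: Down
+//	Event: #777 (event.opened)
+//	Reason: verifier_confirmed
+//	Time: 2026-04-25T12:00:00Z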
+func renderEmailPlain(n Notification) string {
+	var b strings.Builder
+	if n.IsTest {
+		b.WriteString("*** Jetmon test notification ***\n\n")
+	}
+	if n.Recovery {
+		b.WriteString("Recovery: site is back to Up.\n\n")
+	}
+	fmt.Fprintf(&b, "Site: %s (id %d)\n", n.SiteURL, n.SiteID)
+	fmt.Fprintf(&b, "Severity: %s\n", n.SeverityName)
+	if n.State != "" {
+		fmt.Fprintf(&b, "State: %s\n", n.State)
+	}
+	fmt.Fprintf(&b, "Event: #%d (%s)\n", n.EventID, n.EventType)
+	if n.Reason != "" {
+		fmt.Fprintf(&b, "Reason: %s\n", n.Reason)
+	}
+	fmt.Fprintf(&b, "Time: %s\n", n.Timestamp.UTC().Format(time.RFC3339))
+	return b.String()
+}
+
+// renderEmailHTML mirrors the plain body in a minimal HTML wrapper.
+// No external CSS or images — keeps the payload small and renders
+// the same in every client.
+func renderEmailHTML(n Notification) string {
+	var b strings.Builder
+	b.WriteString("<html><body>")
+	if n.IsTest {
+		b.WriteString("<p><strong>*** Jetmon test notification ***</strong></p>")
+	}
+	if n.Recovery {
+		b.WriteString("<p><strong>Recovery: site is back to Up.</strong></p>")
+	}
+	b.WriteString("<table>")
+	fmt.Fprintf(&b, "<tr><td>Site</td><td>%s (id %d)</td></tr>", htmlEscape(n.SiteURL), n.SiteID)
+	fmt.Fprintf(&b, "<tr><td>Severity</td><td>%s</td></tr>", htmlEscape(n.SeverityName))
+	if n.State != "" {
+		fmt.Fprintf(&b, "<tr><td>State</td><td>%s</td></tr>", htmlEscape(n.State))
+	}
+	fmt.Fprintf(&b, "<tr><td>Event</td><td>#%d (%s)</td></tr>", n.EventID, htmlEscape(n.EventType))
+	if n.Reason != "" {
+		fmt.Fprintf(&b, "<tr><td>Reason</td><td>%s</td></tr>", htmlEscape(n.Reason))
+	}
+	fmt.Fprintf(&b, "<tr><td>Time</td><td>%s</td></tr>", n.Timestamp.UTC().Format(time.RFC3339))
+	b.WriteString("</table></body></html>")
+	return b.String()
+}
+
+func htmlEscape(s string) string {
+	r := strings.NewReplacer(
+		"&", "&amp;",
+		"<", "&lt;",
+		">", "&gt;",
+		"\"", "&quot;",
+		"'", "&#39;",
+	)
+	return r.Replace(s)
+}
+
+// StubSender records every message in memory and (by default) also
+// logs a one-line summary to stdout. Used by unit tests and by
+// EMAIL_TRANSPORT="stub" in environments where a real send is not
+// configured. Never returns an error.
+type StubSender struct {
+	Logger func(EmailMessage) // optional; defaults to log.Printf
+
+	mu   sync.Mutex
+	sent []EmailMessage
+}
+
+// Send records the message and (optionally) logs a summary.
+func (s *StubSender) Send(_ context.Context, m EmailMessage) error {
+	s.mu.Lock()
+	s.sent = append(s.sent, m)
+	s.mu.Unlock()
+	if s.Logger != nil {
+		s.Logger(m)
+	} else {
+		log.Printf("alerting/email: stub send From=%s To=%s Subject=%q", m.From, m.To, m.Subject)
+	}
+	return nil
+}
+
+// Sent returns a snapshot of every message recorded so far. Used by
+// tests to assert against rendered output.
+func (s *StubSender) Sent() []EmailMessage {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]EmailMessage, len(s.sent))
+	copy(out, s.sent)
+	return out
+}
+
+// Reset clears the sent buffer. Useful between test cases.
+func (s *StubSender) Reset() {
+	s.mu.Lock()
+	s.sent = nil
+	s.mu.Unlock()
+}
+
+// SMTPSender connects to an SMTP server and sends multipart emails.
+// Uses Go's stdlib net/smtp; doesn't take a per-call context (smtp
+// package predates context). The worker bounds runtime via its own
+// timeouts; an SMTP send that hangs blocks the worker goroutine until
+// the underlying socket times out (typically 5–10 minutes on Linux).
+//
+// For dev/staging only — production uses WPCOMSender. STARTTLS is
+// optional; AUTH PLAIN is used only when Username is non-empty and
+// UseTLS is set.
+type SMTPSender struct {
+	Host     string
+	Port     int
+	Username string // optional; if empty, no AUTH is performed
+	Password string
+	UseTLS   bool // controls whether AUTH PLAIN is sent (auth on plaintext SMTP is rejected by net/smtp without UseTLS)
+}
+
+// Send delivers the message via SMTP. The MIME body is
+// multipart/alternative with both plain and HTML parts.
+func (s *SMTPSender) Send(_ context.Context, m EmailMessage) error {
+	addr := fmt.Sprintf("%s:%d", s.Host, s.Port)
+	body := buildMIMEMessage(m)
+	var auth smtp.Auth
+	if s.Username != "" && s.UseTLS {
+		auth = smtp.PlainAuth("", s.Username, s.Password, s.Host)
+	}
+	if err := smtp.SendMail(addr, auth, m.From, []string{m.To}, []byte(body)); err != nil {
+		return fmt.Errorf("alerting/email/smtp: send to %s: %w", addr, err)
+	}
+	return nil
+}
+
+// buildMIMEMessage produces a multipart/alternative MIME body with
+// both plain-text and HTML parts. Boundary is fixed; the message is
+// short and self-contained, so collisions are not a concern.
+//
+// CRLF is stripped from From/To/Subject to prevent header injection.
+// The body parts are content, not headers — CRLF inside them is
+// expected and handled by the MIME boundary structure.
+func buildMIMEMessage(m EmailMessage) string { + const boundary = "JetmonAlertBoundary_4d8f31a2" + var b strings.Builder + fmt.Fprintf(&b, "From: %s\r\n", stripCRLF(m.From)) + fmt.Fprintf(&b, "To: %s\r\n", stripCRLF(m.To)) + fmt.Fprintf(&b, "Subject: %s\r\n", stripCRLF(m.Subject)) + b.WriteString("MIME-Version: 1.0\r\n") + fmt.Fprintf(&b, "Content-Type: multipart/alternative; boundary=%q\r\n\r\n", boundary) + + fmt.Fprintf(&b, "--%s\r\n", boundary) + b.WriteString("Content-Type: text/plain; charset=\"UTF-8\"\r\n\r\n") + b.WriteString(m.PlainBody) + b.WriteString("\r\n") + + fmt.Fprintf(&b, "--%s\r\n", boundary) + b.WriteString("Content-Type: text/html; charset=\"UTF-8\"\r\n\r\n") + b.WriteString(m.HTMLBody) + b.WriteString("\r\n") + + fmt.Fprintf(&b, "--%s--\r\n", boundary) + return b.String() +} + +// WPCOMSender posts to a WPCOM-owned email API endpoint with a Bearer +// token. Same shape as the existing internal/wpcom client — Bearer +// auth, JSON body, 4xx/5xx → error. Body shape is intentionally +// generic; the production endpoint can adapt or we wrap the body in +// whatever shape they require. +type WPCOMSender struct { + Endpoint string + AuthToken string + HTTPClient *http.Client // if nil, a default with a 10s timeout is used +} + +// wpcomEmailRequest is the JSON body posted to the WPCOM email API. +type wpcomEmailRequest struct { + From string `json:"from"` + To string `json:"to"` + Subject string `json:"subject"` + PlainBody string `json:"plain"` + HTMLBody string `json:"html"` +} + +// Send POSTs the message to the configured endpoint. +func (s *WPCOMSender) Send(ctx context.Context, m EmailMessage) error { + if s.Endpoint == "" { + return errors.New("alerting/email/wpcom: endpoint not configured") + } + body, err := json.Marshal(wpcomEmailRequest{ + From: m.From, To: m.To, Subject: m.Subject, + PlainBody: m.PlainBody, HTMLBody: m.HTMLBody, + }) + if err != nil { + return fmt.Errorf("alerting/email/wpcom: marshal: %w", err) + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, s.Endpoint, bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("alerting/email/wpcom: build request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + if s.AuthToken != "" { + req.Header.Set("Authorization", "Bearer "+s.AuthToken) + } + + client := s.HTTPClient + if client == nil { + client = &http.Client{Timeout: 10 * time.Second} + } + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("alerting/email/wpcom: post: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode >= 400 { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) + return fmt.Errorf("alerting/email/wpcom: status %d: %s", resp.StatusCode, respBody) + } + return nil +} diff --git a/internal/alerting/email_test.go b/internal/alerting/email_test.go new file mode 100644 index 00000000..2a452a43 --- /dev/null +++ b/internal/alerting/email_test.go @@ -0,0 +1,324 @@ +package alerting + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/Automattic/jetmon/internal/eventstore" +) + +func makeTestNotification() Notification { + return Notification{ + SiteID: 42, + SiteURL: "https://example.com", + EventID: 777, + EventType: "event.opened", + Severity: eventstore.SeverityDown, + SeverityName: "Down", + State: "Down", + Reason: "verifier_confirmed", + Timestamp: time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC), + } +} + +func TestRenderEmailSubjectVariants(t *testing.T) { + cases := []struct { + 
+		mutate func(*Notification)
+		want   string
+	}{
+		{func(n *Notification) {}, "[Down] https://example.com"},
+		{func(n *Notification) { n.Recovery = true }, "[Recovered] https://example.com"},
+		{func(n *Notification) { n.IsTest = true }, "[Jetmon test] https://example.com"},
+	}
+	for i, tc := range cases {
+		n := makeTestNotification()
+		tc.mutate(&n)
+		got := renderEmailSubject(n)
+		if got != tc.want {
+			t.Errorf("case %d: got %q, want %q", i, got, tc.want)
+		}
+	}
+}
+
+func TestRenderEmailPlainContainsKeyFields(t *testing.T) {
+	n := makeTestNotification()
+	body := renderEmailPlain(n)
+	for _, want := range []string{
+		"https://example.com",
+		"id 42",
+		"Down",
+		"#777",
+		"event.opened",
+		"verifier_confirmed",
+		"2026-04-25T12:00:00Z",
+	} {
+		if !strings.Contains(body, want) {
+			t.Errorf("plain body missing %q\nbody:\n%s", want, body)
+		}
+	}
+}
+
+func TestRenderEmailHTMLEscapesUntrustedFields(t *testing.T) {
+	n := makeTestNotification()
+	n.SiteURL = `https://example.com/<script>alert(1)</script>`
+	n.Reason = `a & b`
+	body := renderEmailHTML(n)
+	// The raw script tag must not appear.
+	if strings.Contains(body, "<script>") {
+		t.Errorf("HTML body contains unescaped script tag:\n%s", body)
+	}
+	if !strings.Contains(body, "&lt;script&gt;") {
+		t.Errorf("HTML body missing escaped script tag:\n%s", body)
+	}
+	if !strings.Contains(body, "a &amp; b") {
+		t.Errorf("HTML body missing escaped ampersand:\n%s", body)
+	}
+}
diff --git a/internal/dashboard/dashboard_test.go b/internal/dashboard/dashboard_test.go
index c36f155c..a850ef99 100644
--- a/internal/dashboard/dashboard_test.go
+++ b/internal/dashboard/dashboard_test.go
@@ -12,7 +12,16 @@ import (
 
 func TestHandleState(t *testing.T) {
 	srv := New("test-host")
-	srv.Update(State{WorkerCount: 5, QueueDepth: 3})
+	srv.Update(State{
+		WorkerCount:                   5,
+		QueueDepth:                    3,
+		BucketOwnership:               "pinned range=0-99",
+		LegacyStatusProjectionEnabled: true,
+		DeliveryWorkersEnabled:        true,
+		DeliveryOwnerHost:             "api-1",
+		RolloutPreflightCommand:       "./jetmon2 rollout pinned-check",
+		ProjectionDriftCommand:        "./jetmon2 rollout projection-drift",
+	})
 
 	r := httptest.NewRequest(http.MethodGet, "/api/state", nil)
 	w := httptest.NewRecorder()
@@ -31,6 +40,24 @@ func TestHandleState(t *testing.T) {
 	if st.Hostname != "test-host" {
 		t.Fatalf("Hostname = %q, want test-host", st.Hostname)
 	}
+	if st.BucketOwnership != "pinned range=0-99" {
+		t.Fatalf("BucketOwnership = %q, want pinned range=0-99", st.BucketOwnership)
+	}
+	if !st.LegacyStatusProjectionEnabled {
+		t.Fatal("LegacyStatusProjectionEnabled = false, want true")
+	}
+	if !st.DeliveryWorkersEnabled {
+		t.Fatal("DeliveryWorkersEnabled = false, want true")
+	}
+	if st.DeliveryOwnerHost != "api-1" {
+		t.Fatalf("DeliveryOwnerHost = %q, want api-1", st.DeliveryOwnerHost)
+	}
+	if st.RolloutPreflightCommand != "./jetmon2 rollout pinned-check" {
+		t.Fatalf("RolloutPreflightCommand = %q", st.RolloutPreflightCommand)
+	}
+	if st.ProjectionDriftCommand != "./jetmon2 rollout projection-drift" {
+		t.Fatalf("ProjectionDriftCommand = %q", st.ProjectionDriftCommand)
+	}
 }
 
 func TestHandleHealth(t *testing.T) {
@@ -74,6 +101,15 @@ func TestHandleIndex(t *testing.T) {
 	if !strings.Contains(w.Body.String(), "Jetmon") {
 		t.Fatal("body does not contain expected HTML content")
 	}
+	if !strings.Contains(w.Body.String(), "id=\"preflight\"") {
+		t.Fatal("body does not contain rollout preflight card")
+	}
+	if !strings.Contains(w.Body.String(), "id=\"delivery-owner\"") {
+		t.Fatal("body does not contain delivery owner card")
+	}
+	if !strings.Contains(w.Body.String(), "id=\"health\"") {
+		t.Fatal("body does not contain dependency health grid")
+	}
 }
 
 func TestUpdateSetsHostnameAndTimestamp(t *testing.T) {
diff --git a/internal/db/migrations.go b/internal/db/migrations.go
index 6598484b..52f0310b 100644
--- a/internal/db/migrations.go
+++ b/internal/db/migrations.go
@@ -90,6 +90,283 @@ var migrations = []migration{
 			ADD COLUMN last_checked_at DATETIME NULL,
 			ADD COLUMN last_alert_sent_at DATETIME NULL,
 			ADD INDEX idx_bucket_monitor_last_checked (bucket_no, monitor_active, last_checked_at)`},
+
+	// Migration 9 retires jetmon_audit_log's site-state columns. Per-probe data lives in
+	// jetmon_check_history; status transitions move to jetmon_event_transitions (migration 11).
+	// What remains is purely operational: WPCOM, retries, verifier RPC, suppression, config.
+	{9, `ALTER TABLE jetmon_audit_log
+		DROP COLUMN http_code,
+		DROP COLUMN error_code,
+		DROP COLUMN rtt_ms,
+		DROP COLUMN old_status,
+		DROP COLUMN new_status,
+		MODIFY COLUMN blog_id BIGINT UNSIGNED NULL,
+		MODIFY COLUMN detail VARCHAR(1024) NULL,
+		ADD COLUMN event_id BIGINT UNSIGNED NULL AFTER blog_id,
+		ADD COLUMN metadata JSON NULL AFTER detail,
+		ADD INDEX idx_event_id (event_id),
+		ADD INDEX idx_event_type_created (event_type, created_at)`},
+
+	// Migration 10 creates the events table — current authoritative state of every incident.
+	// dedup_key is a generated column: the full identity tuple while the event is open
+	// (ended_at IS NULL), NULL once it has ended.
+	// The UNIQUE KEY enforces "one open event per tuple" without requiring partial indexes (which MySQL lacks).
+	{10, `CREATE TABLE IF NOT EXISTS jetmon_events (
+		id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+		blog_id BIGINT UNSIGNED NOT NULL,
+		endpoint_id BIGINT UNSIGNED NULL,
+		check_type VARCHAR(64) NOT NULL,
+		discriminator VARCHAR(128) NULL,
+		severity TINYINT UNSIGNED NOT NULL,
+		state VARCHAR(32) NOT NULL,
+		started_at TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3),
+		ended_at TIMESTAMP(3) NULL,
+		resolution_reason VARCHAR(64) NULL,
+		cause_event_id BIGINT UNSIGNED NULL,
+		metadata JSON NULL,
+		updated_at TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3) ON UPDATE CURRENT_TIMESTAMP(3),
+		dedup_key VARCHAR(255) GENERATED ALWAYS AS (
+			IF(ended_at IS NULL,
+				CONCAT_WS(':', blog_id, COALESCE(endpoint_id, 0), check_type, COALESCE(discriminator, '')),
+				NULL)
+		) STORED,
+		UNIQUE KEY uk_open_dedup (dedup_key),
+		INDEX idx_blog_id_started (blog_id, started_at),
+		INDEX idx_blog_id_active (blog_id, ended_at),
+		INDEX idx_check_type_started (check_type, started_at),
+		INDEX idx_cause_event_id (cause_event_id)
+	) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`},
+
+	// Migration 11 creates the append-only history of every mutation to jetmon_events.
+	// One row per change; never updated, never deleted. Together with jetmon_events,
+	// this is the full event-sourced record. blog_id is denormalized to keep SLA queries
+	// off the events table.
+	{11, `CREATE TABLE IF NOT EXISTS jetmon_event_transitions (
+		id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+		event_id BIGINT UNSIGNED NOT NULL,
+		blog_id BIGINT UNSIGNED NOT NULL,
+		severity_before TINYINT UNSIGNED NULL,
+		severity_after TINYINT UNSIGNED NULL,
+		state_before VARCHAR(32) NULL,
+		state_after VARCHAR(32) NULL,
+		reason VARCHAR(64) NOT NULL,
+		source VARCHAR(255) NOT NULL DEFAULT 'local',
+		metadata JSON NULL,
+		changed_at TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3),
+		INDEX idx_event_id_changed (event_id, changed_at),
+		INDEX idx_blog_id_changed (blog_id, changed_at),
+		INDEX idx_changed_at (changed_at)
+	) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`},
+
+	// Migration 12 creates the API key registry. Keys are sha256-hashed at rest;
+	// the raw token is shown only once at creation time via the CLI. Per-key rate
+	// limit, scope, expiry, and revocation are all stored here.
consumer_name is
+ // the audit-log key — every authenticated API request logs against it so we
+ // can track and revoke specific internal systems. See API.md "Authentication".
+ {12, `CREATE TABLE IF NOT EXISTS jetmon_api_keys (
+ id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ key_hash CHAR(64) NOT NULL,
+ consumer_name VARCHAR(128) NOT NULL,
+ scope ENUM('read','write','admin') NOT NULL DEFAULT 'read',
+ rate_limit_per_minute INT NOT NULL DEFAULT 60,
+ expires_at TIMESTAMP NULL,
+ revoked_at TIMESTAMP NULL,
+ last_used_at TIMESTAMP NULL,
+ created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ created_by VARCHAR(128) NOT NULL DEFAULT 'cli',
+ UNIQUE KEY uk_key_hash (key_hash),
+ INDEX idx_consumer (consumer_name)
+ ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`},
+
+ // Migration 13 creates the webhook registry. The raw signing secret is
+ // generated and shown once at creation (mirroring jetmon_api_keys' one-time
+ // token reveal); see the note on `secret` below for why it is stored in
+ // plaintext rather than hashed.
+ // events / site_filter / state_filter are JSON to allow flexible filter
+ // shapes without per-filter columns; semantics: empty = match all, AND
+ // across dimensions, whitelist within each. See API.md "Family 4".
+ // secret stores the raw HMAC signing key in plaintext. Unlike
+ // jetmon_api_keys (sha256-hashed at rest, used for inbound auth where
+ // hash is sufficient), webhook secrets are used to SIGN outbound
+ // deliveries — HMAC needs the actual key material in memory, not its
+ // hash. We never verify inbound signatures with this secret, so
+ // hash-at-rest would buy us no verification benefit while making
+ // signing impossible.
+ //
+ // Threat model: anyone with read access to jetmon_webhooks can mint
+ // valid deliveries. For the internal API behind a gateway, that's
+ // equivalent to the existing access-to-events threat. Encryption at
+ // rest with a master key (KMS-style) is in ROADMAP.md as a future
+ // hardening step.
+ {13, `CREATE TABLE IF NOT EXISTS jetmon_webhooks (
+ id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ url VARCHAR(2083) NOT NULL,
+ active TINYINT UNSIGNED NOT NULL DEFAULT 1,
+ events JSON NULL,
+ site_filter JSON NULL,
+ state_filter JSON NULL,
+ secret VARCHAR(80) NOT NULL,
+ secret_preview VARCHAR(8) NOT NULL DEFAULT '',
+ created_by VARCHAR(128) NOT NULL DEFAULT '',
+ created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+ INDEX idx_active (active)
+ ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`},
+
+ // Migration 14 creates the per-fire delivery records. One row per
+ // (webhook, transition) match — transition_id is the fan-in point: a
+ // single jetmon_event_transitions row can produce many deliveries (one
+ // per matching webhook), but a webhook gets at most one delivery per
+ // transition (enforced by uk_webhook_transition).
+ //
+ // payload is frozen at row creation: consumer sees the event as it was
+ // when the webhook fired, not as it is now (closed-and-amended events
+ // don't retroactively change delivery contents — that's the contract).
+ //
+ // status lifecycle: pending → (delivered | abandoned). "failed" is reserved
+ // for permanent client/server errors that we wouldn't retry (currently
+ // unused; pending captures the in-retry case).
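// For illustration only (not part of this migration): uk_webhook_transition is
// what lets the dispatcher fan a transition out idempotently. A re-run of the
// matcher can rely on it with an insert along the lines of
//
//   INSERT IGNORE INTO jetmon_webhook_deliveries
//     (webhook_id, transition_id, event_id, event_type, payload)
//   VALUES (?, ?, ?, ?, ?);
//
// where a duplicate (webhook_id, transition_id) pair is silently skipped.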
+ {14, `CREATE TABLE IF NOT EXISTS jetmon_webhook_deliveries ( + id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + webhook_id BIGINT UNSIGNED NOT NULL, + transition_id BIGINT UNSIGNED NOT NULL, + event_id BIGINT UNSIGNED NOT NULL, + event_type VARCHAR(64) NOT NULL, + payload JSON NOT NULL, + status ENUM('pending','delivered','failed','abandoned') NOT NULL DEFAULT 'pending', + attempt INT UNSIGNED NOT NULL DEFAULT 0, + next_attempt_at TIMESTAMP NULL, + last_status_code INT NULL, + last_response VARCHAR(2048) NULL, + last_attempt_at TIMESTAMP NULL, + delivered_at TIMESTAMP NULL, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE KEY uk_webhook_transition (webhook_id, transition_id), + INDEX idx_status_next_attempt (status, next_attempt_at), + INDEX idx_webhook_id_created (webhook_id, created_at), + INDEX idx_event_id (event_id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`}, + + // Migration 15 records the webhook dispatcher's progress. One row per + // jetmon2 instance keeps last_transition_id high-water mark so the + // dispatcher polls only new transitions. The UNIQUE KEY on instance_id + // makes upsert (INSERT … ON DUPLICATE KEY UPDATE) trivial. + {15, `CREATE TABLE IF NOT EXISTS jetmon_webhook_dispatch_progress ( + instance_id VARCHAR(255) NOT NULL PRIMARY KEY, + last_transition_id BIGINT UNSIGNED NOT NULL DEFAULT 0, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`}, + + // Migration 16 creates the alert contacts registry. Same shape as the + // webhook registry but with a simpler filter model (site_filter + + // min_severity, no event-type / state filter — see API.md Family 5). + // + // destination is JSON because each transport has a different shape: + // email → {"address":"ops@example.com"} + // pagerduty → {"integration_key":""} + // slack → {"webhook_url":"https://hooks.slack.com/..."} + // teams → {"webhook_url":"https://outlook.office.com/webhook/..."} + // destination stores the credential in plaintext for the same reason + // jetmon_webhooks.secret does (see migration 13): outbound dispatch + // needs the raw value at every send. A hash is useless because we'd + // have to recover the original to call the transport. Threat model and + // future encryption-at-rest plan are identical. + // + // min_severity is a TINYINT matching internal/eventstore.Severity* + // (0=Up, 1=Warning, 2=Degraded, 3=SeemsDown, 4=Down). Default 4 (Down) + // avoids accidental noise from new contacts. The API serializes by + // string name; the column stores the underlying uint8. + // + // max_per_hour caps notification rate per contact (default 60, 0 = + // unlimited). Per-contact because different destinations have + // different tolerance — a Slack channel can take far more than a + // PagerDuty oncall can. 
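// For illustration only (all values made up): a Slack contact that should only
// fire for full outages, capped at 30 notifications per hour, would be stored
// roughly as
//
//   INSERT INTO jetmon_alert_contacts
//     (label, transport, destination, min_severity, max_per_hour)
//   VALUES ('ops-slack', 'slack',
//           '{"webhook_url":"https://hooks.slack.com/services/T000/B000/XXX"}',
//           4, 30);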
+ {16, `CREATE TABLE IF NOT EXISTS jetmon_alert_contacts ( + id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + label VARCHAR(128) NOT NULL, + active TINYINT UNSIGNED NOT NULL DEFAULT 1, + transport ENUM('email','pagerduty','slack','teams') NOT NULL, + destination JSON NOT NULL, + destination_preview VARCHAR(8) NOT NULL DEFAULT '', + site_filter JSON NULL, + min_severity TINYINT UNSIGNED NOT NULL DEFAULT 4, + max_per_hour INT UNSIGNED NOT NULL DEFAULT 60, + created_by VARCHAR(128) NOT NULL DEFAULT '', + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_active (active) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`}, + + // Migration 17 creates the per-fire alert delivery records. One row per + // (alert_contact, transition) match — same fan-in shape as + // jetmon_webhook_deliveries: one transition produces many deliveries + // (one per matching contact), one contact gets at most one delivery + // per transition (enforced by uk_alert_transition). + // + // payload is frozen at row creation: contact sees the event as it was + // when the alert fired, not as it is now. + // + // status lifecycle and 'failed' semantics are identical to + // jetmon_webhook_deliveries. + {17, `CREATE TABLE IF NOT EXISTS jetmon_alert_deliveries ( + id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + alert_contact_id BIGINT UNSIGNED NOT NULL, + transition_id BIGINT UNSIGNED NOT NULL, + event_id BIGINT UNSIGNED NOT NULL, + event_type VARCHAR(64) NOT NULL, + severity TINYINT UNSIGNED NOT NULL, + payload JSON NOT NULL, + status ENUM('pending','delivered','failed','abandoned') NOT NULL DEFAULT 'pending', + attempt INT UNSIGNED NOT NULL DEFAULT 0, + next_attempt_at TIMESTAMP NULL, + last_status_code INT NULL, + last_response VARCHAR(2048) NULL, + last_attempt_at TIMESTAMP NULL, + delivered_at TIMESTAMP NULL, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE KEY uk_alert_transition (alert_contact_id, transition_id), + INDEX idx_status_next_attempt (status, next_attempt_at), + INDEX idx_contact_id_created (alert_contact_id, created_at), + INDEX idx_event_id (event_id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`}, + + // Migration 18 records the alert dispatcher's progress. Mirrors + // jetmon_webhook_dispatch_progress — one row per jetmon2 instance with + // the high-water mark for jetmon_event_transitions.id. + {18, `CREATE TABLE IF NOT EXISTS jetmon_alert_dispatch_progress ( + instance_id VARCHAR(255) NOT NULL PRIMARY KEY, + last_transition_id BIGINT UNSIGNED NOT NULL DEFAULT 0, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`}, + + // Migration 19 adds a nullable tenant owner to webhooks. Internal v2 + // callers leave it NULL, preserving the shared internal registry from + // ADR-0002. Gateway-routed API paths set owner_tenant_id and use + // tenant-scoped repository helpers so customer-owned webhooks are filtered + // in Jetmon as defense in depth. + {19, `ALTER TABLE jetmon_webhooks + ADD COLUMN owner_tenant_id VARCHAR(128) NULL AFTER active, + ADD INDEX idx_owner_tenant_id (owner_tenant_id)`}, + + // Migration 20 mirrors webhook ownership on alert contacts. Deliveries + // derive visibility through their parent contact; this column owns the + // customer-managed registration itself. 
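// For illustration only (tenant id made up): a tenant-scoped read is just the
// shared-registry query plus the owner filter, e.g.
//
//   SELECT id, label, transport, min_severity
//   FROM jetmon_alert_contacts
//   WHERE active = 1 AND owner_tenant_id = 't_12345';
//
// Internal v2 registrations keep owner_tenant_id NULL, matching the webhook
// behaviour described for migration 19.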
+ {20, `ALTER TABLE jetmon_alert_contacts + ADD COLUMN owner_tenant_id VARCHAR(128) NULL AFTER active, + ADD INDEX idx_owner_tenant_id (owner_tenant_id)`}, + + // Migration 21 adds a many-to-many tenant mapping for sites. Sites are + // still stored in the legacy jetpack_monitor_sites table; this mapping is + // the public/gateway ownership projection Jetmon can enforce without + // changing the drop-in v1-compatible site row. A site can appear under + // multiple tenants if the gateway's product model allows shared ownership + // or delegation. + {21, `CREATE TABLE IF NOT EXISTS jetmon_site_tenants ( + tenant_id VARCHAR(128) NOT NULL, + blog_id BIGINT UNSIGNED NOT NULL, + source VARCHAR(64) NOT NULL DEFAULT 'gateway', + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (tenant_id, blog_id), + INDEX idx_blog_id (blog_id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`}, } // Migrate applies all pending migrations idempotently. diff --git a/internal/db/queries.go b/internal/db/queries.go index 11e8c51f..608c280d 100644 --- a/internal/db/queries.go +++ b/internal/db/queries.go @@ -3,6 +3,7 @@ package db import ( "context" "database/sql" + "errors" "fmt" "sort" "time" @@ -61,6 +62,23 @@ func GetSitesForBucket(ctx context.Context, bucketMin, bucketMax, batchSize int, return sites, rows.Err() } +// CountActiveSitesForBucketRange returns the number of active monitor rows in +// the inclusive bucket range. +func CountActiveSitesForBucketRange(ctx context.Context, bucketMin, bucketMax int) (int, error) { + var count int + err := db.QueryRowContext(ctx, ` + SELECT COUNT(*) + FROM jetpack_monitor_sites + WHERE monitor_active = 1 + AND bucket_no BETWEEN ? AND ?`, + bucketMin, bucketMax, + ).Scan(&count) + if err != nil { + return 0, fmt.Errorf("count active sites: %w", err) + } + return count, nil +} + // UpdateSiteStatus updates site_status and last_status_change for a site. func UpdateSiteStatus(ctx context.Context, blogID int64, status int, changedAt time.Time) error { _, err := db.ExecContext(ctx, @@ -70,6 +88,120 @@ func UpdateSiteStatus(ctx context.Context, blogID int64, status int, changedAt t return err } +// UpdateSiteStatusTx is the transaction-aware variant of UpdateSiteStatus, used +// when the projection write must commit atomically with an event mutation. +func UpdateSiteStatusTx(ctx context.Context, tx *sql.Tx, blogID int64, status int, changedAt time.Time) error { + _, err := tx.ExecContext(ctx, + `UPDATE jetpack_monitor_sites SET site_status = ?, last_status_change = ? WHERE blog_id = ?`, + status, changedAt.UTC(), blogID, + ) + return err +} + +// CountLegacyProjectionDrift returns the number of active sites in the bucket +// range whose v1 site_status projection disagrees with the authoritative open +// HTTP event, if any. +func CountLegacyProjectionDrift(ctx context.Context, bucketMin, bucketMax int) (int, error) { + var count int + err := db.QueryRowContext(ctx, ` + SELECT COUNT(*) + FROM jetpack_monitor_sites s + LEFT JOIN jetmon_events e + ON e.blog_id = s.blog_id + AND e.check_type = 'http' + AND e.ended_at IS NULL + WHERE s.monitor_active = 1 + AND s.bucket_no BETWEEN ? AND ? 
+ AND s.site_status <> CASE + WHEN e.state = 'Down' THEN 2 + WHEN e.state = 'Seems Down' THEN 0 + ELSE 1 + END`, + bucketMin, bucketMax, + ).Scan(&count) + if err != nil { + return 0, fmt.Errorf("count projection drift: %w", err) + } + return count, nil +} + +// ProjectionDriftRow identifies one active site whose legacy site_status +// projection disagrees with the authoritative open HTTP event, if any. +type ProjectionDriftRow struct { + BlogID int64 + BucketNo int + SiteStatus int + ExpectedStatus int + EventID *int64 + EventState *string +} + +// ListLegacyProjectionDrift returns active sites in the bucket range whose v1 +// site_status projection disagrees with the authoritative open HTTP event. +func ListLegacyProjectionDrift(ctx context.Context, bucketMin, bucketMax, limit int) ([]ProjectionDriftRow, error) { + if limit <= 0 { + limit = 50 + } + rows, err := db.QueryContext(ctx, ` + SELECT s.blog_id, + s.bucket_no, + s.site_status, + CASE + WHEN e.state = 'Down' THEN 2 + WHEN e.state = 'Seems Down' THEN 0 + ELSE 1 + END AS expected_status, + e.id, + e.state + FROM jetpack_monitor_sites s + LEFT JOIN jetmon_events e + ON e.blog_id = s.blog_id + AND e.check_type = 'http' + AND e.ended_at IS NULL + WHERE s.monitor_active = 1 + AND s.bucket_no BETWEEN ? AND ? + AND s.site_status <> CASE + WHEN e.state = 'Down' THEN 2 + WHEN e.state = 'Seems Down' THEN 0 + ELSE 1 + END + ORDER BY s.bucket_no ASC, s.blog_id ASC + LIMIT ?`, + bucketMin, bucketMax, limit, + ) + if err != nil { + return nil, fmt.Errorf("list projection drift: %w", err) + } + defer rows.Close() + + var out []ProjectionDriftRow + for rows.Next() { + var row ProjectionDriftRow + var eventID sql.NullInt64 + var eventState sql.NullString + if err := rows.Scan( + &row.BlogID, + &row.BucketNo, + &row.SiteStatus, + &row.ExpectedStatus, + &eventID, + &eventState, + ); err != nil { + return nil, fmt.Errorf("scan projection drift: %w", err) + } + if eventID.Valid { + v := eventID.Int64 + row.EventID = &v + } + if eventState.Valid { + v := eventState.String + row.EventState = &v + } + out = append(out, row) + } + return out, rows.Err() +} + // MarkSiteChecked records when a site was last checked. func MarkSiteChecked(ctx context.Context, blogID int64, checkedAt time.Time) error { _, err := db.ExecContext(ctx, @@ -204,6 +336,23 @@ func ReleaseHost(ctx context.Context, hostID string) error { return err } +// HostRowExists reports whether a host currently has a jetmon_hosts ownership +// row. +func HostRowExists(ctx context.Context, hostID string) (bool, error) { + var exists int + err := db.QueryRowContext(ctx, + `SELECT 1 FROM jetmon_hosts WHERE host_id = ? LIMIT 1`, + hostID, + ).Scan(&exists) + if errors.Is(err, sql.ErrNoRows) { + return false, nil + } + if err != nil { + return false, fmt.Errorf("check host row: %w", err) + } + return true, nil +} + // GetAllHosts returns all rows from jetmon_hosts for operator visibility. 
func GetAllHosts() ([]HostRow, error) { rows, err := db.Query( diff --git a/internal/db/queries_test.go b/internal/db/queries_test.go index 48877b9d..bf6f7310 100644 --- a/internal/db/queries_test.go +++ b/internal/db/queries_test.go @@ -1,8 +1,12 @@ package db import ( + "context" "reflect" "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" ) func TestAssignBucketRanges(t *testing.T) { @@ -66,3 +70,361 @@ func TestAssignBucketRanges(t *testing.T) { }) } } + +func withMockDB(t *testing.T) (sqlmock.Sqlmock, func()) { + t.Helper() + mockDB, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + orig := db + db = mockDB + cleanup := func() { + db = orig + _ = mockDB.Close() + } + return mock, cleanup +} + +func TestGlobalDBAccessors(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectPing() + if DB() == nil { + t.Fatal("DB() = nil") + } + if err := Ping(); err != nil { + t.Fatalf("Ping: %v", err) + } + if Hostname() == "" { + t.Fatal("Hostname() returned empty string") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestGetSitesForBucketScansRowsAndDefaultRedirectPolicy(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + now := time.Now().UTC() + rows := sqlmock.NewRows([]string{ + "jetpack_monitor_site_id", "blog_id", "bucket_no", "monitor_url", + "monitor_active", "site_status", "last_status_change", "check_interval", "last_checked_at", + "ssl_expiry_date", "check_keyword", "maintenance_start", "maintenance_end", + "custom_headers", "timeout_seconds", "redirect_policy", "alert_cooldown_minutes", "last_alert_sent_at", + }).AddRow( + int64(1), int64(42), 7, "https://site.example", + true, 1, now, 5, now, + nil, nil, nil, nil, + nil, nil, nil, nil, nil, + ) + mock.ExpectQuery("SELECT"). + WithArgs(0, 99, 50). + WillReturnRows(rows) + + sites, err := GetSitesForBucket(context.Background(), 0, 99, 50, false) + if err != nil { + t.Fatalf("GetSitesForBucket: %v", err) + } + if len(sites) != 1 { + t.Fatalf("sites len = %d, want 1", len(sites)) + } + if sites[0].BlogID != 42 || sites[0].RedirectPolicy != "follow" { + t.Fatalf("site = %+v", sites[0]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestCountActiveSitesForBucketRange(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery("SELECT COUNT"). + WithArgs(10, 19). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(42)) + + count, err := CountActiveSitesForBucketRange(context.Background(), 10, 19) + if err != nil { + t.Fatalf("CountActiveSitesForBucketRange: %v", err) + } + if count != 42 { + t.Fatalf("CountActiveSitesForBucketRange = %d, want 42", count) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestSimpleMutationQueries(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + now := time.Now().UTC() + mock.ExpectExec("UPDATE jetpack_monitor_sites SET site_status"). + WithArgs(2, now, int64(42)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetpack_monitor_sites SET last_checked_at"). + WithArgs(now, int64(42)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetpack_monitor_sites SET last_alert_sent_at"). + WithArgs(now, int64(42)). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetpack_monitor_sites SET ssl_expiry_date"). + WithArgs(now, int64(42)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetmon_hosts SET last_heartbeat"). + WithArgs("host-a"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetmon_hosts SET status = 'draining'"). + WithArgs("host-a"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("DELETE FROM jetmon_hosts"). + WithArgs("host-a"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO jetmon_false_positives"). + WithArgs(int64(42), 500, 1, int64(123)). + WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectExec("INSERT INTO jetmon_check_history"). + WithArgs(int64(42), 200, 0, int64(100), int64(1), int64(2), int64(3), int64(4)). + WillReturnResult(sqlmock.NewResult(1, 1)) + + if err := UpdateSiteStatus(context.Background(), 42, 2, now); err != nil { + t.Fatalf("UpdateSiteStatus: %v", err) + } + if err := MarkSiteChecked(context.Background(), 42, now); err != nil { + t.Fatalf("MarkSiteChecked: %v", err) + } + if err := UpdateLastAlertSent(context.Background(), 42, now); err != nil { + t.Fatalf("UpdateLastAlertSent: %v", err) + } + if err := UpdateSSLExpiry(context.Background(), 42, now); err != nil { + t.Fatalf("UpdateSSLExpiry: %v", err) + } + if err := Heartbeat(context.Background(), "host-a"); err != nil { + t.Fatalf("Heartbeat: %v", err) + } + if err := MarkHostDraining(context.Background(), "host-a"); err != nil { + t.Fatalf("MarkHostDraining: %v", err) + } + if err := ReleaseHost(context.Background(), "host-a"); err != nil { + t.Fatalf("ReleaseHost: %v", err) + } + if err := RecordFalsePositive(42, 500, 1, 123); err != nil { + t.Fatalf("RecordFalsePositive: %v", err) + } + if err := RecordCheckHistory(42, 200, 0, 100, 1, 2, 3, 4); err != nil { + t.Fatalf("RecordCheckHistory: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestUpdateSiteStatusTx(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + now := time.Now().UTC() + mock.ExpectBegin() + mock.ExpectExec("UPDATE jetpack_monitor_sites SET site_status"). + WithArgs(2, now, int64(42)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + + tx, err := db.Begin() + if err != nil { + t.Fatalf("Begin: %v", err) + } + if err := UpdateSiteStatusTx(context.Background(), tx, 42, 2, now); err != nil { + t.Fatalf("UpdateSiteStatusTx: %v", err) + } + if err := tx.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestHostRowExists(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery("SELECT 1 FROM jetmon_hosts"). + WithArgs("host-a"). + WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(1)) + mock.ExpectQuery("SELECT 1 FROM jetmon_hosts"). + WithArgs("host-b"). 
+ WillReturnRows(sqlmock.NewRows([]string{"exists"})) + + exists, err := HostRowExists(context.Background(), "host-a") + if err != nil { + t.Fatalf("HostRowExists(host-a): %v", err) + } + if !exists { + t.Fatal("HostRowExists(host-a) = false, want true") + } + + exists, err = HostRowExists(context.Background(), "host-b") + if err != nil { + t.Fatalf("HostRowExists(host-b): %v", err) + } + if exists { + t.Fatal("HostRowExists(host-b) = true, want false") + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestCountLegacyProjectionDrift(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery("SELECT COUNT"). + WithArgs(0, 99). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(3)) + + count, err := CountLegacyProjectionDrift(context.Background(), 0, 99) + if err != nil { + t.Fatalf("CountLegacyProjectionDrift: %v", err) + } + if count != 3 { + t.Fatalf("CountLegacyProjectionDrift = %d, want 3", count) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestListLegacyProjectionDrift(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery("SELECT s.blog_id"). + WithArgs(0, 99, 50). + WillReturnRows(sqlmock.NewRows([]string{ + "blog_id", "bucket_no", "site_status", "expected_status", "id", "state", + }). + AddRow(int64(42), 7, 1, 2, int64(123), "Down"). + AddRow(int64(43), 8, 0, 1, nil, nil)) + + rows, err := ListLegacyProjectionDrift(context.Background(), 0, 99, 0) + if err != nil { + t.Fatalf("ListLegacyProjectionDrift: %v", err) + } + if len(rows) != 2 { + t.Fatalf("rows len = %d, want 2", len(rows)) + } + if rows[0].BlogID != 42 || rows[0].BucketNo != 7 || rows[0].SiteStatus != 1 || rows[0].ExpectedStatus != 2 { + t.Fatalf("row 0 = %+v", rows[0]) + } + if rows[0].EventID == nil || *rows[0].EventID != 123 { + t.Fatalf("row 0 EventID = %v, want 123", rows[0].EventID) + } + if rows[0].EventState == nil || *rows[0].EventState != "Down" { + t.Fatalf("row 0 EventState = %v, want Down", rows[0].EventState) + } + if rows[1].EventID != nil || rows[1].EventState != nil { + t.Fatalf("row 1 event fields = %+v, want nil", rows[1]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestGetAllHostsScansRows(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + now := time.Now().UTC() + mock.ExpectQuery("SELECT host_id, bucket_min, bucket_max"). + WillReturnRows(sqlmock.NewRows([]string{"host_id", "bucket_min", "bucket_max", "last_heartbeat", "status"}). + AddRow("host-a", 0, 49, now, "active"). + AddRow("host-b", 50, 99, now, "draining")) + + hosts, err := GetAllHosts() + if err != nil { + t.Fatalf("GetAllHosts: %v", err) + } + if len(hosts) != 2 || hosts[1].Status != "draining" { + t.Fatalf("hosts = %+v", hosts) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestClaimBucketsRebalancesKnownHosts(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectBegin() + mock.ExpectExec("DELETE FROM jetmon_hosts"). + WithArgs(60, "host-b"). + WillReturnResult(sqlmock.NewResult(0, 0)) + mock.ExpectQuery("SELECT host_id FROM jetmon_hosts"). + WithArgs("host-b"). + WillReturnRows(sqlmock.NewRows([]string{"host_id"}).AddRow("host-a")) + mock.ExpectExec("INSERT INTO jetmon_hosts"). + WithArgs("host-a", 0, 4). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO jetmon_hosts"). + WithArgs("host-b", 5, 9). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + + minBucket, maxBucket, err := ClaimBuckets("host-b", 10, 10, 60) + if err != nil { + t.Fatalf("ClaimBuckets: %v", err) + } + if minBucket != 5 || maxBucket != 9 { + t.Fatalf("claimed range = %d..%d, want 5..9", minBucket, maxBucket) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestMigrateAppliesOnlyPendingMigrations(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + origMigrations := migrations + migrations = []migration{ + {id: 1, sql: "CREATE TABLE jetmon_schema_migrations"}, + {id: 2, sql: "ALTER TABLE already_done"}, + {id: 3, sql: "ALTER TABLE pending_change"}, + } + defer func() { migrations = origMigrations }() + + mock.ExpectExec("CREATE TABLE jetmon_schema_migrations"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT IGNORE INTO jetmon_schema_migrations"). + WithArgs(1). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery("SELECT COUNT"). + WithArgs(2). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + mock.ExpectQuery("SELECT COUNT"). + WithArgs(3). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + mock.ExpectExec("ALTER TABLE pending_change"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT IGNORE INTO jetmon_schema_migrations"). + WithArgs(3). + WillReturnResult(sqlmock.NewResult(0, 1)) + + if err := Migrate(); err != nil { + t.Fatalf("Migrate: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} diff --git a/internal/db/site_tenants.go b/internal/db/site_tenants.go new file mode 100644 index 00000000..db8b2b26 --- /dev/null +++ b/internal/db/site_tenants.go @@ -0,0 +1,75 @@ +package db + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strings" +) + +// SiteTenantMapping links one gateway/customer tenant to one monitored site. +// The mapping is many-to-many so gateway-side shared ownership or delegated +// access does not require changing the legacy site row. +type SiteTenantMapping struct { + TenantID string + BlogID int64 +} + +// UpsertSiteTenantMappings inserts or refreshes site tenant mappings from a +// gateway-owned source of truth. It intentionally does not delete mappings; +// pruning requires a source-specific reconciliation policy. +func UpsertSiteTenantMappings(ctx context.Context, conn *sql.DB, mappings []SiteTenantMapping, source string) (int64, error) { + if conn == nil { + return 0, errors.New("db is nil") + } + source = strings.TrimSpace(source) + if source == "" { + source = "gateway" + } + if len(mappings) == 0 { + return 0, nil + } + + tx, err := conn.BeginTx(ctx, nil) + if err != nil { + return 0, fmt.Errorf("begin site tenant import: %w", err) + } + defer tx.Rollback() + + stmt, err := tx.PrepareContext(ctx, ` + INSERT INTO jetmon_site_tenants (tenant_id, blog_id, source) + VALUES (?, ?, ?) 
+ ON DUPLICATE KEY UPDATE + source = VALUES(source), + updated_at = CURRENT_TIMESTAMP`) + if err != nil { + return 0, fmt.Errorf("prepare site tenant import: %w", err) + } + defer stmt.Close() + + var affected int64 + for _, m := range mappings { + tenantID := strings.TrimSpace(m.TenantID) + if tenantID == "" { + return 0, errors.New("tenant id is required") + } + if m.BlogID <= 0 { + return 0, fmt.Errorf("blog id must be positive for tenant %q", tenantID) + } + res, err := stmt.ExecContext(ctx, tenantID, m.BlogID, source) + if err != nil { + return 0, fmt.Errorf("upsert site tenant mapping tenant=%q blog_id=%d: %w", tenantID, m.BlogID, err) + } + n, err := res.RowsAffected() + if err != nil { + return 0, fmt.Errorf("read site tenant import result: %w", err) + } + affected += n + } + + if err := tx.Commit(); err != nil { + return 0, fmt.Errorf("commit site tenant import: %w", err) + } + return affected, nil +} diff --git a/internal/db/site_tenants_test.go b/internal/db/site_tenants_test.go new file mode 100644 index 00000000..c7e08cc1 --- /dev/null +++ b/internal/db/site_tenants_test.go @@ -0,0 +1,53 @@ +package db + +import ( + "context" + "testing" + + "github.com/DATA-DOG/go-sqlmock" +) + +func TestUpsertSiteTenantMappings(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectBegin() + prep := mock.ExpectPrepare("INSERT INTO jetmon_site_tenants") + prep.ExpectExec(). + WithArgs("tenant-a", int64(42), "gateway"). + WillReturnResult(sqlmock.NewResult(0, 1)) + prep.ExpectExec(). + WithArgs("tenant-b", int64(43), "gateway"). + WillReturnResult(sqlmock.NewResult(0, 2)) + mock.ExpectCommit() + + affected, err := UpsertSiteTenantMappings(context.Background(), DB(), []SiteTenantMapping{ + {TenantID: "tenant-a", BlogID: 42}, + {TenantID: "tenant-b", BlogID: 43}, + }, "") + if err != nil { + t.Fatalf("UpsertSiteTenantMappings: %v", err) + } + if affected != 3 { + t.Fatalf("affected = %d, want 3", affected) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestUpsertSiteTenantMappingsValidatesInput(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectBegin() + mock.ExpectPrepare("INSERT INTO jetmon_site_tenants") + mock.ExpectRollback() + + _, err := UpsertSiteTenantMappings(context.Background(), DB(), []SiteTenantMapping{ + {TenantID: " ", BlogID: 42}, + }, "gateway") + if err == nil { + t.Fatal("UpsertSiteTenantMappings accepted empty tenant id") + } +} diff --git a/internal/deliverer/deliverer.go b/internal/deliverer/deliverer.go new file mode 100644 index 00000000..fdca0879 --- /dev/null +++ b/internal/deliverer/deliverer.go @@ -0,0 +1,108 @@ +// Package deliverer owns outbound delivery worker wiring. +package deliverer + +import ( + "database/sql" + "log" + + "github.com/Automattic/jetmon/internal/alerting" + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/webhooks" +) + +// Config is the runtime wiring needed by the outbound deliverer. +type Config struct { + DB *sql.DB + InstanceID string + Dispatchers map[alerting.Transport]alerting.Dispatcher + Logger *log.Logger +} + +// Runtime holds the active delivery workers. +type Runtime struct { + hookWorker *webhooks.Worker + alertWorker *alerting.Worker + logger *log.Logger +} + +// Start launches webhook and alert-contact delivery workers. 
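// A minimal wiring sketch (sqlDB, hostname, and cfg are the caller's own
// values; Logger may be left nil to fall back to log.Default()):
//
//	rt := deliverer.Start(deliverer.Config{
//		DB:          sqlDB,
//		InstanceID:  hostname,
//		Dispatchers: deliverer.BuildAlertDispatchers(cfg),
//	})
//	defer rt.Stop()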
+func Start(cfg Config) *Runtime { + logger := cfg.Logger + if logger == nil { + logger = log.Default() + } + + hookWorker := webhooks.NewWorker(webhooks.WorkerConfig{ + DB: cfg.DB, + InstanceID: cfg.InstanceID, + }) + hookWorker.Start() + logger.Println("webhooks: delivery worker started") + + alertWorker := alerting.NewWorker(alerting.WorkerConfig{ + DB: cfg.DB, + InstanceID: cfg.InstanceID, + Dispatchers: cfg.Dispatchers, + }) + alertWorker.Start() + logger.Printf("alerting: delivery worker started (transports=%d)", len(cfg.Dispatchers)) + + return &Runtime{ + hookWorker: hookWorker, + alertWorker: alertWorker, + logger: logger, + } +} + +// Stop drains both delivery workers. +func (r *Runtime) Stop() { + if r == nil { + return + } + if r.hookWorker != nil { + r.hookWorker.Stop() + r.logger.Println("webhooks: delivery worker stopped") + } + if r.alertWorker != nil { + r.alertWorker.Stop() + r.logger.Println("alerting: delivery worker stopped") + } +} + +// BuildAlertDispatchers constructs the per-transport Dispatcher map +// from runtime config. Always returns the three webhook-shaped +// transports (PagerDuty, Slack, Teams) because they have no per-instance +// config beyond the destination credential stored on each alert contact. +// Email is selected with EMAIL_TRANSPORT: "wpcom"/"smtp" wire the +// corresponding sender, and "stub" or empty falls back to log-only. +func BuildAlertDispatchers(cfg *config.Config) map[alerting.Transport]alerting.Dispatcher { + out := map[alerting.Transport]alerting.Dispatcher{ + alerting.TransportPagerDuty: &alerting.PagerDutyDispatcher{}, + alerting.TransportSlack: &alerting.SlackDispatcher{}, + alerting.TransportTeams: &alerting.TeamsDispatcher{}, + } + + var sender alerting.Sender + switch cfg.EmailTransport { + case "wpcom": + sender = &alerting.WPCOMSender{ + Endpoint: cfg.WPCOMEmailEndpoint, + AuthToken: cfg.WPCOMEmailAuthToken, + } + log.Printf("alerting/email: using wpcom sender (endpoint=%s)", cfg.WPCOMEmailEndpoint) + case "smtp": + sender = &alerting.SMTPSender{ + Host: cfg.SMTPHost, + Port: cfg.SMTPPort, + Username: cfg.SMTPUsername, + Password: cfg.SMTPPassword, + UseTLS: cfg.SMTPUseTLS, + } + log.Printf("alerting/email: using smtp sender (%s:%d)", cfg.SMTPHost, cfg.SMTPPort) + default: + sender = &alerting.StubSender{} + log.Println("alerting/email: using stub sender (set EMAIL_TRANSPORT to enable real delivery)") + } + out[alerting.TransportEmail] = alerting.NewEmailDispatcher(sender, cfg.EmailFrom) + return out +} diff --git a/internal/eventstore/eventstore.go b/internal/eventstore/eventstore.go new file mode 100644 index 00000000..5a1032fa --- /dev/null +++ b/internal/eventstore/eventstore.go @@ -0,0 +1,715 @@ +// Package eventstore is the sole writer for jetmon_events and jetmon_event_transitions. +// +// Site state in Jetmon is event-sourced across two tables: +// +// - jetmon_events holds the current state of every incident — one row per +// (blog_id, endpoint_id, check_type, discriminator) tuple while open, mutable +// until ended_at is set, then frozen. +// - jetmon_event_transitions is the append-only history of every mutation made +// to a jetmon_events row. One row per change. Never updated, never deleted. +// +// The load-bearing invariant is: every mutation to jetmon_events writes exactly +// one row into jetmon_event_transitions, in the same database transaction. This +// package enforces that by being the only writer for both tables. External +// callers go through Open, UpdateSeverity, UpdateState, LinkCause, and Close. 
+// +// Two API surfaces: +// +// - Store.Open / Store.Promote / Store.Close (etc.) — each opens its own +// transaction, performs the event mutation + transition write, and commits. +// Use these when the event mutation is the only DB write. +// +// - Store.Begin → *Tx → Tx.Open / Tx.Promote / Tx.Close (etc.) → Tx.Commit — +// caller controls transaction boundaries, can run additional SQL on the +// same transaction (e.g. updating jetpack_monitor_sites.site_status as a +// v1 projection alongside the event write). +// +// See EVENTS.md for the full design rationale and TAXONOMY.md for the data model. +package eventstore + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" +) + +// State labels written to jetmon_events.state and jetmon_event_transitions.state_*. +// The state column is VARCHAR(32) rather than ENUM so new states can be added in +// code without a schema migration. +const ( + StateUp = "Up" + StateWarning = "Warning" + StateDegraded = "Degraded" + StateSeemsDown = "Seems Down" + StateDown = "Down" + StatePaused = "Paused" + StateMaintenance = "Maintenance" + StateUnknown = "Unknown" + StateResolved = "Resolved" +) + +// Severity is the numeric, ordered companion to State. Higher = worse. Stored +// as TINYINT UNSIGNED so values 0–255 are valid; the canonical scale below +// covers the lifecycle states. Severity moves independently of state — a +// degradation worsening bumps severity without changing state, and severity +// values above SeverityDown can be reserved for future "worse than down" +// signals (e.g. data loss, security compromise) without breaking rollup. +const ( + SeverityUp uint8 = 0 + SeverityWarning uint8 = 1 + SeverityDegraded uint8 = 2 + SeveritySeemsDown uint8 = 3 + SeverityDown uint8 = 4 +) + +// Transition reasons written to jetmon_event_transitions.reason. The closed-event +// reasons are also written to jetmon_events.resolution_reason on Close. +const ( + ReasonOpened = "opened" + ReasonSeverityEscalation = "severity_escalation" + ReasonSeverityDeescalation = "severity_deescalation" + ReasonStateChange = "state_change" + ReasonVerifierConfirmed = "verifier_confirmed" + ReasonVerifierCleared = "verifier_cleared" + ReasonProbeCleared = "probe_cleared" + ReasonFalseAlarm = "false_alarm" + ReasonManualOverride = "manual_override" + ReasonMaintenanceSwallowed = "maintenance_swallowed" + ReasonSuperseded = "superseded" + ReasonAutoTimeout = "auto_timeout" + ReasonCauseLinked = "cause_linked" + ReasonCauseUnlinked = "cause_unlinked" +) + +// ErrEventClosed is returned when a caller attempts to mutate an event that is +// already closed (ended_at IS NOT NULL). Closed events are immutable. +var ErrEventClosed = errors.New("eventstore: event is closed") + +// ErrEventNotFound is returned when a caller references an event id that does +// not exist. +var ErrEventNotFound = errors.New("eventstore: event not found") + +// Identity is the dedup tuple for an event. Two open events cannot share the +// same Identity — the schema's dedup_key + UNIQUE INDEX enforces this. +type Identity struct { + BlogID int64 + EndpointID *int64 // nil for site-level checks (DNS, TLS expiry, domain) + CheckType string + Discriminator string // empty when the (blog, endpoint, check_type) is single-failure +} + +// OpenInput carries the fields needed to open (or reopen) an event. 
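// For illustration, the projection-coupled flow from the package comment —
// open (or dedup into) the event and update the legacy site_status row in the
// same transaction. Blog id, status value, and the elided error handling are
// sketch-level only:
//
//	tx, _ := store.Begin(ctx)
//	defer tx.Rollback()
//	_, _ = tx.Open(ctx, OpenInput{
//		Identity: Identity{BlogID: 42, CheckType: "http"},
//		Severity: SeveritySeemsDown,
//		State:    StateSeemsDown,
//		Source:   "local",
//	})
//	_ = db.UpdateSiteStatusTx(ctx, tx.Tx(), 42, 0, time.Now().UTC())
//	_ = tx.Commit()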
+type OpenInput struct { + Identity Identity + Severity uint8 + State string + Source string // who detected the failure: "local", "veriflier:us-west", … + Metadata json.RawMessage // optional check-type-specific payload +} + +// OpenResult describes the outcome of an Open call. +type OpenResult struct { + EventID int64 + Opened bool // true if a new event was inserted; false if an existing open event matched the identity + CurrentSeverity uint8 // severity on the event row after the call + CurrentState string // state on the event row after the call +} + +// Store is the sole writer for jetmon_events and jetmon_event_transitions. +type Store struct { + db *sql.DB +} + +// New returns a Store backed by the given database handle. A nil db is allowed +// (writes become no-ops) so packages that depend on Store can still construct +// in tests where the database isn't available. +func New(db *sql.DB) *Store { + return &Store{db: db} +} + +// Tx wraps a single database transaction and exposes the same event-mutation +// API as Store, but without committing. Callers who need to coordinate event +// writes with other SQL (e.g. updating a v1 projection like +// jetpack_monitor_sites.site_status) start a Tx, perform the event mutation, +// run their other writes via Tx.Tx().Exec(...), then Commit. +// +// A Tx returned from a nil-db Store is itself a no-op shell; all methods +// short-circuit and Commit/Rollback are safe to call. +type Tx struct { + tx *sql.Tx // nil when Store had no db +} + +// Begin starts a new transaction. Caller must Commit or Rollback. Calling on a +// nil-db Store returns an empty Tx whose methods are no-ops. +func (s *Store) Begin(ctx context.Context) (*Tx, error) { + if s.db == nil { + return &Tx{}, nil + } + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return nil, fmt.Errorf("begin tx: %w", err) + } + return &Tx{tx: tx}, nil +} + +// Tx returns the underlying *sql.Tx so the caller can run additional SQL on +// the same transaction. Returns nil when the Tx is in nil-db mode. +func (t *Tx) Tx() *sql.Tx { return t.tx } + +// Commit commits the transaction. No-op in nil-db mode. +func (t *Tx) Commit() error { + if t.tx == nil { + return nil + } + return t.tx.Commit() +} + +// Rollback rolls back the transaction. No-op in nil-db mode. Safe to call +// after Commit (the underlying sql.ErrTxDone is swallowed) so it composes +// with `defer tx.Rollback()`. +func (t *Tx) Rollback() error { + if t.tx == nil { + return nil + } + if err := t.tx.Rollback(); err != nil && !errors.Is(err, sql.ErrTxDone) { + return err + } + return nil +} + +// Open opens a new event for the given identity, or returns the existing open +// event's id if one already exists. Idempotent — repeated calls with the same +// identity return the same event id and only write one "opened" transition +// row (the one for the actual insert). +// +// Severity escalation on a re-detection should go through UpdateSeverity, not +// through repeated Opens. +func (t *Tx) Open(ctx context.Context, in OpenInput) (OpenResult, error) { + if t.tx == nil { + return OpenResult{}, nil + } + if in.Identity.CheckType == "" { + return OpenResult{}, errors.New("eventstore: Open requires CheckType") + } + if in.State == "" { + return OpenResult{}, errors.New("eventstore: Open requires State") + } + + // LAST_INSERT_ID(id) on the UPDATE branch makes the driver return the + // existing row's id. RowsAffected is 1 on insert, 2 on update (per the + // MySQL driver convention). We only write an "opened" transition on insert. 
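	// Concretely (values illustrative): if blog 42 already has an open "http"
	// event, migration 10's generated dedup_key — CONCAT_WS(':', 42, 0, 'http', '')
	// = "42:0:http:" — collides, the UPDATE branch fires, LastInsertId() reports
	// the existing row's id, and no new "opened" transition row is written.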
+ res, err := t.tx.ExecContext(ctx, ` + INSERT INTO jetmon_events + (blog_id, endpoint_id, check_type, discriminator, severity, state, metadata) + VALUES (?, ?, ?, ?, ?, ?, ?) + ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id)`, + in.Identity.BlogID, + nullableEndpoint(in.Identity.EndpointID), + in.Identity.CheckType, + nullableDiscriminator(in.Identity.Discriminator), + in.Severity, + in.State, + nullableJSON(in.Metadata), + ) + if err != nil { + return OpenResult{}, fmt.Errorf("insert event: %w", err) + } + eventID, err := res.LastInsertId() + if err != nil { + return OpenResult{}, fmt.Errorf("last insert id: %w", err) + } + rowsAffected, err := res.RowsAffected() + if err != nil { + return OpenResult{}, fmt.Errorf("rows affected: %w", err) + } + opened := rowsAffected == 1 + + var currentSeverity uint8 + var currentState string + if opened { + currentSeverity = in.Severity + currentState = in.State + sev := in.Severity + if err := writeTransition(ctx, t.tx, transitionInput{ + eventID: eventID, + blogID: in.Identity.BlogID, + severityBefore: nil, + severityAfter: &sev, + stateBefore: "", + stateAfter: in.State, + reason: ReasonOpened, + source: in.Source, + metadata: in.Metadata, + }); err != nil { + return OpenResult{}, err + } + } else { + // Existing open event matched. Read its current severity/state so the + // caller can decide whether to follow up with UpdateSeverity/UpdateState. + if err := t.tx.QueryRowContext(ctx, + `SELECT severity, state FROM jetmon_events WHERE id = ?`, eventID, + ).Scan(¤tSeverity, ¤tState); err != nil { + return OpenResult{}, fmt.Errorf("read existing event: %w", err) + } + } + + return OpenResult{ + EventID: eventID, + Opened: opened, + CurrentSeverity: currentSeverity, + CurrentState: currentState, + }, nil +} + +// UpdateSeverity changes the severity of an open event. If the new severity +// equals the current one, no row is written and (false, nil) is returned. +func (t *Tx) UpdateSeverity(ctx context.Context, eventID int64, newSeverity uint8, reason, source string, metadata json.RawMessage) (bool, error) { + if t.tx == nil { + return false, nil + } + return t.mutate(ctx, eventID, mutation{ + severityAfter: &newSeverity, + reason: reason, + source: source, + metadata: metadata, + }) +} + +// UpdateState changes the lifecycle state of an open event (e.g., +// Seems Down → Down on verifier confirmation). If the new state equals the +// current one, no row is written. +func (t *Tx) UpdateState(ctx context.Context, eventID int64, newState, reason, source string, metadata json.RawMessage) (bool, error) { + if t.tx == nil { + return false, nil + } + return t.mutate(ctx, eventID, mutation{ + stateAfter: &newState, + reason: reason, + source: source, + metadata: metadata, + }) +} + +// Promote bumps state and severity together with one transition row. Used for +// the common "verifier confirms a Seems Down event as Down" path. +func (t *Tx) Promote(ctx context.Context, eventID int64, newSeverity uint8, newState, reason, source string, metadata json.RawMessage) (bool, error) { + if t.tx == nil { + return false, nil + } + return t.mutate(ctx, eventID, mutation{ + severityAfter: &newSeverity, + stateAfter: &newState, + reason: reason, + source: source, + metadata: metadata, + }) +} + +// LinkCause sets or clears the cause_event_id on an open event. Passing 0 (or +// a negative value) clears the existing link. 
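// For illustration (event ids are hypothetical):
//
//	changed, err := tx.LinkCause(ctx, httpEventID, dnsEventID, "local") // link to suspected cause
//	changed, err = tx.LinkCause(ctx, httpEventID, 0, "local")           // clear the link again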
+func (t *Tx) LinkCause(ctx context.Context, eventID, causeEventID int64, source string) (bool, error) { + if t.tx == nil { + return false, nil + } + cur, err := readEventForUpdate(ctx, t.tx, eventID) + if err != nil { + return false, err + } + if cur.endedAt.Valid { + return false, ErrEventClosed + } + + var newCause sql.NullInt64 + if causeEventID > 0 { + newCause = sql.NullInt64{Int64: causeEventID, Valid: true} + } + if cur.causeEventID == newCause { + return false, nil + } + + if _, err := t.tx.ExecContext(ctx, + `UPDATE jetmon_events SET cause_event_id = ? WHERE id = ?`, + nullableInt64(newCause), eventID, + ); err != nil { + return false, fmt.Errorf("update cause: %w", err) + } + + reason := ReasonCauseLinked + if !newCause.Valid { + reason = ReasonCauseUnlinked + } + meta, err := json.Marshal(map[string]any{ + "cause_event_id_before": nullableInt64ToAny(cur.causeEventID), + "cause_event_id_after": nullableInt64ToAny(newCause), + }) + if err != nil { + return false, fmt.Errorf("marshal cause metadata: %w", err) + } + if err := writeTransition(ctx, t.tx, transitionInput{ + eventID: eventID, + blogID: cur.blogID, + severityBefore: &cur.severity, + severityAfter: &cur.severity, + stateBefore: cur.state, + stateAfter: cur.state, + reason: reason, + source: source, + metadata: meta, + }); err != nil { + return false, err + } + return true, nil +} + +// Close marks an open event as resolved. resolutionReason is recorded on the +// event row and used as the transition reason. Closing an already-closed event +// returns ErrEventClosed; closing a missing event returns ErrEventNotFound. +func (t *Tx) Close(ctx context.Context, eventID int64, resolutionReason, source string, metadata json.RawMessage) error { + if t.tx == nil { + return nil + } + if resolutionReason == "" { + return errors.New("eventstore: Close requires resolutionReason") + } + cur, err := readEventForUpdate(ctx, t.tx, eventID) + if err != nil { + return err + } + if cur.endedAt.Valid { + return ErrEventClosed + } + + if _, err := t.tx.ExecContext(ctx, ` + UPDATE jetmon_events + SET ended_at = CURRENT_TIMESTAMP(3), + resolution_reason = ? + WHERE id = ?`, + resolutionReason, eventID, + ); err != nil { + return fmt.Errorf("close event: %w", err) + } + + resolved := StateResolved + return writeTransition(ctx, t.tx, transitionInput{ + eventID: eventID, + blogID: cur.blogID, + severityBefore: &cur.severity, + severityAfter: nil, + stateBefore: cur.state, + stateAfter: resolved, + reason: resolutionReason, + source: source, + metadata: metadata, + }) +} + +// ActiveEvent is the minimal snapshot of an open event needed by callers that +// found it via FindActiveByBlog and now want to close, promote, or otherwise +// mutate it without a second round-trip to read its state. +type ActiveEvent struct { + ID int64 + Severity uint8 + State string +} + +// FindActiveByBlog returns the open event for (blog_id, check_type) — the +// most common lookup the orchestrator needs on recovery. Returns +// ErrEventNotFound if no open event exists. Used when the caller doesn't have +// the event id cached (e.g. a recovery in a round after the open was forgotten +// across a process restart). +func (t *Tx) FindActiveByBlog(ctx context.Context, blogID int64, checkType string) (ActiveEvent, error) { + if t.tx == nil { + return ActiveEvent{}, nil + } + var ae ActiveEvent + err := t.tx.QueryRowContext(ctx, ` + SELECT id, severity, state FROM jetmon_events + WHERE blog_id = ? AND check_type = ? 
AND ended_at IS NULL + ORDER BY started_at ASC + LIMIT 1`, blogID, checkType, + ).Scan(&ae.ID, &ae.Severity, &ae.State) + if errors.Is(err, sql.ErrNoRows) { + return ActiveEvent{}, ErrEventNotFound + } + if err != nil { + return ActiveEvent{}, fmt.Errorf("find active event: %w", err) + } + return ae, nil +} + +// Standalone Store methods are thin wrappers that begin/commit a transaction +// around a single Tx call. Use these when no other writes need to land in the +// same transaction. + +// Open is the standalone (auto-commit) form of Tx.Open. +func (s *Store) Open(ctx context.Context, in OpenInput) (OpenResult, error) { + if s.db == nil { + return OpenResult{}, nil + } + tx, err := s.Begin(ctx) + if err != nil { + return OpenResult{}, err + } + defer func() { _ = tx.Rollback() }() + res, err := tx.Open(ctx, in) + if err != nil { + return OpenResult{}, err + } + if err := tx.Commit(); err != nil { + return OpenResult{}, fmt.Errorf("commit: %w", err) + } + return res, nil +} + +// UpdateSeverity is the standalone form of Tx.UpdateSeverity. +func (s *Store) UpdateSeverity(ctx context.Context, eventID int64, newSeverity uint8, reason, source string, metadata json.RawMessage) (bool, error) { + return s.runTx(ctx, func(tx *Tx) (bool, error) { + return tx.UpdateSeverity(ctx, eventID, newSeverity, reason, source, metadata) + }) +} + +// UpdateState is the standalone form of Tx.UpdateState. +func (s *Store) UpdateState(ctx context.Context, eventID int64, newState, reason, source string, metadata json.RawMessage) (bool, error) { + return s.runTx(ctx, func(tx *Tx) (bool, error) { + return tx.UpdateState(ctx, eventID, newState, reason, source, metadata) + }) +} + +// Promote is the standalone form of Tx.Promote. +func (s *Store) Promote(ctx context.Context, eventID int64, newSeverity uint8, newState, reason, source string, metadata json.RawMessage) (bool, error) { + return s.runTx(ctx, func(tx *Tx) (bool, error) { + return tx.Promote(ctx, eventID, newSeverity, newState, reason, source, metadata) + }) +} + +// LinkCause is the standalone form of Tx.LinkCause. +func (s *Store) LinkCause(ctx context.Context, eventID, causeEventID int64, source string) (bool, error) { + return s.runTx(ctx, func(tx *Tx) (bool, error) { + return tx.LinkCause(ctx, eventID, causeEventID, source) + }) +} + +// Close is the standalone form of Tx.Close. +func (s *Store) Close(ctx context.Context, eventID int64, resolutionReason, source string, metadata json.RawMessage) error { + if s.db == nil { + return nil + } + tx, err := s.Begin(ctx) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + if err := tx.Close(ctx, eventID, resolutionReason, source, metadata); err != nil { + return err + } + if err := tx.Commit(); err != nil { + return fmt.Errorf("commit: %w", err) + } + return nil +} + +func (s *Store) runTx(ctx context.Context, fn func(*Tx) (bool, error)) (bool, error) { + if s.db == nil { + return false, nil + } + tx, err := s.Begin(ctx) + if err != nil { + return false, err + } + defer func() { _ = tx.Rollback() }() + changed, err := fn(tx) + if err != nil { + return false, err + } + if err := tx.Commit(); err != nil { + return false, fmt.Errorf("commit: %w", err) + } + return changed, nil +} + +// mutation captures the pieces of a single severity/state change. severityAfter +// or stateAfter (or both) must be non-nil for a mutation to be written. 
+type mutation struct { + severityAfter *uint8 + stateAfter *string + reason string + source string + metadata json.RawMessage +} + +func (t *Tx) mutate(ctx context.Context, eventID int64, m mutation) (bool, error) { + if m.severityAfter == nil && m.stateAfter == nil { + return false, errors.New("eventstore: mutate requires severityAfter or stateAfter") + } + if m.reason == "" { + return false, errors.New("eventstore: mutate requires reason") + } + + cur, err := readEventForUpdate(ctx, t.tx, eventID) + if err != nil { + return false, err + } + if cur.endedAt.Valid { + return false, ErrEventClosed + } + + severityChanged := m.severityAfter != nil && *m.severityAfter != cur.severity + stateChanged := m.stateAfter != nil && *m.stateAfter != cur.state + if !severityChanged && !stateChanged { + // No-op — do not write a transition row. + return false, nil + } + + switch { + case severityChanged && stateChanged: + _, err = t.tx.ExecContext(ctx, + `UPDATE jetmon_events SET severity = ?, state = ? WHERE id = ?`, + *m.severityAfter, *m.stateAfter, eventID) + case severityChanged: + _, err = t.tx.ExecContext(ctx, + `UPDATE jetmon_events SET severity = ? WHERE id = ?`, + *m.severityAfter, eventID) + case stateChanged: + _, err = t.tx.ExecContext(ctx, + `UPDATE jetmon_events SET state = ? WHERE id = ?`, + *m.stateAfter, eventID) + } + if err != nil { + return false, fmt.Errorf("update event: %w", err) + } + + severityBefore := cur.severity + severityAfter := cur.severity + if m.severityAfter != nil { + severityAfter = *m.severityAfter + } + stateAfter := cur.state + if m.stateAfter != nil { + stateAfter = *m.stateAfter + } + if err := writeTransition(ctx, t.tx, transitionInput{ + eventID: eventID, + blogID: cur.blogID, + severityBefore: &severityBefore, + severityAfter: &severityAfter, + stateBefore: cur.state, + stateAfter: stateAfter, + reason: m.reason, + source: m.source, + metadata: m.metadata, + }); err != nil { + return false, err + } + return true, nil +} + +// eventSnapshot is what readEventForUpdate returns: the columns we need to +// validate the mutation and to populate the *_before fields on the transition. +type eventSnapshot struct { + blogID int64 + severity uint8 + state string + endedAt sql.NullTime + causeEventID sql.NullInt64 +} + +func readEventForUpdate(ctx context.Context, tx *sql.Tx, eventID int64) (eventSnapshot, error) { + var snap eventSnapshot + err := tx.QueryRowContext(ctx, ` + SELECT blog_id, severity, state, ended_at, cause_event_id + FROM jetmon_events + WHERE id = ? 
+ FOR UPDATE`, eventID, + ).Scan(&snap.blogID, &snap.severity, &snap.state, &snap.endedAt, &snap.causeEventID) + if errors.Is(err, sql.ErrNoRows) { + return snap, ErrEventNotFound + } + if err != nil { + return snap, fmt.Errorf("read event %d: %w", eventID, err) + } + return snap, nil +} + +type transitionInput struct { + eventID int64 + blogID int64 + severityBefore *uint8 + severityAfter *uint8 + stateBefore string + stateAfter string + reason string + source string + metadata json.RawMessage +} + +func writeTransition(ctx context.Context, tx *sql.Tx, t transitionInput) error { + source := t.source + if source == "" { + source = "local" + } + _, err := tx.ExecContext(ctx, ` + INSERT INTO jetmon_event_transitions + (event_id, blog_id, severity_before, severity_after, + state_before, state_after, reason, source, metadata) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + t.eventID, t.blogID, + nullableUint8(t.severityBefore), nullableUint8(t.severityAfter), + nullableString(t.stateBefore), nullableString(t.stateAfter), + t.reason, source, nullableJSON(t.metadata), + ) + if err != nil { + return fmt.Errorf("insert transition: %w", err) + } + return nil +} + +func nullableEndpoint(p *int64) any { + if p == nil { + return nil + } + return *p +} + +func nullableDiscriminator(s string) any { + if s == "" { + return nil + } + return s +} + +func nullableJSON(b json.RawMessage) any { + if len(b) == 0 { + return nil + } + return []byte(b) +} + +func nullableUint8(p *uint8) any { + if p == nil { + return nil + } + return *p +} + +func nullableString(s string) any { + if s == "" { + return nil + } + return s +} + +func nullableInt64(n sql.NullInt64) any { + if !n.Valid { + return nil + } + return n.Int64 +} + +func nullableInt64ToAny(n sql.NullInt64) any { + if !n.Valid { + return nil + } + return n.Int64 +} diff --git a/internal/eventstore/eventstore_test.go b/internal/eventstore/eventstore_test.go new file mode 100644 index 00000000..00a490dc --- /dev/null +++ b/internal/eventstore/eventstore_test.go @@ -0,0 +1,457 @@ +package eventstore + +import ( + "context" + "database/sql" + "encoding/json" + "testing" + + "github.com/DATA-DOG/go-sqlmock" +) + +func TestNewWithNilDB(t *testing.T) { + s := New(nil) + if s == nil { + t.Fatal("New(nil) returned nil Store") + } + + // All write operations should be no-ops when db is nil. 
+ ctx := context.Background() + + res, err := s.Open(ctx, OpenInput{ + Identity: Identity{BlogID: 1, CheckType: "http"}, + Severity: SeveritySeemsDown, + State: StateSeemsDown, + }) + if err != nil { + t.Fatalf("Open with nil db: %v", err) + } + if res.EventID != 0 || res.Opened { + t.Fatalf("Open with nil db = %+v, want zero", res) + } + + if changed, err := s.UpdateSeverity(ctx, 42, SeverityDown, ReasonSeverityEscalation, "local", nil); err != nil || changed { + t.Fatalf("UpdateSeverity with nil db = (%v, %v)", changed, err) + } + + if changed, err := s.UpdateState(ctx, 42, StateDown, ReasonVerifierConfirmed, "local", nil); err != nil || changed { + t.Fatalf("UpdateState with nil db = (%v, %v)", changed, err) + } + + if changed, err := s.Promote(ctx, 42, SeverityDown, StateDown, ReasonVerifierConfirmed, "local", nil); err != nil || changed { + t.Fatalf("Promote with nil db = (%v, %v)", changed, err) + } + + if changed, err := s.LinkCause(ctx, 42, 99, "local"); err != nil || changed { + t.Fatalf("LinkCause with nil db = (%v, %v)", changed, err) + } + + if err := s.Close(ctx, 42, ReasonVerifierCleared, "local", nil); err != nil { + t.Fatalf("Close with nil db: %v", err) + } +} + +func TestNilDBTxIsNoOp(t *testing.T) { + // Begin on a nil-db Store returns a no-op Tx whose methods all short-circuit + // without touching a database. + s := New(nil) + ctx := context.Background() + + tx, err := s.Begin(ctx) + if err != nil { + t.Fatalf("Begin: %v", err) + } + if tx == nil { + t.Fatal("Begin returned nil Tx") + } + if tx.Tx() != nil { + t.Fatal("nil-db Tx should expose nil *sql.Tx") + } + + // All Tx methods should run without panicking. + res, err := tx.Open(ctx, OpenInput{ + Identity: Identity{BlogID: 1, CheckType: "http"}, + Severity: SeveritySeemsDown, + State: StateSeemsDown, + }) + if err != nil || res.EventID != 0 { + t.Fatalf("Tx.Open with nil db = (%+v, %v)", res, err) + } + if _, err := tx.UpdateSeverity(ctx, 1, SeverityDown, ReasonSeverityEscalation, "local", nil); err != nil { + t.Fatalf("Tx.UpdateSeverity: %v", err) + } + if _, err := tx.Promote(ctx, 1, SeverityDown, StateDown, ReasonVerifierConfirmed, "local", nil); err != nil { + t.Fatalf("Tx.Promote: %v", err) + } + if _, err := tx.UpdateState(ctx, 1, StateDown, ReasonStateChange, "local", nil); err != nil { + t.Fatalf("Tx.UpdateState: %v", err) + } + if _, err := tx.LinkCause(ctx, 1, 2, "local"); err != nil { + t.Fatalf("Tx.LinkCause: %v", err) + } + if err := tx.Close(ctx, 1, ReasonVerifierCleared, "local", nil); err != nil { + t.Fatalf("Tx.Close: %v", err) + } + ae, err := tx.FindActiveByBlog(ctx, 1, "http") + if err != nil { + t.Fatalf("Tx.FindActiveByBlog: %v", err) + } + if ae.ID != 0 { + t.Fatalf("FindActiveByBlog on nil-db = %+v, want zero", ae) + } + + if err := tx.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + // Rollback after Commit should also be a no-op. 
+ if err := tx.Rollback(); err != nil { + t.Fatalf("Rollback after Commit: %v", err) + } +} + +func TestSQLTxBeginCommitAndRollback(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + s := New(db) + ctx := context.Background() + + mock.ExpectBegin() + tx, err := s.Begin(ctx) + if err != nil { + t.Fatalf("Begin for commit: %v", err) + } + if tx.Tx() == nil { + t.Fatal("sql-backed Tx should expose *sql.Tx") + } + mock.ExpectCommit() + if err := tx.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + // Rollback after Commit should swallow sql.ErrTxDone so callers can defer it. + if err := tx.Rollback(); err != nil { + t.Fatalf("Rollback after Commit: %v", err) + } + + mock.ExpectBegin() + tx, err = s.Begin(ctx) + if err != nil { + t.Fatalf("Begin for rollback: %v", err) + } + mock.ExpectRollback() + if err := tx.Rollback(); err != nil { + t.Fatalf("Rollback: %v", err) + } + // A second Rollback after the transaction is closed is also a no-op. + if err := tx.Rollback(); err != nil { + t.Fatalf("second Rollback: %v", err) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +var eventSnapshotColumns = []string{"blog_id", "severity", "state", "ended_at", "cause_event_id"} + +func eventSnapshotRow(blogID int64, severity uint8, state string, cause any) *sqlmock.Rows { + return sqlmock.NewRows(eventSnapshotColumns). + AddRow(blogID, severity, state, nil, cause) +} + +func TestStoreOpenInsertedEventWritesTransition(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO jetmon_events"). + WithArgs(int64(42), nil, "http", nil, SeveritySeemsDown, StateSeemsDown, nil). + WillReturnResult(sqlmock.NewResult(99, 1)) + mock.ExpectExec("INSERT INTO jetmon_event_transitions"). + WithArgs(int64(99), int64(42), nil, SeveritySeemsDown, nil, StateSeemsDown, ReasonOpened, "local", nil). + WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectCommit() + + res, err := New(db).Open(context.Background(), OpenInput{ + Identity: Identity{BlogID: 42, CheckType: "http"}, + Severity: SeveritySeemsDown, + State: StateSeemsDown, + }) + if err != nil { + t.Fatalf("Open: %v", err) + } + if res.EventID != 99 || !res.Opened || res.CurrentSeverity != SeveritySeemsDown || res.CurrentState != StateSeemsDown { + t.Fatalf("Open result = %+v", res) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestStoreOpenExistingEventReadsCurrentState(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO jetmon_events"). + WithArgs(int64(42), nil, "http", nil, SeveritySeemsDown, StateSeemsDown, nil). + WillReturnResult(sqlmock.NewResult(99, 2)) + mock.ExpectQuery("SELECT severity, state FROM jetmon_events"). + WithArgs(int64(99)). 
+ WillReturnRows(sqlmock.NewRows([]string{"severity", "state"}).AddRow(SeverityDown, StateDown)) + mock.ExpectCommit() + + res, err := New(db).Open(context.Background(), OpenInput{ + Identity: Identity{BlogID: 42, CheckType: "http"}, + Severity: SeveritySeemsDown, + State: StateSeemsDown, + }) + if err != nil { + t.Fatalf("Open existing: %v", err) + } + if res.Opened || res.CurrentSeverity != SeverityDown || res.CurrentState != StateDown { + t.Fatalf("Open existing result = %+v", res) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestStoreUpdateSeverityNoopSkipsTransition(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery("SELECT blog_id, severity, state, ended_at, cause_event_id"). + WithArgs(int64(99)). + WillReturnRows(eventSnapshotRow(42, SeverityDown, StateDown, nil)) + mock.ExpectCommit() + + changed, err := New(db).UpdateSeverity(context.Background(), 99, SeverityDown, ReasonSeverityEscalation, "tester", nil) + if err != nil { + t.Fatalf("UpdateSeverity: %v", err) + } + if changed { + t.Fatal("UpdateSeverity reported change for same severity") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestStorePromoteWritesEventAndTransition(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery("SELECT blog_id, severity, state, ended_at, cause_event_id"). + WithArgs(int64(99)). + WillReturnRows(eventSnapshotRow(42, SeveritySeemsDown, StateSeemsDown, nil)) + mock.ExpectExec("UPDATE jetmon_events SET severity"). + WithArgs(SeverityDown, StateDown, int64(99)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO jetmon_event_transitions"). + WithArgs(int64(99), int64(42), SeveritySeemsDown, SeverityDown, StateSeemsDown, StateDown, ReasonVerifierConfirmed, "tester", nil). + WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectCommit() + + changed, err := New(db).Promote(context.Background(), 99, SeverityDown, StateDown, ReasonVerifierConfirmed, "tester", nil) + if err != nil { + t.Fatalf("Promote: %v", err) + } + if !changed { + t.Fatal("Promote reported no change") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestStoreLinkCauseWritesMetadataTransition(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery("SELECT blog_id, severity, state, ended_at, cause_event_id"). + WithArgs(int64(99)). + WillReturnRows(eventSnapshotRow(42, SeverityDown, StateDown, nil)) + mock.ExpectExec("UPDATE jetmon_events SET cause_event_id"). + WithArgs(int64(123), int64(99)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO jetmon_event_transitions"). + WithArgs(int64(99), int64(42), SeverityDown, SeverityDown, StateDown, StateDown, ReasonCauseLinked, "tester", sqlmock.AnyArg()). 
+ WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectCommit() + + changed, err := New(db).LinkCause(context.Background(), 99, 123, "tester") + if err != nil { + t.Fatalf("LinkCause: %v", err) + } + if !changed { + t.Fatal("LinkCause reported no change") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestStoreCloseWritesResolvedTransition(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery("SELECT blog_id, severity, state, ended_at, cause_event_id"). + WithArgs(int64(99)). + WillReturnRows(eventSnapshotRow(42, SeverityDown, StateDown, nil)) + mock.ExpectExec("UPDATE jetmon_events"). + WithArgs(ReasonVerifierCleared, int64(99)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO jetmon_event_transitions"). + WithArgs(int64(99), int64(42), SeverityDown, nil, StateDown, StateResolved, ReasonVerifierCleared, "tester", nil). + WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectCommit() + + if err := New(db).Close(context.Background(), 99, ReasonVerifierCleared, "tester", nil); err != nil { + t.Fatalf("Close: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestTxFindActiveByBlog(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery("SELECT id, severity, state FROM jetmon_events"). + WithArgs(int64(42), "http"). + WillReturnRows(sqlmock.NewRows([]string{"id", "severity", "state"}).AddRow(int64(99), SeverityDown, StateDown)) + mock.ExpectRollback() + + tx, err := New(db).Begin(context.Background()) + if err != nil { + t.Fatalf("Begin: %v", err) + } + active, err := tx.FindActiveByBlog(context.Background(), 42, "http") + if err != nil { + t.Fatalf("FindActiveByBlog: %v", err) + } + if active.ID != 99 || active.Severity != SeverityDown || active.State != StateDown { + t.Fatalf("active = %+v", active) + } + if err := tx.Rollback(); err != nil { + t.Fatalf("Rollback: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestSeverityScale(t *testing.T) { + // Severity is intentionally a small ordered scale; relative ordering matters + // more than the exact numbers, but the constants must agree with what the + // orchestrator and dashboards expect. 
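+	// Elsewhere in this change the concrete values are referenced as Warning=1,
+	// Degraded=2 and Down=4 (see the tls_expiry ladder and promoteToDown docs),
+	// which suggests the presumed scale Up=0, Warning=1, Degraded=2,
+	// SeemsDown=3, Down=4; this test deliberately pins only the ordering.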
+ if SeverityUp >= SeverityWarning || + SeverityWarning >= SeverityDegraded || + SeverityDegraded >= SeveritySeemsDown || + SeveritySeemsDown >= SeverityDown { + t.Fatalf("severity scale not strictly increasing: %d %d %d %d %d", + SeverityUp, SeverityWarning, SeverityDegraded, SeveritySeemsDown, SeverityDown) + } +} + +func TestStateAndReasonConstants(t *testing.T) { + if StateSeemsDown != "Seems Down" { + t.Fatalf("StateSeemsDown = %q, want %q", StateSeemsDown, "Seems Down") + } + if ReasonOpened != "opened" { + t.Fatalf("ReasonOpened = %q, want %q", ReasonOpened, "opened") + } + if ReasonProbeCleared != "probe_cleared" { + t.Fatalf("ReasonProbeCleared = %q, want %q", ReasonProbeCleared, "probe_cleared") + } + if ReasonFalseAlarm != "false_alarm" { + t.Fatalf("ReasonFalseAlarm = %q, want %q", ReasonFalseAlarm, "false_alarm") + } +} + +func TestNullableHelpers(t *testing.T) { + if nullableEndpoint(nil) != nil { + t.Fatal("nullableEndpoint(nil) should be nil") + } + id := int64(7) + if nullableEndpoint(&id) != int64(7) { + t.Fatalf("nullableEndpoint(&7) = %v, want 7", nullableEndpoint(&id)) + } + + if nullableDiscriminator("") != nil { + t.Fatal("nullableDiscriminator(\"\") should be nil") + } + if nullableDiscriminator("abc") != "abc" { + t.Fatal("nullableDiscriminator(\"abc\") should be \"abc\"") + } + + if nullableJSON(nil) != nil { + t.Fatal("nullableJSON(nil) should be nil") + } + if nullableJSON(json.RawMessage("")) != nil { + t.Fatal("nullableJSON(empty) should be nil") + } + if nullableJSON(json.RawMessage(`{"a":1}`)) == nil { + t.Fatal("nullableJSON(non-empty) should not be nil") + } + + if nullableUint8(nil) != nil { + t.Fatal("nullableUint8(nil) should be nil") + } + v := uint8(3) + if nullableUint8(&v) != uint8(3) { + t.Fatalf("nullableUint8(&3) = %v, want 3", nullableUint8(&v)) + } + + if nullableString("") != nil { + t.Fatal("nullableString(\"\") should be nil") + } + if nullableString("x") != "x" { + t.Fatal("nullableString(\"x\") should be \"x\"") + } + + if nullableInt64(sql.NullInt64{}) != nil { + t.Fatal("nullableInt64(invalid) should be nil") + } + validInt := sql.NullInt64{Int64: 12, Valid: true} + if nullableInt64(validInt) != int64(12) { + t.Fatalf("nullableInt64(valid 12) = %v, want 12", nullableInt64(validInt)) + } + if nullableInt64ToAny(sql.NullInt64{}) != nil { + t.Fatal("nullableInt64ToAny(invalid) should be nil") + } + if nullableInt64ToAny(validInt) != int64(12) { + t.Fatalf("nullableInt64ToAny(valid 12) = %v, want 12", nullableInt64ToAny(validInt)) + } +} diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go index 86093461..aa914b5e 100644 --- a/internal/metrics/metrics_test.go +++ b/internal/metrics/metrics_test.go @@ -1,6 +1,12 @@ package metrics -import "testing" +import ( + "bufio" + "net" + "strings" + "testing" + "time" +) func TestSanitize(t *testing.T) { tests := []struct { @@ -34,3 +40,92 @@ func TestWriteStatsFilesDoesNotPanic(t *testing.T) { // ignored by design — just verify this does not panic. 
WriteStatsFiles(10, 5, 1000) } + +func TestClientSendsStatsDMessages(t *testing.T) { + clientConn, serverConn := net.Pipe() + defer clientConn.Close() + defer serverConn.Close() + + c := &Client{ + prefix: "com.jetpack.jetmon.host_name", + conn: clientConn, + } + + lines := make(chan string, 5) + done := make(chan struct{}) + go func() { + defer close(done) + r := bufio.NewReader(serverConn) + for i := 0; i < 5; i++ { + line, err := r.ReadString('\n') + if err != nil { + return + } + lines <- strings.TrimSpace(line) + } + }() + + c.Increment("checks.total", 2) + c.Gauge("queue.depth", 7) + c.Timing("request.rtt", 1500*time.Millisecond) + c.EmitMemStats() + + got := make([]string, 0, 5) + for len(got) < 5 { + select { + case line := <-lines: + got = append(got, line) + case <-time.After(time.Second): + t.Fatalf("timed out waiting for metric lines; got %v", got) + } + } + _ = serverConn.Close() + <-done + + wantPrefix := "com.jetpack.jetmon.host_name." + expected := map[string]bool{ + wantPrefix + "checks.total:2|c": false, + wantPrefix + "queue.depth:7|g": false, + wantPrefix + "request.rtt:1500|ms": false, + } + for _, line := range got { + if _, ok := expected[line]; ok { + expected[line] = true + continue + } + if !strings.HasPrefix(line, wantPrefix+"process.") { + t.Fatalf("unexpected metric line %q in %v", line, got) + } + } + for line, seen := range expected { + if !seen { + t.Fatalf("missing metric line %q in %v", line, got) + } + } +} + +func TestInitSetsGlobalClient(t *testing.T) { + pc, err := net.ListenPacket("udp4", "127.0.0.1:0") + if err != nil { + t.Skipf("udp listener unavailable: %v", err) + } + defer pc.Close() + + orig := global + t.Cleanup(func() { + if global != nil && global.conn != nil { + _ = global.conn.Close() + } + global = orig + }) + + if err := Init(pc.LocalAddr().String(), "my-host.example"); err != nil { + t.Fatalf("Init: %v", err) + } + if Global() == nil { + t.Fatal("Global() = nil after Init") + } + if Global().prefix != "com.jetpack.jetmon.my_host_example" { + t.Fatalf("prefix = %q", Global().prefix) + } +} diff --git a/internal/orchestrator/orchestrator.go b/internal/orchestrator/orchestrator.go index 9914377f..5a40d54d 100644 --- a/internal/orchestrator/orchestrator.go +++ b/internal/orchestrator/orchestrator.go @@ -2,9 +2,12 @@ package orchestrator import ( stdctx "context" + "encoding/json" + "errors" "fmt" "log" runtimemetrics "runtime/metrics" + "strings" "sync" "time" @@ -12,41 +15,77 @@ import ( "github.com/Automattic/jetmon/internal/checker" "github.com/Automattic/jetmon/internal/config" "github.com/Automattic/jetmon/internal/db" + "github.com/Automattic/jetmon/internal/eventstore" "github.com/Automattic/jetmon/internal/metrics" "github.com/Automattic/jetmon/internal/veriflier" "github.com/Automattic/jetmon/internal/wpcom" ) +// v1 site_status values projected onto jetpack_monitor_sites.site_status from +// the event-sourced state. These remain unchanged for back-compat with v1 +// consumers; the orchestrator writes them in the same transaction as every +// event mutation. const ( - statusRunning = 1 - statusConfirmedDown = 2 + statusDown = 0 // Seems Down event open (local failures, retry/verification in progress) + statusRunning = 1 // No active event + statusConfirmedDown = 2 // Down event (verifier-confirmed) ) +// checkTypeHTTP is the canonical check_type for the v1 HTTP probe path. New +// check types (DNS, TLS expiry, keyword, redirect, etc.) get their own +// constants alongside. 
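+// Because check_type is part of the event identity, one blog can have an open
+// "http" outage event and an open "tls_expiry" warning event at the same time
+// without the two interfering (see openSeemsDown vs. openOrUpdateSSLExpiry).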
+const ( + checkTypeHTTP = "http" + checkTypeTLSExpiry = "tls_expiry" +) + +// verifierRPCHeadroom is added to the per-site check timeout when computing +// the RPC deadline for a verifier call. The verifier needs enough budget to +// run its own HTTP check (matches site timeout) plus serialization, queueing, +// and network round-trip — 5s covers a comfortable steady-state and forces +// failure on a truly wedged verifier rather than letting the call hang. +const verifierRPCHeadroom = 5 * time.Second + var ( - nowFunc = time.Now - dbClaimBuckets = db.ClaimBuckets - dbHeartbeat = db.Heartbeat - dbReleaseHost = db.ReleaseHost - dbMarkHostDraining = db.MarkHostDraining - dbGetSitesForBucket = db.GetSitesForBucket - dbMarkSiteChecked = db.MarkSiteChecked - dbRecordCheckHistory = db.RecordCheckHistory - dbUpdateSSLExpiry = db.UpdateSSLExpiry - dbUpdateSiteStatus = db.UpdateSiteStatus - dbRecordFalsePositive = db.RecordFalsePositive - dbUpdateLastAlertSent = db.UpdateLastAlertSent - veriflierCheckFunc = func(c *veriflier.VeriflierClient, ctx stdctx.Context, req veriflier.CheckRequest) (*veriflier.CheckResult, error) { + nowFunc = time.Now + dbClaimBuckets = db.ClaimBuckets + dbHeartbeat = db.Heartbeat + dbReleaseHost = db.ReleaseHost + dbMarkHostDraining = db.MarkHostDraining + dbGetSitesForBucket = db.GetSitesForBucket + dbMarkSiteChecked = db.MarkSiteChecked + dbRecordCheckHistory = db.RecordCheckHistory + dbUpdateSSLExpiry = db.UpdateSSLExpiry + dbUpdateSiteStatus = db.UpdateSiteStatus + dbRecordFalsePositive = db.RecordFalsePositive + dbUpdateLastAlertSent = db.UpdateLastAlertSent + dbCountProjectionDrift = db.CountLegacyProjectionDrift + veriflierCheckFunc = func(c *veriflier.VeriflierClient, ctx stdctx.Context, req veriflier.CheckRequest) (*veriflier.CheckResult, error) { return c.Check(ctx, req) } + metricsClientFunc = func() metricsClient { + if m := metrics.Global(); m != nil { + return m + } + return nil + } wpcomNotifyFunc = func(c *wpcom.Client, n wpcom.Notification) error { return c.Notify(n) } currentMemoryMBFunc = currentMemoryMB ) +type metricsClient interface { + Increment(stat string, value int) + Gauge(stat string, value int) + Timing(stat string, d time.Duration) + EmitMemStats() +} + // Orchestrator drives the main check loop. type Orchestrator struct { pool *checker.Pool retries *retryQueue wpcom *wpcom.Client + events *eventstore.Store veriflierClients []*veriflier.VeriflierClient veriflierAddrs []string // parallel slice of "addr|token" for change detection veriflierMu sync.RWMutex @@ -70,6 +109,7 @@ func New(cfg *config.Config, wp *wpcom.Client) *Orchestrator { pool: pool, retries: newRetryQueue(), wpcom: wp, + events: eventstore.New(db.DB()), hostname: db.Hostname(), ctx: ctx, cancel: cancel, @@ -83,9 +123,28 @@ func New(cfg *config.Config, wp *wpcom.Client) *Orchestrator { return o } +// ev returns a non-nil event store. Tests that construct &Orchestrator{} +// directly without setting events get a no-op store backed by a nil DB so +// event-mutation paths run without panicking. Production always wires up a +// real Store in New(). +func (o *Orchestrator) ev() *eventstore.Store { + if o.events == nil { + return eventstore.New(nil) + } + return o.events +} + // ClaimBuckets registers this host in jetmon_hosts and sets the bucket range. 
func (o *Orchestrator) ClaimBuckets() error { cfg := config.Get() + if min, max, ok := cfg.PinnedBucketRange(); ok { + if o.bucketMin != min || o.bucketMax != max { + log.Printf("orchestrator: using pinned buckets %d-%d (dynamic bucket ownership disabled)", min, max) + } + o.bucketMin = min + o.bucketMax = max + return nil + } min, max, err := dbClaimBuckets( o.hostname, cfg.BucketTotal, @@ -108,11 +167,15 @@ func (o *Orchestrator) Run() { select { case <-o.ctx.Done(): log.Println("orchestrator: shutting down") - if err := dbMarkHostDraining(stdctx.Background(), o.hostname); err != nil { - log.Printf("orchestrator: mark draining: %v", err) + if !o.usesPinnedBuckets(config.Get()) { + if err := dbMarkHostDraining(stdctx.Background(), o.hostname); err != nil { + log.Printf("orchestrator: mark draining: %v", err) + } } o.pool.Drain() - if err := dbReleaseHost(stdctx.Background(), o.hostname); err != nil { + if o.usesPinnedBuckets(config.Get()) { + log.Println("orchestrator: pinned bucket mode active; no jetmon_hosts row to release") + } else if err := dbReleaseHost(stdctx.Background(), o.hostname); err != nil { log.Printf("orchestrator: release host: %v", err) } return @@ -145,15 +208,22 @@ func (o *Orchestrator) Stop() { func (o *Orchestrator) runRound() { cfg := config.Get() - // Update heartbeat. - if err := dbHeartbeat(o.ctx, o.hostname); err != nil { - log.Printf("orchestrator: heartbeat failed: %v", err) - } - // Re-claim every round so bucket ranges rebalance automatically when hosts - // join or leave the cluster. - if err := o.ClaimBuckets(); err != nil { - log.Printf("orchestrator: bucket rebalance failed: %v", err) + if o.usesPinnedBuckets(cfg) { + if err := o.ClaimBuckets(); err != nil { + log.Printf("orchestrator: pinned bucket claim failed: %v", err) + } + } else { + // Update heartbeat. + if err := dbHeartbeat(o.ctx, o.hostname); err != nil { + log.Printf("orchestrator: heartbeat failed: %v", err) + } + // Re-claim every round so bucket ranges rebalance automatically when + // hosts join or leave the cluster. + if err := o.ClaimBuckets(); err != nil { + log.Printf("orchestrator: bucket rebalance failed: %v", err) + } } + o.checkLegacyProjectionDrift(cfg) // Fetch sites. sites, err := dbGetSitesForBucket(o.ctx, o.bucketMin, o.bucketMax, cfg.DatasetSize, cfg.UseVariableCheckIntervals) @@ -223,7 +293,7 @@ process: // Emit metrics and update stats files. roundDuration := time.Since(o.roundStart) - m := metrics.Global() + m := metricsClientFunc() if m != nil { m.Timing("round.complete.time", roundDuration) m.Gauge("worker.queue.active", o.pool.ActiveCount()) @@ -278,8 +348,8 @@ func (o *Orchestrator) processResults(results map[int64]checker.Result, sites ma o.checkSSLAlerts(site, *res.SSLExpiry) } - o.auditLog(blogID, audit.EventCheck, o.hostname, - res.HTTPCode, res.ErrorCode, res.RTT.Milliseconds(), "") + // Per-check data is recorded in jetmon_check_history (above); duplicating + // it in jetmon_audit_log was retired with the operational/site-state split. 
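+	// The audit log now carries only operational events for the blog (retry
+	// dispatches, veriflier traffic, WPCOM notification attempts, and
+	// maintenance/cooldown suppressions); per-check telemetry stays in
+	// jetmon_check_history.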
if !res.IsFailure() { o.handleRecovery(site, res) @@ -295,19 +365,36 @@ func (o *Orchestrator) handleRecovery(site db.Site, res checker.Result) { return // was already up, nothing to do } + knownEventID := int64(0) + if entry != nil { + knownEventID = entry.eventID + } o.retries.clear(site.BlogID) if site.SiteStatus != statusRunning { changeTime := nowFunc().UTC() log.Printf("orchestrator: blog_id=%d recovered", site.BlogID) - o.auditTransition(site.BlogID, site.SiteStatus, statusRunning, "site recovered") + if entry != nil && site.SiteStatus == statusDown { + emitCounter("detection.probe_cleared.count", 1) + emitCounter("detection.probe_cleared."+failureClass(entry.lastResult)+".count", 1) + emitTimingSince("detection.seems_down_to_probe_cleared.time", entry.firstFailAt, changeTime) + } - if config.Get().DBUpdatesEnable { - _ = dbUpdateSiteStatus(o.ctx, site.BlogID, statusRunning, changeTime) + // Close the open event and project site_status back to running in the + // same transaction. The resolution reason depends on whether the event + // was already verifier-confirmed (Down) or still in the local-retry + // phase (Seems Down). + if err := o.closeRecoveredEvent(site.BlogID, knownEventID, changeTime); err != nil { + log.Printf("orchestrator: close recovered event blog_id=%d: %v", site.BlogID, err) } if inMaintenance(site) { - o.auditLog(site.BlogID, audit.EventMaintenanceActive, "local", 0, 0, 0, "recovery suppressed during maintenance") + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventMaintenanceActive, + Source: "local", + Detail: "recovery suppressed during maintenance", + }) } else if !o.isAlertSuppressed(site) { o.sendNotification(site, res, statusRunning, changeTime, nil) } @@ -316,11 +403,44 @@ func (o *Orchestrator) handleRecovery(site db.Site, res checker.Result) { func (o *Orchestrator) handleFailure(site db.Site, res checker.Result) { entry := o.retries.record(res) + class := failureClass(res) + emitCounter("detection.failure."+class+".count", 1) + + // Open a Seems Down event on the first failure we don't already have an + // id for. The schema's idempotent dedup_key means re-detecting the same + // failure would update the same row, so this is also a self-healing retry + // path if a previous Open failed to commit. 
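+	// With NumOfChecks = 3 (the value the tests use) this plays out as:
+	// failure #1 opens the Seems Down event and logs "retry 1 of 3",
+	// failure #2 logs "retry 2 of 3", and failure #3 skips the retry log and
+	// falls through to escalateToVerifliers.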
+ if entry.eventID == 0 { + id, err := o.openSeemsDown(site, res) + if err != nil { + log.Printf("orchestrator: open seems-down event blog_id=%d: %v", site.BlogID, err) + } else { + entry.eventID = id + if entry.failCount == 1 { + emitCounter("detection.seems_down.open.count", 1) + emitCounter("detection.seems_down.open."+class+".count", 1) + emitTimingSince("detection.first_failure_to_seems_down.time", entry.firstFailAt, nowFunc().UTC()) + } + } + } if entry.failCount < config.Get().NumOfChecks { - o.auditLog(site.BlogID, audit.EventRetryDispatched, o.hostname, - res.HTTPCode, res.ErrorCode, res.RTT.Milliseconds(), - fmt.Sprintf("retry %d of %d", entry.failCount, config.Get().NumOfChecks)) + meta, _ := json.Marshal(map[string]any{ + "http_code": res.HTTPCode, + "error_code": res.ErrorCode, + "rtt_ms": res.RTT.Milliseconds(), + "attempt": entry.failCount, + "of": config.Get().NumOfChecks, + "event_id": entry.eventID, + }) + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventID: entry.eventID, + EventType: audit.EventRetryDispatched, + Source: o.hostname, + Detail: fmt.Sprintf("retry %d of %d", entry.failCount, config.Get().NumOfChecks), + Metadata: meta, + }) return } @@ -330,14 +450,14 @@ func (o *Orchestrator) handleFailure(site db.Site, res checker.Result) { func (o *Orchestrator) escalateToVerifliers(site db.Site, entry *retryEntry) { clients := o.veriflierSnapshot() + emitCounter("detection.verifier.escalation.count", 1) + emitTimingSince("detection.first_failure_to_verification.time", entry.firstFailAt, nowFunc().UTC()) if len(clients) == 0 { + emitCounter("detection.verifier.no_clients.count", 1) o.confirmDown(site, entry, nil) return } - o.auditLog(site.BlogID, audit.EventVeriflierSent, o.hostname, 0, 0, 0, - fmt.Sprintf("escalating to %d verifliers", len(clients))) - req := veriflier.CheckRequest{ BlogID: site.BlogID, URL: site.MonitorURL, @@ -345,20 +465,44 @@ func (o *Orchestrator) escalateToVerifliers(site db.Site, entry *retryEntry) { Keyword: stringPtrValue(site.CheckKeyword), CustomHeaders: checker.ParseCustomHeaders(site.CustomHeaders), RedirectPolicy: site.RedirectPolicy, + RequestID: veriflier.NewRequestID(), } + escalateMeta, _ := json.Marshal(map[string]any{ + "verifier_count": len(clients), + "request_id": req.RequestID, + }) + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventVeriflierSent, + Source: o.hostname, + Detail: fmt.Sprintf("escalating to %d verifliers", len(clients)), + Metadata: escalateMeta, + }) + + // Per-RPC deadline: site's check budget plus headroom for the verifier's + // own HTTP work, server queueing, and network. Without this the dial / + // read can hang for o.ctx's lifetime (effectively forever) on a wedged + // verifier — the old hardcoded 30s client.Timeout was the only bound and + // has been removed in favor of this caller-controlled deadline. 
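+	// For example, a site whose timeoutForSite value is 10 (whole seconds, as
+	// the expression below treats it) gets a 10s + 5s = 15s deadline that is
+	// shared by every verifier RPC in this fan-out.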
+ rpcDeadline := time.Duration(timeoutForSite(config.Get(), site))*time.Second + verifierRPCHeadroom + rpcCtx, rpcCancel := stdctx.WithTimeout(o.ctx, rpcDeadline) + defer rpcCancel() + type vResult struct { - host string - res *veriflier.CheckResult - err error + host string + duration time.Duration + res *veriflier.CheckResult + err error } ch := make(chan vResult, len(clients)) for _, client := range clients { c := client go func() { - res, err := veriflierCheckFunc(c, o.ctx, req) - ch <- vResult{host: c.Addr(), res: res, err: err} + start := nowFunc() + res, err := veriflierCheckFunc(c, rpcCtx, req) + ch <- vResult{host: c.Addr(), duration: nowFunc().Sub(start), res: res, err: err} }() } @@ -368,16 +512,44 @@ func (o *Orchestrator) escalateToVerifliers(site db.Site, entry *retryEntry) { for range clients { vr := <-ch + emitTiming("verifier.rpc.duration", vr.duration) + hostSegment := metricSegment(vr.host) + emitTiming("verifier.host."+hostSegment+".rpc.duration", vr.duration) if vr.err != nil { + emitCounter("verifier.rpc.error.count", 1) + emitCounter("verifier.host."+hostSegment+".rpc.error.count", 1) log.Printf("orchestrator: veriflier %s error: %v", vr.host, vr.err) continue } + emitCounter("verifier.rpc.success.count", 1) + emitCounter("verifier.host."+hostSegment+".rpc.success.count", 1) healthyVerifliers++ - o.auditLog(site.BlogID, audit.EventVeriflierResult, vr.host, - int(vr.res.HTTPCode), int(vr.res.ErrorCode), vr.res.RTTMs, "") + // Verifier reply is operational telemetry — recorded under + // EventVeriflierSent with the response in metadata. The site-state + // outcome (confirm or false alarm) is captured separately, ultimately + // as a transition row in jetmon_event_transitions. + meta, _ := json.Marshal(map[string]any{ + "http_code": vr.res.HTTPCode, + "error_code": vr.res.ErrorCode, + "rtt_ms": vr.res.RTTMs, + "success": vr.res.Success, + "request_id": vr.res.RequestID, + }) + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventVeriflierSent, + Source: vr.host, + Detail: "veriflier reply", + Metadata: meta, + }) vResults = append(vResults, *vr.res) if !vr.res.Success { + emitCounter("verifier.vote.confirm_down.count", 1) + emitCounter("verifier.host."+hostSegment+".vote.confirm_down.count", 1) confirmations++ + } else { + emitCounter("verifier.vote.disagree.count", 1) + emitCounter("verifier.host."+hostSegment+".vote.disagree.count", 1) } } @@ -389,14 +561,36 @@ func (o *Orchestrator) escalateToVerifliers(site db.Site, entry *retryEntry) { if quorum < 1 { quorum = 1 } + emitGauge("detection.verifier.healthy.count", healthyVerifliers) + emitGauge("detection.verifier.confirmations.count", confirmations) + emitGauge("detection.verifier.quorum.count", quorum) if confirmations >= quorum { + emitCounter("detection.verifier.quorum_met.count", 1) o.confirmDown(site, entry, vResults) } else { - // Verifliers did not confirm — false positive. + // Verifliers did not confirm — false positive. Close the Seems Down + // event with reason=false_alarm and reset site_status in the same tx. 
log.Printf("orchestrator: blog_id=%d verifliers did not confirm down (%d/%d)", site.BlogID, confirmations, quorum) + emitCounter("detection.verifier.false_alarm.count", 1) + emitCounter("detection.verifier.false_alarm."+failureClass(entry.lastResult)+".count", 1) + emitTimingSince("detection.seems_down_to_false_alarm.time", entry.firstFailAt, nowFunc().UTC()) _ = dbRecordFalsePositive(site.BlogID, entry.lastResult.HTTPCode, entry.lastResult.ErrorCode, entry.lastResult.RTT.Milliseconds()) + + if entry.eventID > 0 { + meta, _ := json.Marshal(map[string]any{ + "verifier_quorum": quorum, + "verifier_healthy": healthyVerifliers, + "verifier_disagreed": healthyVerifliers - confirmations, + "verifier_confirmed": confirmations, + }) + if err := o.closeEvent(site.BlogID, entry.eventID, + eventstore.ReasonFalseAlarm, statusRunning, nowFunc().UTC(), meta); err != nil { + log.Printf("orchestrator: close false-alarm event blog_id=%d event_id=%d: %v", + site.BlogID, entry.eventID, err) + } + } o.retries.clear(site.BlogID) } } @@ -404,20 +598,44 @@ func (o *Orchestrator) escalateToVerifliers(site db.Site, entry *retryEntry) { func (o *Orchestrator) confirmDown(site db.Site, entry *retryEntry, vResults []veriflier.CheckResult) { newStatus := statusConfirmedDown changeTime := nowFunc().UTC() + emitCounter("detection.down.confirmed.count", 1) + emitCounter("detection.down.confirmed."+failureClass(entry.lastResult)+".count", 1) + emitTimingSince("detection.seems_down_to_down.time", entry.firstFailAt, changeTime) log.Printf("orchestrator: blog_id=%d confirmed down", site.BlogID) - o.auditTransition(site.BlogID, site.SiteStatus, newStatus, "confirmed down") - if config.Get().DBUpdatesEnable { + // Promote the open Seems Down event to Down with reason=verifier_confirmed + // and project site_status=SITE_CONFIRMED_DOWN in the same tx. If we have no + // event id (open failed earlier or eventstore unavailable), fall back to + // the bare projection write. 
+ if entry.eventID > 0 { + meta, _ := json.Marshal(map[string]any{ + "verifier_results": summarizeVerifierResults(vResults), + "verifier_confirmed": len(vResults), + }) + if err := o.promoteToDown(site.BlogID, entry.eventID, changeTime, meta); err != nil { + log.Printf("orchestrator: promote event blog_id=%d event_id=%d: %v", site.BlogID, entry.eventID, err) + } + } else if config.LegacyStatusProjectionEnabled() { _ = dbUpdateSiteStatus(o.ctx, site.BlogID, newStatus, changeTime) } if inMaintenance(site) { - o.auditLog(site.BlogID, audit.EventMaintenanceActive, "local", 0, 0, 0, "downtime suppressed during maintenance") + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventMaintenanceActive, + Source: "local", + Detail: "downtime suppressed during maintenance", + }) } else if !o.isAlertSuppressed(site) { o.sendNotification(site, entry.lastResult, newStatus, changeTime, vResults) } else { - o.auditLog(site.BlogID, audit.EventAlertSuppressed, "local", 0, 0, 0, "cooldown active") + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventAlertSuppressed, + Source: "local", + Detail: "cooldown active", + }) } o.retries.clear(site.BlogID) @@ -453,34 +671,152 @@ func (o *Orchestrator) sendNotification(site db.Site, res checker.Result, status Checks: checks, } - o.auditLog(site.BlogID, audit.EventWPCOMSent, "local", 0, 0, 0, - fmt.Sprintf("status=%d type=%s", status, n.StatusType)) + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventWPCOMSent, + Source: "local", + Detail: fmt.Sprintf("status=%d type=%s", status, n.StatusType), + }) + wpcomStatus := wpcomStatusMetricSegment(status) + emitCounter("wpcom.notification.attempt.count", 1) + emitCounter("wpcom.notification.status."+wpcomStatus+".attempt.count", 1) if err := wpcomNotifyFunc(o.wpcom, n); err != nil { + emitCounter("wpcom.notification.error.count", 1) + emitCounter("wpcom.notification.status."+wpcomStatus+".error.count", 1) + emitCounter("wpcom.notification.retry.count", 1) log.Printf("orchestrator: wpcom notify failed for blog_id=%d: %v", site.BlogID, err) - o.auditLog(site.BlogID, audit.EventWPCOMRetry, "local", 0, 0, 0, err.Error()) + o.auditLog(audit.Entry{ + BlogID: site.BlogID, + EventType: audit.EventWPCOMRetry, + Source: "local", + Detail: err.Error(), + }) // Single retry. if retryErr := wpcomNotifyFunc(o.wpcom, n); retryErr != nil { + emitCounter("wpcom.notification.error.count", 1) + emitCounter("wpcom.notification.status."+wpcomStatus+".error.count", 1) + emitCounter("wpcom.notification.failed.count", 1) + emitCounter("wpcom.notification.status."+wpcomStatus+".failed.count", 1) log.Printf("orchestrator: wpcom notify retry failed for blog_id=%d: %v", site.BlogID, retryErr) return } + emitCounter("wpcom.notification.retry.delivered.count", 1) } + emitCounter("wpcom.notification.delivered.count", 1) + emitCounter("wpcom.notification.status."+wpcomStatus+".delivered.count", 1) if err := dbUpdateLastAlertSent(o.ctx, site.BlogID, nowFunc().UTC()); err != nil { log.Printf("orchestrator: update last alert sent blog_id=%d: %v", site.BlogID, err) } } +// checkSSLAlerts manages a site-level tls_expiry event that tracks the cert's +// remaining lifetime. The event is opened idempotently — once it's open, every +// HTTPS check is a no-op on the events table unless the threshold (and thus +// severity) changes. The event closes when the cert is renewed beyond the +// outermost threshold. 
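+// For example, a cert first seen 20 days from expiry opens a Warning-severity
+// tls_expiry event; if it is still unrenewed at 7 days the same event is
+// escalated to Degraded, and once it is renewed past 30 days the event closes.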
+//
+// Severity ladder:
+//   - <= 7 days  → Degraded (severity 2)
+//   - <= 14 days → Warning (severity 1)
+//   - <= 30 days → Warning (severity 1)
+//   - > 30 days  → close any open event with reason=verifier_cleared
 func (o *Orchestrator) checkSSLAlerts(site db.Site, expiry time.Time) {
-	thresholds := []int{30, 14, 7}
 	daysUntil := int(time.Until(expiry).Hours() / 24)
-	for _, t := range thresholds {
-		if daysUntil == t {
-			log.Printf("orchestrator: blog_id=%d SSL cert expires in %d days", site.BlogID, daysUntil)
-			o.auditLog(site.BlogID, audit.EventCheck, "local", 0, checker.ErrorTLSExpired, 0,
-				fmt.Sprintf("ssl certificate expires in %d days", daysUntil))
+
+	const (
+		warnDays     = 30
+		degradedDays = 7
+	)
+
+	if daysUntil > warnDays {
+		// Cert is healthy. Close any pre-existing tls_expiry event for this site.
+		if err := o.closeSSLExpiryIfOpen(site.BlogID); err != nil {
+			log.Printf("orchestrator: close tls_expiry event blog_id=%d: %v", site.BlogID, err)
+		}
+		return
+	}
+
+	severity := eventstore.SeverityWarning
+	state := eventstore.StateWarning
+	if daysUntil <= degradedDays {
+		severity = eventstore.SeverityDegraded
+		state = eventstore.StateDegraded
+	}
+
+	meta, _ := json.Marshal(map[string]any{
+		"days_until": daysUntil,
+		"expires_at": expiry.UTC().Format(time.RFC3339),
+	})
+
+	if err := o.openOrUpdateSSLExpiry(site.BlogID, severity, state, daysUntil, meta); err != nil {
+		log.Printf("orchestrator: tls_expiry event blog_id=%d days=%d: %v", site.BlogID, daysUntil, err)
+		return
+	}
+	log.Printf("orchestrator: blog_id=%d SSL cert expires in %d days (severity %d)", site.BlogID, daysUntil, severity)
+}
+
+// openOrUpdateSSLExpiry opens a tls_expiry event for the site if none exists,
+// or escalates / de-escalates the existing event's severity if a threshold has
+// been crossed. site_status is intentionally not projected — TLS expiry
+// warnings don't affect the Up/Down state of the site (a certificate-hygiene
+// warning, not an availability outage).
+func (o *Orchestrator) openOrUpdateSSLExpiry(blogID int64, severity uint8, state string, daysUntil int, meta json.RawMessage) error {
+	tx, err := o.ev().Begin(o.ctx)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = tx.Rollback() }()
+
+	out, err := tx.Open(o.ctx, eventstore.OpenInput{
+		Identity: eventstore.Identity{BlogID: blogID, CheckType: checkTypeTLSExpiry},
+		Severity: severity,
+		State:    state,
+		Source:   o.hostname,
+		Metadata: meta,
+	})
+	if err != nil {
+		return fmt.Errorf("open tls_expiry: %w", err)
+	}
+
+	// If the event already existed and its severity differs from the new
+	// threshold, escalate (or de-escalate) with a transition row recording why.
+	if !out.Opened && out.CurrentSeverity != severity {
+		reason := eventstore.ReasonSeverityEscalation
+		if severity < out.CurrentSeverity {
+			reason = eventstore.ReasonSeverityDeescalation
+		}
+		if _, err := tx.Promote(o.ctx, out.EventID, severity, state, reason, o.hostname, meta); err != nil {
+			return fmt.Errorf("escalate tls_expiry: %w", err)
+		}
+	}
+	return tx.Commit()
+}
+
+// closeSSLExpiryIfOpen closes an open tls_expiry event for the site, if any.
+// No-op if no event exists.
+func (o *Orchestrator) closeSSLExpiryIfOpen(blogID int64) error { + tx, err := o.ev().Begin(o.ctx) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + + if tx.Tx() == nil { + return tx.Commit() + } + ae, err := tx.FindActiveByBlog(o.ctx, blogID, checkTypeTLSExpiry) + if err != nil { + if errors.Is(err, eventstore.ErrEventNotFound) { + return tx.Commit() } + return err } + if err := tx.Close(o.ctx, ae.ID, eventstore.ReasonVerifierCleared, o.hostname, nil); err != nil { + return fmt.Errorf("close tls_expiry: %w", err) + } + return tx.Commit() } func (o *Orchestrator) isAlertSuppressed(site db.Site) bool { @@ -498,6 +834,23 @@ func (o *Orchestrator) isAlertSuppressed(site db.Site) bool { return time.Since(*site.LastAlertSentAt) < time.Duration(cooldown)*time.Minute } +func (o *Orchestrator) checkLegacyProjectionDrift(cfg *config.Config) { + if !cfg.LegacyStatusProjectionEnable { + return + } + count, err := dbCountProjectionDrift(o.ctx, o.bucketMin, o.bucketMax) + if err != nil { + log.Printf("orchestrator: legacy projection drift check failed: %v", err) + emitCounter("projection.drift.check_error.count", 1) + return + } + emitGauge("projection.drift.count", count) + if count > 0 { + log.Printf("orchestrator: WARN legacy projection drift detected count=%d buckets=%d-%d", count, o.bucketMin, o.bucketMax) + emitCounter("projection.drift.detected.count", 1) + } +} + // RetryQueueSize returns the number of sites currently in local retry. func (o *Orchestrator) RetryQueueSize() int { return o.retries.size() @@ -508,6 +861,11 @@ func (o *Orchestrator) BucketRange() (int, int) { return o.bucketMin, o.bucketMax } +func (o *Orchestrator) usesPinnedBuckets(cfg *config.Config) bool { + _, _, ok := cfg.PinnedBucketRange() + return ok +} + // WorkerCount returns the live worker count. 
func (o *Orchestrator) WorkerCount() int { return o.pool.WorkerCount() @@ -523,18 +881,239 @@ func (o *Orchestrator) QueueDepth() int { return o.pool.QueueDepth() } -func (o *Orchestrator) auditLog(blogID int64, event, source string, httpCode, errorCode int, rttMs int64, detail string) { - if err := audit.Log(blogID, event, source, httpCode, errorCode, rttMs, detail); err != nil { - log.Printf("audit: blog_id=%d event=%s: %v", blogID, event, err) +func (o *Orchestrator) auditLog(e audit.Entry) { + if err := audit.Log(o.ctx, e); err != nil { + log.Printf("audit: blog_id=%d event=%s: %v", e.BlogID, e.EventType, err) + } +} + +func emitCounter(stat string, value int) { + if m := metricsClientFunc(); m != nil { + m.Increment(stat, value) } } -func (o *Orchestrator) auditTransition(blogID int64, from, to int, detail string) { - if err := audit.LogTransition(blogID, from, to, detail); err != nil { - log.Printf("audit: blog_id=%d transition %d->%d: %v", blogID, from, to, err) +func emitGauge(stat string, value int) { + if m := metricsClientFunc(); m != nil { + m.Gauge(stat, value) } } +func emitTiming(stat string, d time.Duration) { + if d < 0 { + return + } + if m := metricsClientFunc(); m != nil { + m.Timing(stat, d) + } +} + +func emitTimingSince(stat string, start, end time.Time) { + if start.IsZero() || end.IsZero() { + return + } + emitTiming(stat, end.Sub(start)) +} + +func failureClass(res checker.Result) string { + return metricSegment((&res).StatusType()) +} + +func metricSegment(s string) string { + s = strings.ToLower(strings.TrimSpace(s)) + if s == "" { + return "unknown" + } + + var b strings.Builder + lastUnderscore := false + for _, r := range s { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') { + b.WriteRune(r) + lastUnderscore = false + continue + } + if !lastUnderscore { + b.WriteByte('_') + lastUnderscore = true + } + } + + out := strings.Trim(b.String(), "_") + if out == "" { + return "unknown" + } + return out +} + +// openSeemsDown opens (or re-detects) a Seems Down event for an HTTP-failing +// site and projects v1 site_status=SITE_DOWN in the same transaction. Returns +// the event id. Idempotent: a re-detection of the same identity returns the +// existing event's id with no transition row written and no projection update. +func (o *Orchestrator) openSeemsDown(site db.Site, res checker.Result) (int64, error) { + tx, err := o.ev().Begin(o.ctx) + if err != nil { + return 0, err + } + defer func() { _ = tx.Rollback() }() + + meta, _ := json.Marshal(map[string]any{ + "http_code": res.HTTPCode, + "error_code": res.ErrorCode, + "rtt_ms": res.RTT.Milliseconds(), + "url": site.MonitorURL, + }) + + out, err := tx.Open(o.ctx, eventstore.OpenInput{ + Identity: eventstore.Identity{BlogID: site.BlogID, CheckType: checkTypeHTTP}, + Severity: eventstore.SeveritySeemsDown, + State: eventstore.StateSeemsDown, + Source: o.hostname, + Metadata: meta, + }) + if err != nil { + return 0, err + } + + // Project v1 site_status=SITE_DOWN only on the actual insert. A re-detection + // (Opened=false) is by definition a row that already exists, so site_status + // was already projected when the event first opened. 
+ if out.Opened && config.LegacyStatusProjectionEnabled() && tx.Tx() != nil { + if err := db.UpdateSiteStatusTx(o.ctx, tx.Tx(), site.BlogID, statusDown, nowFunc().UTC()); err != nil { + return 0, fmt.Errorf("project site_status: %w", err) + } + } + + if err := tx.Commit(); err != nil { + return 0, fmt.Errorf("commit: %w", err) + } + return out.EventID, nil +} + +// promoteToDown bumps an open Seems Down event to Down (severity 4) and +// projects site_status=SITE_CONFIRMED_DOWN in the same transaction. +func (o *Orchestrator) promoteToDown(blogID, eventID int64, changeTime time.Time, meta json.RawMessage) error { + tx, err := o.ev().Begin(o.ctx) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.Promote(o.ctx, eventID, + eventstore.SeverityDown, eventstore.StateDown, + eventstore.ReasonVerifierConfirmed, o.hostname, meta); err != nil { + return fmt.Errorf("promote event: %w", err) + } + + if config.LegacyStatusProjectionEnabled() && tx.Tx() != nil { + if err := db.UpdateSiteStatusTx(o.ctx, tx.Tx(), blogID, statusConfirmedDown, changeTime); err != nil { + return fmt.Errorf("project site_status: %w", err) + } + } + return tx.Commit() +} + +// closeEvent closes an open event with the given resolution reason and projects +// site_status to the given v1 value in the same transaction. +func (o *Orchestrator) closeEvent(blogID, eventID int64, reason string, projectedStatus int, changeTime time.Time, meta json.RawMessage) error { + tx, err := o.ev().Begin(o.ctx) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + + if err := tx.Close(o.ctx, eventID, reason, o.hostname, meta); err != nil { + return fmt.Errorf("close event: %w", err) + } + + if config.LegacyStatusProjectionEnabled() && tx.Tx() != nil { + if err := db.UpdateSiteStatusTx(o.ctx, tx.Tx(), blogID, projectedStatus, changeTime); err != nil { + return fmt.Errorf("project site_status: %w", err) + } + } + return tx.Commit() +} + +// closeRecoveredEvent closes the open event for a recovering site. Picks +// resolution reason from the event's current state — Seems Down → probe_cleared, +// Down → verifier_cleared. If the caller already knows the event id (from the +// retry entry) it is used directly; otherwise the active event is looked up +// inside the transaction. site_status is projected back to SITE_RUNNING in the +// same tx. +func (o *Orchestrator) closeRecoveredEvent(blogID, knownEventID int64, changeTime time.Time) error { + tx, err := o.ev().Begin(o.ctx) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + + // Determine event id and current state. If knownEventID is set, read state + // directly; otherwise look up the active event for this blog. + var eventID int64 + var state string + switch { + case knownEventID > 0 && tx.Tx() != nil: + eventID = knownEventID + if err := tx.Tx().QueryRowContext(o.ctx, + `SELECT state FROM jetmon_events WHERE id = ?`, eventID, + ).Scan(&state); err != nil { + return fmt.Errorf("read event state: %w", err) + } + case tx.Tx() != nil: + ae, err := tx.FindActiveByBlog(o.ctx, blogID, checkTypeHTTP) + if err != nil { + if errors.Is(err, eventstore.ErrEventNotFound) { + // site_status disagreed with the event store (no open event but + // projection said non-running). Just project back to running. 
+ if config.LegacyStatusProjectionEnabled() { + if err := db.UpdateSiteStatusTx(o.ctx, tx.Tx(), blogID, statusRunning, changeTime); err != nil { + return fmt.Errorf("project site_status: %w", err) + } + } + return tx.Commit() + } + return err + } + eventID = ae.ID + state = ae.State + default: + // nil-mode (no DB): nothing to do. + return tx.Commit() + } + + reason := eventstore.ReasonProbeCleared + if state == eventstore.StateDown { + reason = eventstore.ReasonVerifierCleared + } + + if err := tx.Close(o.ctx, eventID, reason, o.hostname, nil); err != nil { + return fmt.Errorf("close event: %w", err) + } + if config.LegacyStatusProjectionEnabled() && tx.Tx() != nil { + if err := db.UpdateSiteStatusTx(o.ctx, tx.Tx(), blogID, statusRunning, changeTime); err != nil { + return fmt.Errorf("project site_status: %w", err) + } + } + return tx.Commit() +} + +// summarizeVerifierResults extracts a small JSON-friendly summary of verifier +// replies for storage in transition metadata. We don't store the full result +// list — the per-RPC details are already in jetmon_audit_log under +// EventVeriflierSent. +func summarizeVerifierResults(vResults []veriflier.CheckResult) []map[string]any { + out := make([]map[string]any, 0, len(vResults)) + for _, vr := range vResults { + out = append(out, map[string]any{ + "host": vr.Host, + "success": vr.Success, + "http_code": vr.HTTPCode, + "rtt_ms": vr.RTTMs, + }) + } + return out +} + func inMaintenance(site db.Site) bool { now := time.Now() if site.MaintenanceStart == nil || site.MaintenanceEnd == nil { @@ -550,10 +1129,23 @@ func statusFromBool(success bool) int { return 0 } +func wpcomStatusMetricSegment(status int) string { + switch status { + case statusDown: + return "down" + case statusRunning: + return "running" + case statusConfirmedDown: + return "confirmed_down" + default: + return "unknown" + } +} + func (o *Orchestrator) refreshVeriflierClients(cfg *config.Config) { newAddrs := make([]string, 0, len(cfg.Verifiers)) for _, v := range cfg.Verifiers { - newAddrs = append(newAddrs, fmt.Sprintf("%s:%s|%s", v.Host, v.GRPCPort, v.AuthToken)) + newAddrs = append(newAddrs, fmt.Sprintf("%s:%s|%s", v.Host, v.TransportPort(), v.AuthToken)) } o.veriflierMu.RLock() @@ -565,7 +1157,7 @@ func (o *Orchestrator) refreshVeriflierClients(cfg *config.Config) { clients := make([]*veriflier.VeriflierClient, 0, len(cfg.Verifiers)) for _, v := range cfg.Verifiers { - addr := fmt.Sprintf("%s:%s", v.Host, v.GRPCPort) + addr := fmt.Sprintf("%s:%s", v.Host, v.TransportPort()) clients = append(clients, veriflier.NewVeriflierClient(addr, v.AuthToken)) } o.veriflierMu.Lock() diff --git a/internal/orchestrator/orchestrator_test.go b/internal/orchestrator/orchestrator_test.go index 7afcdbcb..92199882 100644 --- a/internal/orchestrator/orchestrator_test.go +++ b/internal/orchestrator/orchestrator_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "sync" + "sync/atomic" "testing" "time" @@ -74,6 +75,23 @@ func TestInMaintenance(t *testing.T) { } } +func TestSummarizeVerifierResults(t *testing.T) { + got := summarizeVerifierResults([]veriflier.CheckResult{ + {Host: "us-west", Success: false, HTTPCode: 500, RTTMs: 123}, + {Host: "eu", Success: true, HTTPCode: 200, RTTMs: 45}, + }) + if len(got) != 2 { + t.Fatalf("len = %d, want 2", len(got)) + } + if got[0]["host"] != "us-west" || got[0]["success"] != false || + got[0]["http_code"] != int32(500) || got[0]["rtt_ms"] != int64(123) { + t.Fatalf("first summary = %+v", got[0]) + } + if got[1]["host"] != "eu" || got[1]["success"] != true { + 
t.Fatalf("second summary = %+v", got[1]) + } +} + func TestSlicesEqual(t *testing.T) { if !slicesEqual(nil, nil) { t.Fatal("nil slices should be equal") @@ -92,8 +110,8 @@ func TestSlicesEqual(t *testing.T) { func TestRefreshVeriflierClientsReusesUnchangedClients(t *testing.T) { cfg := &config.Config{ Verifiers: []config.VerifierConfig{ - {Name: "a", Host: "host1", GRPCPort: "7803", AuthToken: "token1"}, - {Name: "b", Host: "host2", GRPCPort: "7804", AuthToken: "token2"}, + {Name: "a", Host: "host1", Port: "7803", AuthToken: "token1"}, + {Name: "b", Host: "host2", Port: "7804", AuthToken: "token2"}, }, } @@ -112,7 +130,7 @@ func TestRefreshVeriflierClientsReusesUnchangedClients(t *testing.T) { func TestRefreshVeriflierClientsRebuildsChangedClients(t *testing.T) { cfg := &config.Config{ Verifiers: []config.VerifierConfig{ - {Name: "a", Host: "host1", GRPCPort: "7803", AuthToken: "token1"}, + {Name: "a", Host: "host1", Port: "7803", AuthToken: "token1"}, }, } @@ -121,7 +139,7 @@ func TestRefreshVeriflierClientsRebuildsChangedClients(t *testing.T) { updated := &config.Config{ Verifiers: []config.VerifierConfig{ - {Name: "a", Host: "host1", GRPCPort: "7803", AuthToken: "token2"}, + {Name: "a", Host: "host1", Port: "7803", AuthToken: "token2"}, }, } @@ -138,6 +156,9 @@ func TestSendNotificationRetriesAndUpdatesAlertTimestamp(t *testing.T) { setTestConfig(t) + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + var notifyCalls int wpcomNotifyFunc = func(_ *wpcom.Client, _ wpcom.Notification) error { notifyCalls++ @@ -168,6 +189,20 @@ func TestSendNotificationRetriesAndUpdatesAlertTimestamp(t *testing.T) { if updatedBlogID != 123 { t.Fatalf("updated blog_id = %d, want 123", updatedBlogID) } + for stat, want := range map[string]int{ + "wpcom.notification.attempt.count": 1, + "wpcom.notification.status.running.attempt.count": 1, + "wpcom.notification.error.count": 1, + "wpcom.notification.status.running.error.count": 1, + "wpcom.notification.retry.count": 1, + "wpcom.notification.retry.delivered.count": 1, + "wpcom.notification.delivered.count": 1, + "wpcom.notification.status.running.delivered.count": 1, + } { + if got := rec.counter(stat); got != want { + t.Fatalf("%s = %d, want %d", stat, got, want) + } + } } func TestConfirmDownSuppressedDuringCooldown(t *testing.T) { @@ -270,13 +305,17 @@ func TestEscalateToVerifliersRecordsFalsePositiveWhenQuorumMissed(t *testing.T) return nil } - call := 0 + // escalateToVerifliers fans the verifier RPC out across goroutines, so + // `call` is read+written concurrently. Use atomic so `go test -race` + // stays clean. The semantics — first verifier returns Success=false, + // subsequent ones return true — are unchanged. 
+ var call atomic.Int64 veriflierCheckFunc = func(c *veriflier.VeriflierClient, _ context.Context, req veriflier.CheckRequest) (*veriflier.CheckResult, error) { - call++ + n := call.Add(1) return &veriflier.CheckResult{ BlogID: req.BlogID, Host: c.Addr(), - Success: call != 1, + Success: n != 1, HTTPCode: 200, }, nil } @@ -307,22 +346,35 @@ func TestEscalateToVerifliersRecordsFalsePositiveWhenQuorumMissed(t *testing.T) func stubOrchestratorDeps() func() { origNow := nowFunc + origDBClaimBuckets := dbClaimBuckets + origDBHeartbeat := dbHeartbeat + origDBReleaseHost := dbReleaseHost + origDBMarkHostDraining := dbMarkHostDraining + origDBGetSites := dbGetSitesForBucket origDBUpdateStatus := dbUpdateSiteStatus origDBUpdateLastAlert := dbUpdateLastAlertSent origDBRecordFalsePositive := dbRecordFalsePositive origDBMarkSiteChecked := dbMarkSiteChecked origDBRecordCheckHistory := dbRecordCheckHistory origDBUpdateSSLExpiry := dbUpdateSSLExpiry + origDBCountProjectionDrift := dbCountProjectionDrift origNotify := wpcomNotifyFunc origVeriflierCheck := veriflierCheckFunc + origMetricsClient := metricsClientFunc nowFunc = time.Now + dbClaimBuckets = func(string, int, int, int) (int, int, error) { return 0, 0, nil } + dbHeartbeat = func(context.Context, string) error { return nil } + dbReleaseHost = func(context.Context, string) error { return nil } + dbMarkHostDraining = func(context.Context, string) error { return nil } + dbGetSitesForBucket = func(context.Context, int, int, int, bool) ([]db.Site, error) { return nil, nil } dbUpdateSiteStatus = func(context.Context, int64, int, time.Time) error { return nil } dbUpdateLastAlertSent = func(context.Context, int64, time.Time) error { return nil } dbRecordFalsePositive = func(int64, int, int, int64) error { return nil } dbMarkSiteChecked = func(context.Context, int64, time.Time) error { return nil } dbRecordCheckHistory = func(int64, int, int, int64, int64, int64, int64, int64) error { return nil } dbUpdateSSLExpiry = func(context.Context, int64, time.Time) error { return nil } + dbCountProjectionDrift = func(context.Context, int, int) (int, error) { return 0, nil } wpcomNotifyFunc = func(_ *wpcom.Client, _ wpcom.Notification) error { return nil } veriflierCheckFunc = func(c *veriflier.VeriflierClient, ctx context.Context, req veriflier.CheckRequest) (*veriflier.CheckResult, error) { return c.Check(ctx, req) @@ -330,14 +382,21 @@ func stubOrchestratorDeps() func() { return func() { nowFunc = origNow + dbClaimBuckets = origDBClaimBuckets + dbHeartbeat = origDBHeartbeat + dbReleaseHost = origDBReleaseHost + dbMarkHostDraining = origDBMarkHostDraining + dbGetSitesForBucket = origDBGetSites dbUpdateSiteStatus = origDBUpdateStatus dbUpdateLastAlertSent = origDBUpdateLastAlert dbRecordFalsePositive = origDBRecordFalsePositive dbMarkSiteChecked = origDBMarkSiteChecked dbRecordCheckHistory = origDBRecordCheckHistory dbUpdateSSLExpiry = origDBUpdateSSLExpiry + dbCountProjectionDrift = origDBCountProjectionDrift wpcomNotifyFunc = origNotify veriflierCheckFunc = origVeriflierCheck + metricsClientFunc = origMetricsClient } } @@ -355,7 +414,7 @@ func setTestConfig(t *testing.T) *config.Config { cfg.AlertCooldownMinutes = 30 cfg.NumOfChecks = 3 cfg.PeerOfflineLimit = 2 - cfg.DBUpdatesEnable = false + cfg.LegacyStatusProjectionEnable = false return cfg } @@ -455,6 +514,35 @@ func TestHandleRecoveryClearsRetryEntryEvenWhenAlreadyRunning(t *testing.T) { } } +func TestHandleRecoveryEmitsProbeClearedClassMetric(t *testing.T) { + restore := stubOrchestratorDeps() + defer 
restore() + setTestConfig(t) + + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + + o := &Orchestrator{ + retries: newRetryQueue(), + wpcom: &wpcom.Client{}, + hostname: "local", + ctx: context.Background(), + } + o.retries.record(checkerResultFailure(42)) + + o.handleRecovery(db.Site{BlogID: 42, SiteStatus: statusDown}, checkerResultSuccess(42)) + + if got := rec.counter("detection.probe_cleared.count"); got != 1 { + t.Fatalf("probe-cleared counter = %d, want 1", got) + } + if got := rec.counter("detection.probe_cleared.server.count"); got != 1 { + t.Fatalf("probe-cleared server counter = %d, want 1", got) + } + if got := rec.timingCount("detection.seems_down_to_probe_cleared.time"); got != 1 { + t.Fatalf("probe-cleared timing count = %d, want 1", got) + } +} + func TestHandleFailureBelowThresholdDoesNotEscalate(t *testing.T) { restore := stubOrchestratorDeps() defer restore() @@ -686,6 +774,60 @@ func TestOrchestratorAccessors(t *testing.T) { } } +func TestClaimBucketsUsesPinnedRangeWithoutHostTable(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + cfg := setTestConfig(t) + min, max := 12, 34 + cfg.PinnedBucketMin = &min + cfg.PinnedBucketMax = &max + + var dynamicClaimCalled bool + dbClaimBuckets = func(string, int, int, int) (int, int, error) { + dynamicClaimCalled = true + return 0, 0, nil + } + + o := &Orchestrator{hostname: "host-a"} + if err := o.ClaimBuckets(); err != nil { + t.Fatalf("ClaimBuckets: %v", err) + } + if dynamicClaimCalled { + t.Fatal("ClaimBuckets called dynamic jetmon_hosts claim in pinned mode") + } + if o.bucketMin != 12 || o.bucketMax != 34 { + t.Fatalf("bucket range = %d-%d, want 12-34", o.bucketMin, o.bucketMax) + } +} + +func TestRunRoundSkipsHeartbeatWhenPinned(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + cfg := setTestConfig(t) + min, max := 12, 34 + cfg.PinnedBucketMin = &min + cfg.PinnedBucketMax = &max + + var heartbeatCalled bool + dbHeartbeat = func(context.Context, string) error { + heartbeatCalled = true + return nil + } + dbGetSitesForBucket = func(_ context.Context, gotMin, gotMax, _ int, _ bool) ([]db.Site, error) { + if gotMin != 12 || gotMax != 34 { + t.Fatalf("fetch buckets = %d-%d, want 12-34", gotMin, gotMax) + } + return nil, nil + } + + o := &Orchestrator{ctx: context.Background(), hostname: "host-a"} + o.runRound() + + if heartbeatCalled { + t.Fatal("runRound updated jetmon_hosts heartbeat in pinned mode") + } +} + func TestRetryQueueAllBlogIDs(t *testing.T) { q := newRetryQueue() q.record(checkerResultFailure(1)) @@ -735,11 +877,78 @@ func TestIsAlertSuppressedCustomCooldown(t *testing.T) { } } +func TestCheckLegacyProjectionDriftEmitsGaugeAndWarningCounter(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + cfg := setTestConfig(t) + cfg.LegacyStatusProjectionEnable = true + + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + dbCountProjectionDrift = func(_ context.Context, bucketMin, bucketMax int) (int, error) { + if bucketMin != 10 || bucketMax != 20 { + t.Fatalf("drift check buckets = %d-%d, want 10-20", bucketMin, bucketMax) + } + return 3, nil + } + + o := &Orchestrator{ctx: context.Background(), bucketMin: 10, bucketMax: 20} + o.checkLegacyProjectionDrift(cfg) + + if got := rec.gauge("projection.drift.count"); got != 3 { + t.Fatalf("projection.drift.count = %d, want 3", got) + } + if got := rec.counter("projection.drift.detected.count"); got != 1 { + 
t.Fatalf("projection.drift.detected.count = %d, want 1", got) + } +} + +func TestCheckLegacyProjectionDriftSkipsWhenProjectionDisabled(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + cfg := setTestConfig(t) + cfg.LegacyStatusProjectionEnable = false + + var called bool + dbCountProjectionDrift = func(context.Context, int, int) (int, error) { + called = true + return 0, nil + } + + o := &Orchestrator{ctx: context.Background()} + o.checkLegacyProjectionDrift(cfg) + if called { + t.Fatal("drift check should be skipped when legacy projection is disabled") + } +} + +func TestCheckLegacyProjectionDriftEmitsErrorCounter(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + cfg := setTestConfig(t) + cfg.LegacyStatusProjectionEnable = true + + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + dbCountProjectionDrift = func(context.Context, int, int) (int, error) { + return 0, fmt.Errorf("db failed") + } + + o := &Orchestrator{ctx: context.Background()} + o.checkLegacyProjectionDrift(cfg) + if got := rec.counter("projection.drift.check_error.count"); got != 1 { + t.Fatalf("projection.drift.check_error.count = %d, want 1", got) + } +} + func TestSendNotificationBothRetriesFail(t *testing.T) { restore := stubOrchestratorDeps() defer restore() setTestConfig(t) + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + calls := 0 wpcomNotifyFunc = func(_ *wpcom.Client, _ wpcom.Notification) error { calls++ @@ -765,6 +974,21 @@ func TestSendNotificationBothRetriesFail(t *testing.T) { if updateAlertCalled { t.Fatal("dbUpdateLastAlertSent should not be called when both retries fail") } + for stat, want := range map[string]int{ + "wpcom.notification.attempt.count": 1, + "wpcom.notification.status.confirmed_down.attempt.count": 1, + "wpcom.notification.error.count": 2, + "wpcom.notification.status.confirmed_down.error.count": 2, + "wpcom.notification.retry.count": 1, + "wpcom.notification.failed.count": 1, + "wpcom.notification.status.confirmed_down.failed.count": 1, + "wpcom.notification.delivered.count": 0, + "wpcom.notification.status.confirmed_down.delivered.count": 0, + } { + if got := rec.counter(stat); got != want { + t.Fatalf("%s = %d, want %d", stat, got, want) + } + } } func TestEscalateToVerifliersNoClients(t *testing.T) { @@ -928,3 +1152,254 @@ func TestHandleFailureEscalatesAfterThreshold(t *testing.T) { t.Fatal("expected escalation to verifliers after NumOfChecks failures") } } + +func TestHandleFailureEmitsSeemsDownMetrics(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + setTestConfig(t) + + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + + firstFailureAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + nowFunc = func() time.Time { return firstFailureAt.Add(2 * time.Second) } + + res := checkerResultFailure(42) + res.Timestamp = firstFailureAt + + o := &Orchestrator{ + retries: newRetryQueue(), + wpcom: &wpcom.Client{}, + hostname: "local-host", + ctx: context.Background(), + } + o.handleFailure(db.Site{BlogID: 42, MonitorURL: "https://example.com", SiteStatus: statusRunning}, res) + + if got := rec.counter("detection.seems_down.open.count"); got != 1 { + t.Fatalf("seems-down open counter = %d, want 1", got) + } + if got := rec.counter("detection.failure.server.count"); got != 1 { + t.Fatalf("failure class counter = %d, want 1", got) + } + if got := rec.counter("detection.seems_down.open.server.count"); got != 1 { + 
t.Fatalf("seems-down class counter = %d, want 1", got) + } + if got := rec.timingCount("detection.first_failure_to_seems_down.time"); got != 1 { + t.Fatalf("first failure timing count = %d, want 1", got) + } +} + +func TestEscalateToVerifliersEmitsConfirmedMetrics(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + + cfg := setTestConfig(t) + cfg.PeerOfflineLimit = 1 + + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + + wpcomNotifyFunc = func(_ *wpcom.Client, _ wpcom.Notification) error { return nil } + dbUpdateLastAlertSent = func(context.Context, int64, time.Time) error { return nil } + veriflierCheckFunc = func(c *veriflier.VeriflierClient, _ context.Context, req veriflier.CheckRequest) (*veriflier.CheckResult, error) { + return &veriflier.CheckResult{ + BlogID: req.BlogID, + Host: c.Addr(), + Success: false, + HTTPCode: 500, + RequestID: req.RequestID, + }, nil + } + + o := &Orchestrator{ + retries: newRetryQueue(), + wpcom: &wpcom.Client{}, + ctx: context.Background(), + hostname: "local-host", + veriflierClients: []*veriflier.VeriflierClient{ + veriflier.NewVeriflierClient("v1", ""), + }, + } + + fail := checkerResultFailure(321) + o.retries.record(fail) + entry := o.retries.get(321) + o.escalateToVerifliers(db.Site{BlogID: 321, MonitorURL: "https://example.com", SiteStatus: statusRunning}, entry) + + for stat, want := range map[string]int{ + "detection.verifier.escalation.count": 1, + "verifier.rpc.success.count": 1, + "verifier.host.v1.rpc.success.count": 1, + "verifier.vote.confirm_down.count": 1, + "verifier.host.v1.vote.confirm_down.count": 1, + "detection.verifier.quorum_met.count": 1, + "detection.down.confirmed.count": 1, + "detection.down.confirmed.server.count": 1, + } { + if got := rec.counter(stat); got != want { + t.Fatalf("%s = %d, want %d", stat, got, want) + } + } + for _, stat := range []string{ + "detection.first_failure_to_verification.time", + "verifier.rpc.duration", + "verifier.host.v1.rpc.duration", + "detection.seems_down_to_down.time", + } { + if got := rec.timingCount(stat); got != 1 { + t.Fatalf("%s timing count = %d, want 1", stat, got) + } + } +} + +func TestEscalateToVerifliersEmitsFalseAlarmMetrics(t *testing.T) { + restore := stubOrchestratorDeps() + defer restore() + + cfg := setTestConfig(t) + cfg.PeerOfflineLimit = 1 + + rec := newRecordingMetrics() + metricsClientFunc = func() metricsClient { return rec } + + dbRecordFalsePositive = func(int64, int, int, int64) error { return nil } + wpcomNotifyFunc = func(_ *wpcom.Client, _ wpcom.Notification) error { + t.Fatal("notification should not be sent for false alarm") + return nil + } + veriflierCheckFunc = func(c *veriflier.VeriflierClient, _ context.Context, req veriflier.CheckRequest) (*veriflier.CheckResult, error) { + return &veriflier.CheckResult{ + BlogID: req.BlogID, + Host: c.Addr(), + Success: true, + HTTPCode: 200, + RequestID: req.RequestID, + }, nil + } + + o := &Orchestrator{ + retries: newRetryQueue(), + wpcom: &wpcom.Client{}, + ctx: context.Background(), + hostname: "local-host", + veriflierClients: []*veriflier.VeriflierClient{ + veriflier.NewVeriflierClient("v1", ""), + }, + } + + fail := checkerResultFailure(654) + o.retries.record(fail) + entry := o.retries.get(654) + o.escalateToVerifliers(db.Site{BlogID: 654, MonitorURL: "https://example.com", SiteStatus: statusRunning}, entry) + + for stat, want := range map[string]int{ + "detection.verifier.escalation.count": 1, + "verifier.rpc.success.count": 1, + 
"verifier.host.v1.rpc.success.count": 1, + "verifier.vote.disagree.count": 1, + "verifier.host.v1.vote.disagree.count": 1, + "detection.verifier.false_alarm.count": 1, + "detection.verifier.false_alarm.server.count": 1, + } { + if got := rec.counter(stat); got != want { + t.Fatalf("%s = %d, want %d", stat, got, want) + } + } + if got := rec.timingCount("detection.seems_down_to_false_alarm.time"); got != 1 { + t.Fatalf("false alarm timing count = %d, want 1", got) + } +} + +func TestMetricSegment(t *testing.T) { + tests := []struct { + in string + want string + }{ + {in: "", want: "unknown"}, + {in: "server", want: "server"}, + {in: "US-West:7803", want: "us_west_7803"}, + {in: " eu.central-1 ", want: "eu_central_1"}, + {in: "://", want: "unknown"}, + } + + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + if got := metricSegment(tt.in); got != tt.want { + t.Fatalf("metricSegment(%q) = %q, want %q", tt.in, got, tt.want) + } + }) + } +} + +func TestWPCOMStatusMetricSegment(t *testing.T) { + tests := []struct { + status int + want string + }{ + {status: statusDown, want: "down"}, + {status: statusRunning, want: "running"}, + {status: statusConfirmedDown, want: "confirmed_down"}, + {status: 99, want: "unknown"}, + } + + for _, tt := range tests { + t.Run(tt.want, func(t *testing.T) { + if got := wpcomStatusMetricSegment(tt.status); got != tt.want { + t.Fatalf("wpcomStatusMetricSegment(%d) = %q, want %q", tt.status, got, tt.want) + } + }) + } +} + +type recordingMetrics struct { + mu sync.Mutex + counters map[string]int + gauges map[string]int + timings map[string][]time.Duration +} + +func newRecordingMetrics() *recordingMetrics { + return &recordingMetrics{ + counters: make(map[string]int), + gauges: make(map[string]int), + timings: make(map[string][]time.Duration), + } +} + +func (r *recordingMetrics) Increment(stat string, value int) { + r.mu.Lock() + defer r.mu.Unlock() + r.counters[stat] += value +} + +func (r *recordingMetrics) Gauge(stat string, value int) { + r.mu.Lock() + defer r.mu.Unlock() + r.gauges[stat] = value +} + +func (r *recordingMetrics) Timing(stat string, d time.Duration) { + r.mu.Lock() + defer r.mu.Unlock() + r.timings[stat] = append(r.timings[stat], d) +} + +func (r *recordingMetrics) EmitMemStats() {} + +func (r *recordingMetrics) counter(stat string) int { + r.mu.Lock() + defer r.mu.Unlock() + return r.counters[stat] +} + +func (r *recordingMetrics) gauge(stat string) int { + r.mu.Lock() + defer r.mu.Unlock() + return r.gauges[stat] +} + +func (r *recordingMetrics) timingCount(stat string) int { + r.mu.Lock() + defer r.mu.Unlock() + return len(r.timings[stat]) +} diff --git a/internal/orchestrator/retry.go b/internal/orchestrator/retry.go index 44e08e81..0faf910e 100644 --- a/internal/orchestrator/retry.go +++ b/internal/orchestrator/retry.go @@ -9,12 +9,13 @@ import ( // retryEntry tracks local retry state for a site that has failed at least once. type retryEntry struct { - blogID int64 - url string - failCount int - firstFailAt time.Time - lastResult checker.Result - checks []checker.Result // all check results since first failure + blogID int64 + url string + failCount int + firstFailAt time.Time + lastResult checker.Result + checks []checker.Result // all check results since first failure + eventID int64 // jetmon_events.id for the open Seems Down event; 0 if not yet opened or eventstore unavailable } // retryQueue holds sites awaiting local retry or veriflier escalation. 
diff --git a/internal/veriflier/client.go b/internal/veriflier/client.go index ae888e9a..094ea95e 100644 --- a/internal/veriflier/client.go +++ b/internal/veriflier/client.go @@ -1,31 +1,56 @@ package veriflier import ( + "bytes" "context" + "crypto/rand" + "encoding/hex" "encoding/json" "fmt" + "net" "net/http" - "strings" "time" ) -// VeriflierClient sends check batches to a remote Veriflier via gRPC. -// Until protoc-generated stubs are in place this implementation uses a -// lightweight JSON-over-HTTP transport on the same port, making it fully -// functional without a protoc dependency. Swap in the generated gRPC client -// by replacing the send() method after running `make generate`. +// VeriflierClient sends check batches to a remote Veriflier over the v2 +// production JSON-over-HTTP transport. type VeriflierClient struct { - addr string - authToken string + addr string + authToken string httpClient *http.Client } // NewVeriflierClient creates a client targeting the given address (host:port). +// +// The HTTP transport is tuned for the orchestrator's hot-path use: many +// short-lived RPCs to the same verifier host during outage waves. Default +// MaxIdleConnsPerHost=2 forces frequent reconnects under any concurrency above +// 2; we raise it so the orchestrator's per-verifier escalation goroutines +// reuse a small pool of warm connections. +// +// No client-level Timeout is set. Per-call deadlines come from the caller's +// context (the orchestrator wraps each escalation with NET_COMMS_TIMEOUT + +// headroom). A blanket client.Timeout would override that — see Go's +// http.Client docs: client.Timeout is enforced regardless of ctx, so leaving +// it unset means ctx is the only deadline and is honored exactly. func NewVeriflierClient(addr, authToken string) *VeriflierClient { + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 5 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 20, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 5 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + ForceAttemptHTTP2: true, + } return &VeriflierClient{ - addr: addr, - authToken: authToken, - httpClient: &http.Client{Timeout: 30 * time.Second}, + addr: addr, + authToken: authToken, + httpClient: &http.Client{Transport: transport}, } } @@ -36,6 +61,9 @@ func (c *VeriflierClient) Addr() string { // Check sends a single site check request to the Veriflier and returns the result. func (c *VeriflierClient) Check(ctx context.Context, req CheckRequest) (*CheckResult, error) { + if req.RequestID == "" { + req.RequestID = NewRequestID() + } results, err := c.CheckBatch(ctx, []CheckRequest{req}) if err != nil { return nil, err @@ -46,7 +74,8 @@ func (c *VeriflierClient) Check(ctx context.Context, req CheckRequest) (*CheckRe return &results[0], nil } -// CheckBatch sends multiple check requests to the Veriflier. +// CheckBatch sends multiple check requests to the Veriflier. Each request +// without a RequestID is given a fresh one; existing RequestIDs are preserved. 
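+//
+// Per-call deadlines are the caller's responsibility (see NewVeriflierClient:
+// no client-level Timeout is set). An illustrative call site; the 10s value
+// here is arbitrary, the orchestrator derives its own from NET_COMMS_TIMEOUT:
+//
+//	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+//	defer cancel()
+//	results, err := client.CheckBatch(ctx, reqs)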
func (c *VeriflierClient) CheckBatch(ctx context.Context, reqs []CheckRequest) ([]CheckResult, error) { type batchReq struct { Sites []CheckRequest `json:"sites"` @@ -55,13 +84,19 @@ func (c *VeriflierClient) CheckBatch(ctx context.Context, reqs []CheckRequest) ( Results []CheckResult `json:"results"` } + for i := range reqs { + if reqs[i].RequestID == "" { + reqs[i].RequestID = NewRequestID() + } + } + body, err := json.Marshal(batchReq{Sites: reqs}) if err != nil { return nil, err } url := fmt.Sprintf("http://%s/check", c.addr) - httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, strings.NewReader(string(body))) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body)) if err != nil { return nil, err } @@ -97,6 +132,9 @@ func (c *VeriflierClient) Ping(ctx context.Context) (string, error) { return "", err } defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("veriflier status returned %d", resp.StatusCode) + } var s struct { Status string `json:"status"` @@ -105,3 +143,16 @@ func (c *VeriflierClient) Ping(ctx context.Context) (string, error) { _ = json.NewDecoder(resp.Body).Decode(&s) return s.Version, nil } + +// NewRequestID returns a 16-byte random id, hex-encoded (32 chars). Used as +// the RPC correlation id between Monitor and Verifier. Crypto/rand backed so +// IDs are unpredictable; this isn't a security primitive but it's free. +func NewRequestID() string { + var b [16]byte + if _, err := rand.Read(b[:]); err != nil { + // Fall back to a timestamp-based id; collisions are vanishingly + // unlikely at our request rates and the id is correlation-only. + return fmt.Sprintf("ts-%d", time.Now().UnixNano()) + } + return hex.EncodeToString(b[:]) +} diff --git a/internal/veriflier/server.go b/internal/veriflier/server.go index 135caf5c..c9145284 100644 --- a/internal/veriflier/server.go +++ b/internal/veriflier/server.go @@ -1,27 +1,59 @@ package veriflier import ( + "context" "encoding/json" "fmt" "log" "net/http" + "time" + + "github.com/Automattic/jetmon/internal/metrics" ) // Server listens for inbound connections from the Monitor and dispatches // check batches to the local checker. Used by the Veriflier binary. // // This is the server-side counterpart to VeriflierClient. It implements -// the same JSON-over-HTTP transport and is replaced by a generated gRPC -// server after running `make generate`. +// the v2 production JSON-over-HTTP transport. +// +// The HTTP server is configured with read/write/idle timeouts so a slow or +// stalled client cannot pin a goroutine indefinitely (slowloris-style DoS). +// Shutdown(ctx) drains in-flight requests up to the caller's deadline before +// closing the listener. type Server struct { authToken string checkFn func(req CheckRequest) CheckResult addr string hostname string version string + httpSrv *http.Server } +// Timeout defaults for the verifier HTTP server. These are conservative — the +// expected pattern is a small batch POST that completes in well under a +// second. Longer values would make slowloris cheaper. +const ( + readHeaderTimeout = 5 * time.Second + readTimeout = 30 * time.Second + writeTimeout = 35 * time.Second // > readTimeout so the response can flush + idleTimeout = 120 * time.Second +) + +// maxRequestBodyBytes caps an inbound POST /check body. 
A typical batch is +// ~200 sites × ~250 bytes/site ≈ 50KB, so 10MB is generous headroom and +// closes a trivial DoS vector (an attacker that has the auth token can't +// stream gigabytes through the JSON decoder before we notice). +const maxRequestBodyBytes = 10 * 1024 * 1024 + // NewServer creates a Server that calls checkFn for each check request. +// +// authToken must be non-empty in production. An empty token would create a +// dangerous edge case where any request with `Authorization: Bearer ` (with +// a trailing space and nothing else) would be accepted; callers that +// receive an empty token from config should reject it before reaching here. +// We don't validate at construct time because tests exercise the empty-token +// path via httptest, but veriflier2/cmd/main.go does check at startup. func NewServer(addr, authToken, hostname, version string, checkFn func(CheckRequest) CheckResult) *Server { return &Server{ addr: addr, @@ -32,17 +64,39 @@ func NewServer(addr, authToken, hostname, version string, checkFn func(CheckRequ } } -// Listen starts the HTTP server. Blocks until the server exits. +// Listen starts the HTTP server. Blocks until the server exits via Shutdown +// or an unrecoverable error. Returns http.ErrServerClosed on a clean Shutdown. func (s *Server) Listen() error { mux := http.NewServeMux() mux.HandleFunc("/check", s.handleCheck) mux.HandleFunc("/status", s.handleStatus) + s.httpSrv = &http.Server{ + Addr: s.addr, + Handler: mux, + ReadHeaderTimeout: readHeaderTimeout, + ReadTimeout: readTimeout, + WriteTimeout: writeTimeout, + IdleTimeout: idleTimeout, + } + log.Printf("veriflier: listening on %s", s.addr) - return http.ListenAndServe(s.addr, mux) + return s.httpSrv.ListenAndServe() +} + +// Shutdown gracefully stops the server, allowing in-flight requests to +// complete up to the context's deadline. Safe to call before Listen — the +// underlying http.Server is nil-checked. +func (s *Server) Shutdown(ctx context.Context) error { + if s.httpSrv == nil { + return nil + } + return s.httpSrv.Shutdown(ctx) } func (s *Server) handleCheck(w http.ResponseWriter, r *http.Request) { + start := time.Now() + if r.Method != http.MethodPost { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return @@ -50,6 +104,7 @@ func (s *Server) handleCheck(w http.ResponseWriter, r *http.Request) { token := r.Header.Get("Authorization") if token != "Bearer "+s.authToken { + incrementMetric("verifier.auth.rejected.count", 1) http.Error(w, "unauthorized", http.StatusUnauthorized) return } @@ -61,19 +116,38 @@ func (s *Server) handleCheck(w http.ResponseWriter, r *http.Request) { Results []CheckResult `json:"results"` } + // Cap the body before decoding. An overlong body produces a clear 413 + // rather than streaming through the JSON decoder until something else + // times out. + r.Body = http.MaxBytesReader(w, r.Body, maxRequestBodyBytes) + var req batchReq if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + // MaxBytesReader's "http: request body too large" error is the + // signal we want to surface as 413; everything else is a malformed + // JSON payload (400). 
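+		// (On Go 1.19+, matching *http.MaxBytesError with errors.As would be
+		// a sturdier check than comparing the error string.)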
+ if err.Error() == "http: request body too large" { + http.Error(w, "request body too large", http.StatusRequestEntityTooLarge) + return + } http.Error(w, fmt.Sprintf("decode: %v", err), http.StatusBadRequest) return } results := make([]CheckResult, 0, len(req.Sites)) for _, site := range req.Sites { + // Echo RequestID so the orchestrator can correlate this reply with the + // audit row it wrote when escalating. + log.Printf("veriflier: check blog_id=%d request_id=%s url=%s", site.BlogID, site.RequestID, site.URL) res := s.checkFn(site) res.Host = s.hostname + res.RequestID = site.RequestID results = append(results, res) } + incrementMetric("verifier.checks.received.count", len(req.Sites)) + timingMetric("verifier.checks.duration.timer", time.Since(start)) + w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(batchResp{Results: results}) } @@ -85,3 +159,18 @@ func (s *Server) handleStatus(w http.ResponseWriter, r *http.Request) { "version": s.version, }) } + +// incrementMetric and timingMetric are nil-safe wrappers around the global +// StatsD client. The verifier binary may run without metrics configured (no +// STATSD_ADDR env var), in which case these are no-ops. +func incrementMetric(name string, value int) { + if m := metrics.Global(); m != nil { + m.Increment(name, value) + } +} + +func timingMetric(name string, d time.Duration) { + if m := metrics.Global(); m != nil { + m.Timing(name, d) + } +} diff --git a/internal/veriflier/types.go b/internal/veriflier/types.go index 5efe4dc1..0e08eabd 100644 --- a/internal/veriflier/types.go +++ b/internal/veriflier/types.go @@ -1,10 +1,15 @@ // Package veriflier provides the client and server for Monitor↔Veriflier // communication. The current transport is JSON-over-HTTP; types mirror the -// proto definitions in proto/veriflier.proto. Run `make generate` after -// installing protoc to replace this with generated gRPC stubs. +// schema shape in proto/veriflier.proto, which is retained as a reference for +// a possible future transport. package veriflier // CheckRequest is a single site to check, sent from Monitor to Veriflier. +// +// RequestID is a client-generated correlation id (16-byte hex). The verifier +// echoes it back in the response and stamps it on its server-side log line so +// that "the orchestrator escalated → this verifier observed → this audit row +// in the monitor DB" can be reconstructed without timestamp matching. type CheckRequest struct { BlogID int64 URL string @@ -12,6 +17,7 @@ type CheckRequest struct { Keyword string CustomHeaders map[string]string RedirectPolicy string + RequestID string } // CheckResult is a single check outcome returned by the Veriflier. 
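A hedged usage sketch (editorial, not a diff line) of the correlation id: callers may supply RequestID themselves or leave it empty and let the client fill it in, as the tests further down exercise. The client and ctx values are assumed to be in scope.

    // Caller-supplied id, preserved end to end by client and server:
    res, err := client.Check(ctx, CheckRequest{
    	BlogID:    123,
    	URL:       "https://example.com",
    	RequestID: NewRequestID(),
    })
    // Or omit RequestID and let Check / CheckBatch generate one before sending.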
@@ -23,4 +29,5 @@ type CheckResult struct { HTTPCode int32 ErrorCode int32 RTTMs int64 + RequestID string // echoed from CheckRequest.RequestID } diff --git a/internal/veriflier/veriflier_test.go b/internal/veriflier/veriflier_test.go index b28c7ae9..69e7e98a 100644 --- a/internal/veriflier/veriflier_test.go +++ b/internal/veriflier/veriflier_test.go @@ -3,10 +3,12 @@ package veriflier import ( "bytes" "context" + "encoding/hex" "encoding/json" "net/http" "net/http/httptest" "testing" + "time" ) func newTestServer(checkFn func(CheckRequest) CheckResult) (*Server, *httptest.Server) { @@ -176,6 +178,22 @@ func TestClientPing(t *testing.T) { } } +func TestClientPingRejectsErrorStatus(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "unavailable", http.StatusServiceUnavailable) + })) + defer ts.Close() + + client := NewVeriflierClient(ts.Listener.Addr().String(), "secret") + _, err := client.Ping(context.Background()) + if err == nil { + t.Fatal("Ping() expected error") + } + if err.Error() != "veriflier status returned 503" { + t.Fatalf("Ping() error = %v", err) + } +} + func TestClientBatchRoundTrip(t *testing.T) { _, ts := newTestServer(func(req CheckRequest) CheckResult { return CheckResult{BlogID: req.BlogID, Success: true, HTTPCode: 200} @@ -205,3 +223,130 @@ func TestClientRejectsUnauthorized(t *testing.T) { t.Fatal("Check() expected error for wrong auth token") } } + +func TestNewRequestID(t *testing.T) { + id := NewRequestID() + if len(id) != 32 { + t.Fatalf("NewRequestID() len = %d, want 32", len(id)) + } + if _, err := hex.DecodeString(id); err != nil { + t.Fatalf("NewRequestID() not hex: %v", err) + } + other := NewRequestID() + if id == other { + t.Fatal("NewRequestID() collided across two calls") + } +} + +func TestRequestIDIsEchoed(t *testing.T) { + // Server should reflect each request's RequestID into the corresponding result. + _, ts := newTestServer(func(req CheckRequest) CheckResult { + return CheckResult{BlogID: req.BlogID, Success: true, HTTPCode: 200} + }) + defer ts.Close() + + client := NewVeriflierClient(ts.Listener.Addr().String(), "secret") + res, err := client.Check(context.Background(), CheckRequest{BlogID: 99, URL: "https://example.com"}) + if err != nil { + t.Fatalf("Check() error = %v", err) + } + if res.RequestID == "" { + t.Fatal("RequestID empty in response — client should auto-generate and server should echo") + } + if len(res.RequestID) != 32 { + t.Fatalf("RequestID len = %d, want 32 (16-byte hex)", len(res.RequestID)) + } +} + +func TestRequestIDPreservedWhenCallerSets(t *testing.T) { + // When the caller sets RequestID explicitly, the client must not overwrite it. + const callerID = "caller-supplied-id" + _, ts := newTestServer(func(req CheckRequest) CheckResult { + return CheckResult{BlogID: req.BlogID, Success: true} + }) + defer ts.Close() + + client := NewVeriflierClient(ts.Listener.Addr().String(), "secret") + res, err := client.Check(context.Background(), CheckRequest{ + BlogID: 1, + URL: "https://example.com", + RequestID: callerID, + }) + if err != nil { + t.Fatalf("Check() error = %v", err) + } + if res.RequestID != callerID { + t.Fatalf("RequestID = %q, want %q (caller-supplied id was overwritten)", res.RequestID, callerID) + } +} + +func TestServerRejectsOversizedBody(t *testing.T) { + // The body cap is the only DoS mitigation between an authorized caller + // and the JSON decoder. 
A body over the 10MB cap should be rejected + // with 413 — and crucially, the checkFn should never be invoked. + _, ts := newTestServer(func(req CheckRequest) CheckResult { + t.Fatal("checkFn should not be called for oversized body") + return CheckResult{} + }) + defer ts.Close() + + // Build a body just over the 10MB cap. Padding lives in a custom_headers + // value so the JSON shape is still valid (we want to confirm the cap + // fires, not that the JSON is malformed). + pad := make([]byte, 11*1024*1024) + for i := range pad { + pad[i] = 'x' + } + body := bytes.NewBuffer(nil) + body.WriteString(`{"sites":[{"BlogID":1,"URL":"https://example.com","CustomHeaders":{"X-Pad":"`) + body.Write(pad) + body.WriteString(`"}}]}`) + + req, _ := http.NewRequest(http.MethodPost, ts.URL+"/check", body) + req.Header.Set("Authorization", "Bearer secret") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("request error: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusRequestEntityTooLarge { + t.Fatalf("status = %d, want 413", resp.StatusCode) + } +} + +func TestServerShutdownDrains(t *testing.T) { + // Shutdown should drain in-flight requests up to the context deadline, + // not yank the connection mid-response. + srv := NewServer("127.0.0.1:0", "secret", "test-host", "1.0", func(req CheckRequest) CheckResult { + // Simulate a slow check so Shutdown has something to drain. + time.Sleep(50 * time.Millisecond) + return CheckResult{BlogID: req.BlogID, Success: true} + }) + + // Listen in background; surface the listener's actual port via httptest hack. + // Using httptest.NewUnstartedServer with our handler avoids the port-binding race. + mux := http.NewServeMux() + mux.HandleFunc("/check", srv.handleCheck) + mux.HandleFunc("/status", srv.handleStatus) + ts := httptest.NewServer(mux) + defer ts.Close() + + // Fire a request, then call Shutdown on the underlying httptest.Server's + // http.Server. We're testing the *handler* path with timeouts; the + // httptest.Server itself manages the listener. + client := NewVeriflierClient(ts.Listener.Addr().String(), "secret") + done := make(chan error, 1) + go func() { + _, err := client.Check(context.Background(), CheckRequest{BlogID: 1, URL: "https://example.com"}) + done <- err + }() + + // Give the request time to land in the handler's sleep, then verify it + // completes successfully (no panic, no shutdown mid-response). + if err := <-done; err != nil { + t.Fatalf("in-flight check failed: %v", err) + } +} diff --git a/internal/webhooks/deliveries.go b/internal/webhooks/deliveries.go new file mode 100644 index 00000000..cfed37ab --- /dev/null +++ b/internal/webhooks/deliveries.go @@ -0,0 +1,369 @@ +package webhooks + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "time" +) + +// ErrDeliveryNotFound is returned by Get / Retry when the delivery row +// doesn't exist. +var ErrDeliveryNotFound = errors.New("webhooks: delivery not found") + +// Delivery is the in-memory shape of a jetmon_webhook_deliveries row. +type Delivery struct { + ID int64 + WebhookID int64 + TransitionID int64 + EventID int64 + EventType string + Payload json.RawMessage // frozen at create time + Status Status + Attempt int + NextAttemptAt *time.Time + LastStatusCode *int + LastResponse *string + LastAttemptAt *time.Time + DeliveredAt *time.Time + CreatedAt time.Time +} + +// EnqueueInput carries everything needed to insert a delivery row. 
payload +// is captured by the caller (the dispatcher builds it from the event + +// transition + site context) and stored verbatim. +type EnqueueInput struct { + WebhookID int64 + TransitionID int64 + EventID int64 + EventType string + Payload json.RawMessage +} + +// Enqueue inserts a pending delivery with attempt=0 and next_attempt_at=now, +// signaling the worker to pick it up on the next tick. Uses INSERT IGNORE +// against the (webhook_id, transition_id) UNIQUE KEY so concurrent +// dispatchers don't create duplicate deliveries. +// +// Returns the new delivery's id, or 0 if the row was a duplicate (in which +// case some other dispatcher already enqueued this combination). +func Enqueue(ctx context.Context, db *sql.DB, in EnqueueInput) (int64, error) { + res, err := db.ExecContext(ctx, ` + INSERT IGNORE INTO jetmon_webhook_deliveries + (webhook_id, transition_id, event_id, event_type, payload, + status, attempt, next_attempt_at) + VALUES (?, ?, ?, ?, ?, 'pending', 0, CURRENT_TIMESTAMP)`, + in.WebhookID, in.TransitionID, in.EventID, in.EventType, []byte(in.Payload), + ) + if err != nil { + return 0, fmt.Errorf("webhooks: enqueue: %w", err) + } + id, err := res.LastInsertId() + if err != nil { + // MySQL's LastInsertId after INSERT IGNORE that didn't insert returns + // 0 with no error; getting an error here is an unusual driver quirk. + return 0, fmt.Errorf("webhooks: last insert id: %w", err) + } + affected, _ := res.RowsAffected() + if affected == 0 { + // Row was a duplicate — another dispatcher already enqueued this + // (webhook, transition) combination. Not an error condition. + return 0, nil + } + return id, nil +} + +// claimLockDuration is how far ClaimReady pushes next_attempt_at out +// when it claims a row. It must outlast the worker's per-delivery wall +// clock so the in-flight goroutine has time to write its real result +// (delivered → next_attempt_at NULL, failed → next_attempt_at = retry +// time) before this in-flight lease expires. The default worker +// HTTPTimeout is 30s with a 5s buffer; 60s gives comfortable headroom. +// +// If a goroutine crashes without updating the row (panic without +// recovery, OOM kill, etc.), the lease expires naturally and the +// row becomes claimable again — natural recovery without operator +// intervention. +const claimLockDuration = 60 * time.Second + +// ClaimReady returns up to limit pending deliveries whose next_attempt_at +// is in the past, ordered by next_attempt_at ASC (oldest first). It claims +// rows with SELECT ... FOR UPDATE inside a transaction so active-active +// delivery workers cannot claim the same row. Each claimed row then gets an +// in-flight lease by pushing next_attempt_at to NOW + +// claimLockDuration before the transaction commits, so subsequent ticks don't +// re-claim a row whose dispatch is still in-flight. The dispatch goroutine +// overwrites next_attempt_at with its real value (NULL on success, retry time +// on failure) when it finishes. +// +// Without the in-flight lease, the deliver loop's 1-second tick re-claims +// any in-flight row up to the per-webhook in-flight cap, producing +// concurrent dispatches and inflating the attempt counter — three +// concurrent claims followed by three failures end up at attempt=3 +// after a single round. The lease prevents that after the transaction commits. 
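+//
+// Illustrative timeline (numbers for orientation only): worker A claims a
+// row at t=0 and leases it until t+60s (claimLockDuration); another worker's
+// tick at t+1s sees next_attempt_at in the future and skips the row; A's
+// dispatch finishes at t+3s and overwrites next_attempt_at with its real
+// value, NULL on delivery or the scheduled retry time on failure.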
+func ClaimReady(ctx context.Context, db *sql.DB, limit int) ([]Delivery, error) { + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return nil, fmt.Errorf("webhooks: begin claim: %w", err) + } + committed := false + defer func() { + if !committed { + _ = tx.Rollback() + } + }() + + rows, err := tx.QueryContext(ctx, ` + SELECT id, webhook_id, transition_id, event_id, event_type, payload, + status, attempt, next_attempt_at, last_status_code, last_response, + last_attempt_at, delivered_at, created_at + FROM jetmon_webhook_deliveries + WHERE status = 'pending' + AND (next_attempt_at IS NULL OR next_attempt_at <= CURRENT_TIMESTAMP) + ORDER BY next_attempt_at ASC + LIMIT ? + FOR UPDATE`, limit) + if err != nil { + return nil, fmt.Errorf("webhooks: claim ready: %w", err) + } + var claimed []Delivery + for rows.Next() { + d, err := scanDeliveryRow(rows) + if err != nil { + rows.Close() + return nil, err + } + claimed = append(claimed, *d) + } + if err := rows.Err(); err != nil { + rows.Close() + return nil, err + } + if err := rows.Close(); err != nil { + return nil, fmt.Errorf("webhooks: close claim rows: %w", err) + } + + lockUntil := time.Now().Add(claimLockDuration).UTC() + for i := range claimed { + res, err := tx.ExecContext(ctx, ` + UPDATE jetmon_webhook_deliveries + SET next_attempt_at = ? + WHERE id = ? + AND status = 'pending'`, + lockUntil, claimed[i].ID) + if err != nil { + return nil, fmt.Errorf("webhooks: claim row %d: %w", claimed[i].ID, err) + } + affected, err := res.RowsAffected() + if err != nil { + return nil, fmt.Errorf("webhooks: claim row %d rows affected: %w", claimed[i].ID, err) + } + if affected != 1 { + return nil, fmt.Errorf("webhooks: claim row %d affected %d rows, want 1", claimed[i].ID, affected) + } + } + if err := tx.Commit(); err != nil { + return nil, fmt.Errorf("webhooks: commit claim: %w", err) + } + committed = true + return claimed, nil +} + +// MarkDelivered records a successful delivery with the response status. +// Sets status=delivered, captures last_status_code, last_response, and +// delivered_at. Subsequent retries are not scheduled — the row is terminal. +func MarkDelivered(ctx context.Context, db *sql.DB, id int64, statusCode int, responseBody string) error { + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_webhook_deliveries + SET status = 'delivered', + last_status_code = ?, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + delivered_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = NULL + WHERE id = ?`, + statusCode, truncate(responseBody, 2048), id) + if err != nil { + return fmt.Errorf("webhooks: mark delivered: %w", err) + } + return nil +} + +// ScheduleRetry bumps the attempt counter and sets next_attempt_at per the +// retry schedule. Captures the status/response from the failed attempt. +// If the next attempt would exceed maxAttempts, the row is marked +// abandoned instead. 
+func ScheduleRetry(ctx context.Context, db *sql.DB, id int64, statusCode int, responseBody string, nextAttempt time.Time, abandon bool) error { + if abandon { + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_webhook_deliveries + SET status = 'abandoned', + last_status_code = ?, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = NULL + WHERE id = ?`, + statusCode, truncate(responseBody, 2048), id) + if err != nil { + return fmt.Errorf("webhooks: abandon: %w", err) + } + return nil + } + _, err := db.ExecContext(ctx, ` + UPDATE jetmon_webhook_deliveries + SET last_status_code = ?, + last_response = ?, + last_attempt_at = CURRENT_TIMESTAMP, + attempt = attempt + 1, + next_attempt_at = ? + WHERE id = ?`, + statusCode, truncate(responseBody, 2048), nextAttempt.UTC(), id) + if err != nil { + return fmt.Errorf("webhooks: schedule retry: %w", err) + } + return nil +} + +// GetDelivery returns a single delivery row by id. +func GetDelivery(ctx context.Context, db *sql.DB, id int64) (*Delivery, error) { + row := db.QueryRowContext(ctx, ` + SELECT id, webhook_id, transition_id, event_id, event_type, payload, + status, attempt, next_attempt_at, last_status_code, last_response, + last_attempt_at, delivered_at, created_at + FROM jetmon_webhook_deliveries + WHERE id = ?`, id) + d, err := scanDeliveryRow(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrDeliveryNotFound + } + return nil, err + } + return d, nil +} + +// ListDeliveries returns deliveries for a webhook, optionally filtered by +// status, ordered by created_at DESC. Cursor-paginated on id. +func ListDeliveries(ctx context.Context, db *sql.DB, webhookID int64, status Status, cursorID int64, limit int) ([]Delivery, error) { + args := []any{webhookID} + q := ` + SELECT id, webhook_id, transition_id, event_id, event_type, payload, + status, attempt, next_attempt_at, last_status_code, last_response, + last_attempt_at, delivered_at, created_at + FROM jetmon_webhook_deliveries + WHERE webhook_id = ?` + if status != "" { + q += " AND status = ?" + args = append(args, string(status)) + } + if cursorID > 0 { + q += " AND id < ?" + args = append(args, cursorID) + } + q += " ORDER BY id DESC LIMIT ?" + args = append(args, limit) + + rows, err := db.QueryContext(ctx, q, args...) + if err != nil { + return nil, fmt.Errorf("webhooks: list deliveries: %w", err) + } + defer rows.Close() + var out []Delivery + for rows.Next() { + d, err := scanDeliveryRow(rows) + if err != nil { + return nil, err + } + out = append(out, *d) + } + return out, rows.Err() +} + +// RetryDelivery resets an abandoned delivery to pending so the worker +// picks it up on the next tick. Manual operator path: consumer fixed +// their endpoint, wants the previously-failed delivery to fire again. +// +// Resets attempt to 0 (new retry sequence) so the consumer gets the full +// 6 attempts again — they may have just brought their service back and a +// transient failure deserves a fresh budget. +// +// Only abandoned deliveries can be retried via this path. pending +// deliveries are already in the worker's queue; delivered deliveries +// were already accepted by the consumer. +func RetryDelivery(ctx context.Context, db *sql.DB, id int64) error { + res, err := db.ExecContext(ctx, ` + UPDATE jetmon_webhook_deliveries + SET status = 'pending', + attempt = 0, + next_attempt_at = CURRENT_TIMESTAMP, + last_status_code = NULL, + last_response = NULL, + last_attempt_at = NULL + WHERE id = ? 
AND status = 'abandoned'`, id) + if err != nil { + return fmt.Errorf("webhooks: retry delivery: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + // Either the row doesn't exist or it isn't abandoned. Distinguish + // for a useful error message. + d, getErr := GetDelivery(ctx, db, id) + if getErr != nil { + return getErr + } + return fmt.Errorf("webhooks: delivery %d is %s, only abandoned deliveries can be retried", id, d.Status) + } + return nil +} + +func scanDeliveryRow(s rowScanner) (*Delivery, error) { + var ( + d Delivery + payload sql.NullString + nextAttemptAt sql.NullTime + lastStatusCode sql.NullInt64 + lastResponse sql.NullString + lastAttemptAt sql.NullTime + deliveredAt sql.NullTime + statusStr string + ) + if err := s.Scan( + &d.ID, &d.WebhookID, &d.TransitionID, &d.EventID, &d.EventType, &payload, + &statusStr, &d.Attempt, &nextAttemptAt, &lastStatusCode, &lastResponse, + &lastAttemptAt, &deliveredAt, &d.CreatedAt, + ); err != nil { + return nil, err + } + d.Status = Status(statusStr) + if payload.Valid { + d.Payload = json.RawMessage(payload.String) + } + if nextAttemptAt.Valid { + d.NextAttemptAt = &nextAttemptAt.Time + } + if lastStatusCode.Valid { + v := int(lastStatusCode.Int64) + d.LastStatusCode = &v + } + if lastResponse.Valid { + d.LastResponse = &lastResponse.String + } + if lastAttemptAt.Valid { + d.LastAttemptAt = &lastAttemptAt.Time + } + if deliveredAt.Valid { + d.DeliveredAt = &deliveredAt.Time + } + return &d, nil +} + +func truncate(s string, max int) string { + if len(s) <= max { + return s + } + return s[:max] +} diff --git a/internal/webhooks/deliveries_test.go b/internal/webhooks/deliveries_test.go new file mode 100644 index 00000000..eef65110 --- /dev/null +++ b/internal/webhooks/deliveries_test.go @@ -0,0 +1,115 @@ +package webhooks + +import ( + "context" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" +) + +const selectClaimReadySQL = ` SELECT id, webhook_id, transition_id, event_id, event_type, payload, status, attempt, next_attempt_at, last_status_code, last_response, last_attempt_at, delivered_at, created_at FROM jetmon_webhook_deliveries WHERE status = 'pending' AND (next_attempt_at IS NULL OR next_attempt_at <= CURRENT_TIMESTAMP) ORDER BY next_attempt_at ASC LIMIT ? FOR UPDATE` + +const leaseClaimedSQL = ` UPDATE jetmon_webhook_deliveries SET next_attempt_at = ? WHERE id = ? AND status = 'pending'` + +var columnsClaimedDelivery = []string{ + "id", "webhook_id", "transition_id", "event_id", "event_type", + "payload", "status", "attempt", "next_attempt_at", "last_status_code", "last_response", + "last_attempt_at", "delivered_at", "created_at", +} + +// TestClaimReadyClaimsRowsTransactionally verifies that ClaimReady uses +// row-level locks and then leases each claimed row so subsequent ticks do not +// re-claim a still-in-flight delivery. +func TestClaimReadyClaimsRowsTransactionally(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + rows := sqlmock.NewRows(columnsClaimedDelivery). + AddRow(int64(1), int64(7), int64(100), int64(900), "event.opened", + []byte(`{}`), "pending", 0, now, nil, nil, nil, nil, now). + AddRow(int64(2), int64(7), int64(101), int64(901), "event.opened", + []byte(`{}`), "pending", 0, now, nil, nil, nil, nil, now) + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50).WillReturnRows(rows) + mock.ExpectExec(leaseClaimedSQL). 
+ WithArgs(sqlmock.AnyArg(), int64(1)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec(leaseClaimedSQL). + WithArgs(sqlmock.AnyArg(), int64(2)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + + out, err := ClaimReady(context.Background(), db, 50) + if err != nil { + t.Fatalf("ClaimReady: %v", err) + } + if len(out) != 2 { + t.Errorf("got %d claimed, want 2", len(out)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expectations: %v", err) + } +} + +func TestClaimReadyRollsBackWhenLeaseUpdateMisses(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + rows := sqlmock.NewRows(columnsClaimedDelivery). + AddRow(int64(1), int64(7), int64(100), int64(900), "event.opened", + []byte(`{}`), "pending", 0, now, nil, nil, nil, nil, now) + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50).WillReturnRows(rows) + mock.ExpectExec(leaseClaimedSQL). + WithArgs(sqlmock.AnyArg(), int64(1)). + WillReturnResult(sqlmock.NewResult(0, 0)) + mock.ExpectRollback() + + out, err := ClaimReady(context.Background(), db, 50) + if err == nil { + t.Fatal("ClaimReady succeeded after lease update missed") + } + if len(out) != 0 { + t.Fatalf("got %d claimed rows with failed lease update, want 0", len(out)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expectations: %v", err) + } +} + +// TestClaimReadyNoCandidatesCommitsWithoutLeaseUpdates verifies that when the +// SELECT returns nothing, ClaimReady issues no UPDATEs. +func TestClaimReadyNoCandidatesCommitsWithoutLeaseUpdates(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50). + WillReturnRows(sqlmock.NewRows(columnsClaimedDelivery)) + mock.ExpectCommit() + + out, err := ClaimReady(context.Background(), db, 50) + if err != nil { + t.Fatalf("ClaimReady: %v", err) + } + if len(out) != 0 { + t.Errorf("got %d claimed, want 0", len(out)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expectations: %v", err) + } +} diff --git a/internal/webhooks/repository_coverage_test.go b/internal/webhooks/repository_coverage_test.go new file mode 100644 index 00000000..8bc85d22 --- /dev/null +++ b/internal/webhooks/repository_coverage_test.go @@ -0,0 +1,452 @@ +package webhooks + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "strings" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" +) + +var webhookColumns = []string{ + "id", "url", "active", "owner_tenant_id", "events", "site_filter", "state_filter", + "secret_preview", "created_by", "created_at", "updated_at", +} + +func webhookRow(id int64, url string, active uint8, createdAt time.Time) *sqlmock.Rows { + return sqlmock.NewRows(webhookColumns).AddRow( + id, url, active, "tenant-a", + `["event.opened"]`, + `{"site_ids":[42]}`, + `{"states":["Down"]}`, + "_XYZ", "ops", createdAt, createdAt, + ) +} + +func TestCreateWebhookPersistsDefaultsAndFetchesRecord(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + mock.ExpectExec("INSERT INTO jetmon_webhooks"). 
+ WithArgs( + "https://consumer.example/hook", + 1, + nil, + sqlmock.AnyArg(), + sqlmock.AnyArg(), + sqlmock.AnyArg(), + sqlmock.AnyArg(), + sqlmock.AnyArg(), + "ops", + ). + WillReturnResult(sqlmock.NewResult(12, 1)) + mock.ExpectQuery("SELECT id, url, active, owner_tenant_id, events"). + WithArgs(int64(12)). + WillReturnRows(webhookRow(12, "https://consumer.example/hook", 1, now)) + + raw, hook, err := Create(context.Background(), db, CreateInput{ + URL: "https://consumer.example/hook", + Events: []string{EventOpened}, + SiteFilter: SiteFilter{SiteIDs: []int64{42}}, + StateFilter: StateFilter{States: []string{"Down"}}, + CreatedBy: "ops", + }) + if err != nil { + t.Fatalf("Create: %v", err) + } + if !strings.HasPrefix(raw, SecretPrefix) { + t.Fatalf("raw secret = %q, want %s prefix", raw, SecretPrefix) + } + if hook.ID != 12 || !hook.Active || hook.SiteFilter.SiteIDs[0] != 42 || hook.StateFilter.States[0] != "Down" { + t.Fatalf("hook = %+v", hook) + } + if hook.OwnerTenantID == nil || *hook.OwnerTenantID != "tenant-a" { + t.Fatalf("hook.OwnerTenantID = %v, want tenant-a", hook.OwnerTenantID) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestCreateWebhookRejectsInvalidInputBeforeDB(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + if _, _, err := Create(context.Background(), db, CreateInput{}); err == nil { + t.Fatal("Create accepted an empty URL") + } + if _, _, err := Create(context.Background(), db, CreateInput{ + URL: "https://consumer.example/hook", + Events: []string{"event.bogus"}, + }); !errors.Is(err, ErrInvalidEvent) { + t.Fatalf("Create invalid event error = %v, want ErrInvalidEvent", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unexpected sql calls: %v", err) + } +} + +func TestGetWebhookNotFound(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectQuery("SELECT id, url, active, owner_tenant_id, events"). + WithArgs(int64(404)). + WillReturnError(sql.ErrNoRows) + + _, err = Get(context.Background(), db, 404) + if !errors.Is(err, ErrWebhookNotFound) { + t.Fatalf("Get error = %v, want ErrWebhookNotFound", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestListWebhooksScansRows(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + rows := sqlmock.NewRows(webhookColumns). + AddRow(int64(1), "https://a.example", uint8(1), nil, `[]`, `{}`, `{}`, "aaaa", "ops", now, now). + AddRow(int64(2), "https://b.example", uint8(0), "tenant-b", nil, nil, nil, "bbbb", "ops", now, now) + mock.ExpectQuery("SELECT id, url, active, owner_tenant_id, events"). + WillReturnRows(rows) + + hooks, err := List(context.Background(), db) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(hooks) != 2 || hooks[0].Active != true || hooks[1].Active != false { + t.Fatalf("hooks = %+v", hooks) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestListActiveWebhooksScansRows(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + mock.ExpectQuery("SELECT id, url, active, owner_tenant_id, events"). 
+ WillReturnRows(webhookRow(3, "https://active.example", 1, now)) + + hooks, err := ListActive(context.Background(), db) + if err != nil { + t.Fatalf("ListActive: %v", err) + } + if len(hooks) != 1 || hooks[0].ID != 3 { + t.Fatalf("hooks = %+v", hooks) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestTenantScopedWebhookQueriesFilterByOwner(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + active := false + mock.ExpectQuery("WHERE id = \\? AND owner_tenant_id = \\?"). + WithArgs(int64(12), "tenant-a"). + WillReturnRows(webhookRow(12, "https://tenant.example/hook", 1, now)) + mock.ExpectQuery("WHERE owner_tenant_id = \\? ORDER BY id ASC"). + WithArgs("tenant-a"). + WillReturnRows(webhookRow(13, "https://tenant.example/other", 1, now)) + mock.ExpectExec("UPDATE jetmon_webhooks SET"). + WithArgs(0, int64(12), "tenant-a"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery("WHERE id = \\? AND owner_tenant_id = \\?"). + WithArgs(int64(12), "tenant-a"). + WillReturnRows(sqlmock.NewRows(webhookColumns).AddRow( + int64(12), "https://tenant.example/hook", uint8(0), "tenant-a", + `["event.opened"]`, `{}`, `{}`, "_XYZ", "ops", now, now, + )) + mock.ExpectExec("DELETE FROM jetmon_webhooks WHERE id = \\? AND owner_tenant_id = \\?"). + WithArgs(int64(12), "tenant-a"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + hook, err := GetForTenant(context.Background(), db, 12, "tenant-a") + if err != nil { + t.Fatalf("GetForTenant: %v", err) + } + if hook.OwnerTenantID == nil || *hook.OwnerTenantID != "tenant-a" { + t.Fatalf("hook.OwnerTenantID = %v, want tenant-a", hook.OwnerTenantID) + } + hooks, err := ListForTenant(context.Background(), db, "tenant-a") + if err != nil { + t.Fatalf("ListForTenant: %v", err) + } + if len(hooks) != 1 || hooks[0].ID != 13 { + t.Fatalf("hooks = %+v", hooks) + } + hook, err = UpdateForTenant(context.Background(), db, 12, "tenant-a", UpdateInput{Active: &active}) + if err != nil { + t.Fatalf("UpdateForTenant: %v", err) + } + if hook.Active { + t.Fatalf("hook.Active = true, want false") + } + if err := DeleteForTenant(context.Background(), db, 12, "tenant-a"); err != nil { + t.Fatalf("DeleteForTenant: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestUpdateWebhookAppliesPatchAndFetchesRecord(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + url := "https://consumer.example/new" + active := false + events := []string{EventClosed} + siteFilter := SiteFilter{SiteIDs: []int64{7}} + stateFilter := StateFilter{States: []string{"Up"}} + now := time.Now().UTC() + + mock.ExpectExec("UPDATE jetmon_webhooks SET"). + WithArgs(url, 0, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), int64(5)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery("SELECT id, url, active, owner_tenant_id, events"). + WithArgs(int64(5)). 
+ WillReturnRows(sqlmock.NewRows(webhookColumns).AddRow( + int64(5), url, uint8(0), nil, `["event.closed"]`, + `{"site_ids":[7]}`, `{"states":["Up"]}`, "_NEW", "ops", now, now, + )) + + hook, err := Update(context.Background(), db, 5, UpdateInput{ + URL: &url, + Active: &active, + Events: &events, + SiteFilter: &siteFilter, + StateFilter: &stateFilter, + }) + if err != nil { + t.Fatalf("Update: %v", err) + } + if hook.Active || hook.Events[0] != EventClosed || hook.SiteFilter.SiteIDs[0] != 7 { + t.Fatalf("hook = %+v", hook) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestDeleteWebhookReportsMissingRows(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectExec("DELETE FROM jetmon_webhooks"). + WithArgs(int64(10)). + WillReturnResult(sqlmock.NewResult(0, 0)) + + if err := Delete(context.Background(), db, 10); !errors.Is(err, ErrWebhookNotFound) { + t.Fatalf("Delete error = %v, want ErrWebhookNotFound", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestRotateSecretUpdatesStoredSecret(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + mock.ExpectExec("UPDATE jetmon_webhooks SET secret"). + WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), int64(8)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery("SELECT id, url, active, owner_tenant_id, events"). + WithArgs(int64(8)). + WillReturnRows(webhookRow(8, "https://consumer.example/hook", 1, now)) + + raw, hook, err := RotateSecret(context.Background(), db, 8) + if err != nil { + t.Fatalf("RotateSecret: %v", err) + } + if !strings.HasPrefix(raw, SecretPrefix) || hook.ID != 8 { + t.Fatalf("RotateSecret returned raw=%q hook=%+v", raw, hook) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestLoadSecret(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectQuery("SELECT secret FROM jetmon_webhooks"). + WithArgs(int64(4)). + WillReturnRows(sqlmock.NewRows([]string{"secret"}).AddRow("whsec_secret")) + + secret, err := LoadSecret(context.Background(), db, 4) + if err != nil { + t.Fatalf("LoadSecret: %v", err) + } + if secret != "whsec_secret" { + t.Fatalf("secret = %q", secret) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +var webhookDeliveryColumns = []string{ + "id", "webhook_id", "transition_id", "event_id", "event_type", + "payload", "status", "attempt", "next_attempt_at", "last_status_code", "last_response", + "last_attempt_at", "delivered_at", "created_at", +} + +func webhookDeliveryRow(id int64, status Status, now time.Time) *sqlmock.Rows { + return sqlmock.NewRows(webhookDeliveryColumns).AddRow( + id, int64(20), int64(30), int64(40), EventOpened, + []byte(`{"ok":true}`), string(status), 2, now, 503, "down", now, nil, now, + ) +} + +func TestEnqueueWebhookDeliveryReturnsInsertedIDAndDuplicateZero(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + payload := json.RawMessage(`{"type":"event.opened"}`) + mock.ExpectExec("INSERT IGNORE INTO jetmon_webhook_deliveries"). 
+ WithArgs(int64(1), int64(2), int64(3), EventOpened, []byte(payload)). + WillReturnResult(sqlmock.NewResult(9, 1)) + mock.ExpectExec("INSERT IGNORE INTO jetmon_webhook_deliveries"). + WithArgs(int64(1), int64(2), int64(3), EventOpened, []byte(payload)). + WillReturnResult(sqlmock.NewResult(0, 0)) + + id, err := Enqueue(context.Background(), db, EnqueueInput{ + WebhookID: 1, TransitionID: 2, EventID: 3, EventType: EventOpened, Payload: payload, + }) + if err != nil || id != 9 { + t.Fatalf("Enqueue inserted = (%d, %v), want (9, nil)", id, err) + } + id, err = Enqueue(context.Background(), db, EnqueueInput{ + WebhookID: 1, TransitionID: 2, EventID: 3, EventType: EventOpened, Payload: payload, + }) + if err != nil || id != 0 { + t.Fatalf("Enqueue duplicate = (%d, %v), want (0, nil)", id, err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestWebhookDeliveryStateUpdates(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + next := time.Now().UTC().Add(time.Minute) + mock.ExpectExec("UPDATE jetmon_webhook_deliveries"). + WithArgs(204, "ok", int64(1)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetmon_webhook_deliveries"). + WithArgs(503, "retry", next, int64(2)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetmon_webhook_deliveries"). + WithArgs(410, "gone", int64(3)). + WillReturnResult(sqlmock.NewResult(0, 1)) + + if err := MarkDelivered(context.Background(), db, 1, 204, "ok"); err != nil { + t.Fatalf("MarkDelivered: %v", err) + } + if err := ScheduleRetry(context.Background(), db, 2, 503, "retry", next, false); err != nil { + t.Fatalf("ScheduleRetry retry: %v", err) + } + if err := ScheduleRetry(context.Background(), db, 3, 410, "gone", next, true); err != nil { + t.Fatalf("ScheduleRetry abandon: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestGetListAndRetryWebhookDeliveries(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + now := time.Now().UTC() + mock.ExpectQuery("SELECT id, webhook_id, transition_id"). + WithArgs(int64(1)). + WillReturnRows(webhookDeliveryRow(1, StatusAbandoned, now)) + mock.ExpectQuery("SELECT id, webhook_id, transition_id"). + WithArgs(int64(20), string(StatusAbandoned), int64(50), 10). + WillReturnRows(webhookDeliveryRow(2, StatusAbandoned, now)) + mock.ExpectExec("UPDATE jetmon_webhook_deliveries"). + WithArgs(int64(2)). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + + d, err := GetDelivery(context.Background(), db, 1) + if err != nil { + t.Fatalf("GetDelivery: %v", err) + } + if d.LastStatusCode == nil || *d.LastStatusCode != 503 || d.LastResponse == nil || *d.LastResponse != "down" { + t.Fatalf("delivery did not scan nullable fields: %+v", d) + } + list, err := ListDeliveries(context.Background(), db, 20, StatusAbandoned, 50, 10) + if err != nil { + t.Fatalf("ListDeliveries: %v", err) + } + if len(list) != 1 || list[0].ID != 2 { + t.Fatalf("deliveries = %+v", list) + } + if err := RetryDelivery(context.Background(), db, 2); err != nil { + t.Fatalf("RetryDelivery: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} diff --git a/internal/webhooks/webhooks.go b/internal/webhooks/webhooks.go new file mode 100644 index 00000000..d2b79f5d --- /dev/null +++ b/internal/webhooks/webhooks.go @@ -0,0 +1,611 @@ +// Package webhooks manages outbound webhook subscriptions and HMAC-signed +// deliveries. Sole writer for jetmon_webhooks and jetmon_webhook_deliveries. +// +// A webhook is a registration that says "POST to this URL when matching +// events fire." A delivery is one webhook firing — created when an event +// transition matches the webhook's filters, then dispatched by the +// background delivery worker. +// +// See API.md "Family 4" for the public design and ROADMAP.md for deferred +// items (site.state_changed events, grace-period secret rotation). +package webhooks + +import ( + "context" + "crypto/hmac" + "crypto/rand" + "crypto/sha256" + "database/sql" + "encoding/base32" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "strconv" + "time" +) + +// Storage note: the raw secret is stored in plaintext in jetmon_webhooks. +// Webhooks are outbound-only — the server signs every delivery, so the HMAC +// key has to be available in plaintext at signing time. Hashing the secret +// at rest (the API-key pattern) would make signing impossible. Encryption +// at rest with a master key is on ROADMAP.md as a future hardening step. + +// Status enumerates the lifecycle states of a delivery row. +type Status string + +const ( + StatusPending Status = "pending" + StatusDelivered Status = "delivered" + StatusFailed Status = "failed" + StatusAbandoned Status = "abandoned" +) + +// Webhook event type strings — what consumers see in the X-Jetmon-Event +// header and the events filter array. Stable identifiers; new types are +// added (never renamed) so existing webhook configs don't break. +const ( + EventOpened = "event.opened" + EventSeverityChanged = "event.severity_changed" + EventStateChanged = "event.state_changed" + EventCauseLinked = "event.cause_linked" + EventCauseUnlinked = "event.cause_unlinked" + EventClosed = "event.closed" +) + +// AllEventTypes returns the canonical set of webhook event types. Used by +// validators (a webhook's events filter must use values from this set) and +// by docs/listings. +func AllEventTypes() []string { + return []string{ + EventOpened, + EventSeverityChanged, + EventStateChanged, + EventCauseLinked, + EventCauseUnlinked, + EventClosed, + } +} + +// SecretPrefix is the leak-detection hint on every raw secret. Stripe +// convention: a string that starts with this is unmistakably a webhook +// signing secret if it shows up in logs or git diffs. +const SecretPrefix = "whsec_" + +// Sentinel errors returned by package functions. 
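+// Both are meant to be matched with errors.Is rather than by string
+// comparison; validateEvents wraps ErrInvalidEvent with %w, so wrapping is
+// preserved. A caller-side sketch (the 404 handling is illustrative, not
+// part of this package):
+//
+//	if _, err := webhooks.Get(ctx, db, id); errors.Is(err, webhooks.ErrWebhookNotFound) {
+//		// respond with 404 instead of 500
+//	}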
+var (
+	ErrWebhookNotFound = errors.New("webhooks: webhook not found")
+	ErrInvalidEvent    = errors.New("webhooks: unknown event type")
+)
+
+// Webhook is the in-memory shape of a jetmon_webhooks row. The raw secret
+// is never carried on this struct; only SecretPreview is kept for display,
+// while the full signing secret stays in the DB row (see the Storage note
+// above) and is read by the delivery worker via LoadSecret.
+type Webhook struct {
+	ID            int64
+	URL           string
+	Active        bool
+	OwnerTenantID *string
+	Events        []string    // empty slice = match all
+	SiteFilter    SiteFilter  // empty = match all
+	StateFilter   StateFilter // empty = match all
+	SecretPreview string      // last 4 chars of the raw secret, for display
+	CreatedBy     string
+	CreatedAt     time.Time
+	UpdatedAt     time.Time
+}
+
+// SiteFilter restricts deliveries to a fixed list of sites. Empty SiteIDs
+// (or a nil filter) means "match all sites."
+type SiteFilter struct {
+	SiteIDs []int64 `json:"site_ids,omitempty"`
+}
+
+// StateFilter restricts deliveries to events with one of the given states.
+// Empty States means "match all states."
+type StateFilter struct {
+	States []string `json:"states,omitempty"`
+}
+
+// Matches reports whether the filter set as a whole accepts a given
+// (event_type, site_id, state) combination. Filters AND together; empty
+// dimensions are unrestricted.
+func (w *Webhook) Matches(eventType string, siteID int64, state string) bool {
+	if !w.Active {
+		return false
+	}
+	if len(w.Events) > 0 && !contains(w.Events, eventType) {
+		return false
+	}
+	if len(w.SiteFilter.SiteIDs) > 0 && !containsInt64(w.SiteFilter.SiteIDs, siteID) {
+		return false
+	}
+	if len(w.StateFilter.States) > 0 && !contains(w.StateFilter.States, state) {
+		return false
+	}
+	return true
+}
+
+// CreateInput is the data needed to insert a new webhook. URL is required;
+// everything else has sensible defaults (Active=true, all filters empty =
+// match-all).
+type CreateInput struct {
+	URL           string
+	Active        *bool // nil → true
+	OwnerTenantID *string
+	Events        []string
+	SiteFilter    SiteFilter
+	StateFilter   StateFilter
+	CreatedBy     string
+}
+
+// UpdateInput is a sparse patch. nil fields are unchanged. Empty slices
+// (vs. nil slices) are meaningful: an explicit empty slice clears the
+// filter, restoring "match all" semantics.
+type UpdateInput struct {
+	URL         *string
+	Active      *bool
+	Events      *[]string
+	SiteFilter  *SiteFilter
+	StateFilter *StateFilter
+}
+
+// Create inserts a webhook and returns the one-time raw secret plus the
+// persisted record. The raw secret is also stored in the DB (see Storage
+// note above) so the delivery worker can sign with it.
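+//
+// Minimal usage sketch (the surrounding handler is illustrative, not part
+// of this package):
+//
+//	raw, hook, err := webhooks.Create(ctx, db, webhooks.CreateInput{
+//		URL:    "https://consumer.example/hook",
+//		Events: []string{webhooks.EventOpened},
+//	})
+//	// Show raw to the caller exactly once; later reads expose only
+//	// hook.SecretPreview.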
+func Create(ctx context.Context, db *sql.DB, in CreateInput) (rawSecret string, w *Webhook, err error) { + if in.URL == "" { + return "", nil, errors.New("webhooks: URL is required") + } + if err := validateEvents(in.Events); err != nil { + return "", nil, err + } + active := true + if in.Active != nil { + active = *in.Active + } + + rawSecret, err = GenerateSecret() + if err != nil { + return "", nil, err + } + preview := previewOf(rawSecret) + + eventsJSON, _ := json.Marshal(in.Events) + siteFilterJSON, _ := json.Marshal(in.SiteFilter) + stateFilterJSON, _ := json.Marshal(in.StateFilter) + + res, err := db.ExecContext(ctx, ` + INSERT INTO jetmon_webhooks + (url, active, owner_tenant_id, events, site_filter, state_filter, + secret, secret_preview, created_by) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + in.URL, boolToTinyint(active), nullableString(in.OwnerTenantID), eventsJSON, siteFilterJSON, stateFilterJSON, + rawSecret, preview, in.CreatedBy, + ) + if err != nil { + return "", nil, fmt.Errorf("webhooks: insert: %w", err) + } + id, err := res.LastInsertId() + if err != nil { + return "", nil, fmt.Errorf("webhooks: last insert id: %w", err) + } + + w, err = Get(ctx, db, id) + if err != nil { + return "", nil, err + } + return rawSecret, w, nil +} + +// Get returns a single webhook by id, or ErrWebhookNotFound. +func Get(ctx context.Context, db *sql.DB, id int64) (*Webhook, error) { + return get(ctx, db, id, "") +} + +// GetForTenant returns a single webhook owned by ownerTenantID. It hides +// cross-tenant rows behind ErrWebhookNotFound so future public callers don't +// learn whether another tenant's webhook exists. +func GetForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (*Webhook, error) { + if ownerTenantID == "" { + return nil, errors.New("webhooks: owner tenant id is required") + } + return get(ctx, db, id, ownerTenantID) +} + +func get(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (*Webhook, error) { + q := selectWebhookSQL + " WHERE id = ?" + args := []any{id} + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + row := db.QueryRowContext(ctx, q, args...) + w, err := scanWebhookRow(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrWebhookNotFound + } + return nil, err + } + return w, nil +} + +// List returns all webhooks ordered by id ASC. Webhook count is bounded by +// the number of registered consumers; we don't paginate today. If a future +// deployment grows past hundreds of webhooks, add cursor pagination here. +func List(ctx context.Context, db *sql.DB) ([]Webhook, error) { + return list(ctx, db, "") +} + +// ListForTenant returns only webhooks owned by ownerTenantID. +func ListForTenant(ctx context.Context, db *sql.DB, ownerTenantID string) ([]Webhook, error) { + if ownerTenantID == "" { + return nil, errors.New("webhooks: owner tenant id is required") + } + return list(ctx, db, ownerTenantID) +} + +func list(ctx context.Context, db *sql.DB, ownerTenantID string) ([]Webhook, error) { + q := selectWebhookSQL + args := []any{} + if ownerTenantID != "" { + q += " WHERE owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + q += " ORDER BY id ASC" + rows, err := db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, fmt.Errorf("webhooks: list: %w", err) + } + defer rows.Close() + var out []Webhook + for rows.Next() { + w, err := scanWebhookRow(rows) + if err != nil { + return nil, err + } + out = append(out, *w) + } + return out, rows.Err() +} + +// ListActive returns only webhooks with active=1. Used by the delivery +// dispatcher; inactive webhooks don't get matched against new transitions. +func ListActive(ctx context.Context, db *sql.DB) ([]Webhook, error) { + rows, err := db.QueryContext(ctx, selectWebhookSQL+" WHERE active = 1 ORDER BY id ASC") + if err != nil { + return nil, fmt.Errorf("webhooks: list active: %w", err) + } + defer rows.Close() + var out []Webhook + for rows.Next() { + w, err := scanWebhookRow(rows) + if err != nil { + return nil, err + } + out = append(out, *w) + } + return out, rows.Err() +} + +// Update applies a partial patch and returns the updated webhook. Fields +// left nil in UpdateInput are unchanged; an explicitly empty slice clears +// the corresponding filter to "match all" semantics. +func Update(ctx context.Context, db *sql.DB, id int64, in UpdateInput) (*Webhook, error) { + return update(ctx, db, id, "", in) +} + +// UpdateForTenant updates a webhook only when it is owned by ownerTenantID. +func UpdateForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string, in UpdateInput) (*Webhook, error) { + if ownerTenantID == "" { + return nil, errors.New("webhooks: owner tenant id is required") + } + return update(ctx, db, id, ownerTenantID, in) +} + +func update(ctx context.Context, db *sql.DB, id int64, ownerTenantID string, in UpdateInput) (*Webhook, error) { + if in.Events != nil { + if err := validateEvents(*in.Events); err != nil { + return nil, err + } + } + + clauses := []string{} + args := []any{} + if in.URL != nil { + clauses = append(clauses, "url = ?") + args = append(args, *in.URL) + } + if in.Active != nil { + clauses = append(clauses, "active = ?") + args = append(args, boolToTinyint(*in.Active)) + } + if in.Events != nil { + b, _ := json.Marshal(*in.Events) + clauses = append(clauses, "events = ?") + args = append(args, b) + } + if in.SiteFilter != nil { + b, _ := json.Marshal(*in.SiteFilter) + clauses = append(clauses, "site_filter = ?") + args = append(args, b) + } + if in.StateFilter != nil { + b, _ := json.Marshal(*in.StateFilter) + clauses = append(clauses, "state_filter = ?") + args = append(args, b) + } + + if len(clauses) == 0 { + // No-op patch — return current state. + return get(ctx, db, id, ownerTenantID) + } + + args = append(args, id) + q := "UPDATE jetmon_webhooks SET " + for i, c := range clauses { + if i > 0 { + q += ", " + } + q += c + } + q += " WHERE id = ?" + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + if _, err := db.ExecContext(ctx, q, args...); err != nil { + return nil, fmt.Errorf("webhooks: update: %w", err) + } + return get(ctx, db, id, ownerTenantID) +} + +// Delete removes a webhook from jetmon_webhooks. Existing rows in +// jetmon_webhook_deliveries are intentionally NOT cascaded — they remain +// for audit and manual retry. The dispatcher won't create new deliveries +// for a deleted webhook because ListActive filters it out. +func Delete(ctx context.Context, db *sql.DB, id int64) error { + return deleteWebhook(ctx, db, id, "") +} + +// DeleteForTenant removes a webhook only when it is owned by ownerTenantID. 
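+// As with GetForTenant, a webhook owned by another tenant surfaces as
+// ErrWebhookNotFound rather than a permissions error, so callers cannot
+// probe for other tenants' webhook ids.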
+func DeleteForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) error { + if ownerTenantID == "" { + return errors.New("webhooks: owner tenant id is required") + } + return deleteWebhook(ctx, db, id, ownerTenantID) +} + +func deleteWebhook(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) error { + q := "DELETE FROM jetmon_webhooks WHERE id = ?" + args := []any{id} + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + res, err := db.ExecContext(ctx, q, args...) + if err != nil { + return fmt.Errorf("webhooks: delete: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return ErrWebhookNotFound + } + return nil +} + +// RotateSecret generates a new secret, replaces the stored value, and +// returns the new raw secret (one-time view in API responses). The old +// secret stops working immediately — see API.md "Signing and secret +// rotation" for why this is the v1 behavior and how grace-period rotation +// will be added later. +func RotateSecret(ctx context.Context, db *sql.DB, id int64) (string, *Webhook, error) { + return rotateSecret(ctx, db, id, "") +} + +// RotateSecretForTenant rotates a webhook secret only when it is owned by +// ownerTenantID. +func RotateSecretForTenant(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (string, *Webhook, error) { + if ownerTenantID == "" { + return "", nil, errors.New("webhooks: owner tenant id is required") + } + return rotateSecret(ctx, db, id, ownerTenantID) +} + +func rotateSecret(ctx context.Context, db *sql.DB, id int64, ownerTenantID string) (string, *Webhook, error) { + rawSecret, err := GenerateSecret() + if err != nil { + return "", nil, err + } + preview := previewOf(rawSecret) + q := `UPDATE jetmon_webhooks SET secret = ?, secret_preview = ? WHERE id = ?` + args := []any{rawSecret, preview, id} + if ownerTenantID != "" { + q += " AND owner_tenant_id = ?" + args = append(args, ownerTenantID) + } + res, err := db.ExecContext(ctx, + q, args...) + if err != nil { + return "", nil, fmt.Errorf("webhooks: rotate-secret: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return "", nil, ErrWebhookNotFound + } + w, err := get(ctx, db, id, ownerTenantID) + if err != nil { + return "", nil, err + } + return rawSecret, w, nil +} + +// LoadSecret returns the raw signing secret for a webhook. Used by the +// delivery worker only — every public-facing handler returns SecretPreview +// instead. Kept as a separate function (not a field on Webhook) so the +// raw value can't leak through serialization of the Webhook struct. +func LoadSecret(ctx context.Context, db *sql.DB, id int64) (string, error) { + var s string + err := db.QueryRowContext(ctx, + `SELECT secret FROM jetmon_webhooks WHERE id = ?`, id, + ).Scan(&s) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return "", ErrWebhookNotFound + } + return "", fmt.Errorf("webhooks: load secret: %w", err) + } + return s, nil +} + +// GenerateSecret returns a fresh raw secret. 32 random bytes encoded as +// base32 with the "whsec_" prefix. Same shape as apikeys — high-entropy +// random; the leak-detection prefix is the only thing that distinguishes +// it from a generic random string. 
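+//
+// Illustrative shape only (this value is made up, not a real secret):
+//
+//	whsec_GEZDGNBVGY3TQOJQGEZDGNBVGY3TQOJQGEZDGNBVGY3TQOJQGEZD
+//
+// i.e. the 6-character prefix plus 52 base32 characters, 58 in total.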
+func GenerateSecret() (string, error) {
+	var buf [32]byte
+	if _, err := rand.Read(buf[:]); err != nil {
+		return "", fmt.Errorf("webhooks: read entropy: %w", err)
+	}
+	encoded := base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(buf[:])
+	return SecretPrefix + encoded, nil
+}
+
+// Sign produces the X-Jetmon-Signature header value for a delivery.
+// Format: "t=<unix timestamp>,v1=<hex HMAC-SHA256>" — see API.md.
+//
+// The timestamp is part of the signature input so consumers can reject
+// stale (replayed) deliveries by checking the t= value against their
+// own clock and refusing anything older than ~5 minutes.
+func Sign(timestamp time.Time, body []byte, secret string) string {
+	ts := strconv.FormatInt(timestamp.Unix(), 10)
+	mac := hmac.New(sha256.New, []byte(secret))
+	mac.Write([]byte(ts))
+	mac.Write([]byte("."))
+	mac.Write(body)
+	sig := hex.EncodeToString(mac.Sum(nil))
+	return "t=" + ts + ",v1=" + sig
+}
+
+// EventTypeForReason maps a jetmon_event_transitions.reason value to the
+// webhook event type that should fire. Returns "" if the reason should
+// produce no webhook (reasons that are stored as transitions but not
+// surfaced as webhook events in v1; anything not listed below falls
+// through to the default case).
+//
+// The mapping is fixed in code — adding new transition reasons requires
+// extending this function so consumers see the right webhook event type.
+func EventTypeForReason(reason string) string {
+	switch reason {
+	case "opened":
+		return EventOpened
+	case "severity_escalation", "severity_deescalation":
+		return EventSeverityChanged
+	case "state_change", "verifier_confirmed":
+		return EventStateChanged
+	case "cause_linked":
+		return EventCauseLinked
+	case "cause_unlinked":
+		return EventCauseUnlinked
+	case "verifier_cleared", "probe_cleared", "false_alarm",
+		"manual_override", "maintenance_swallowed", "superseded", "auto_timeout":
+		return EventClosed
+	default:
+		return ""
+	}
+}
+
+// validateEvents rejects an events list that includes an unknown event
+// type. Empty list is fine — that's the "match all" sentinel.
+func validateEvents(events []string) error {
+	all := AllEventTypes()
+	for _, e := range events {
+		if !contains(all, e) {
+			return fmt.Errorf("%w: %q (allowed: %v)", ErrInvalidEvent, e, all)
+		}
+	}
+	return nil
+}
+
+// previewOf returns the last 4 characters of a raw secret for display.
+// Short enough to fit on a one-line listing; long enough to disambiguate
+// among a handful of webhooks.
+func previewOf(s string) string {
+	if len(s) <= 4 {
+		return s
+	}
+	return s[len(s)-4:]
+}
+
+// selectWebhookSQL is shared by Get / List / ListActive so the column
+// order matches scanWebhookRow.
+const selectWebhookSQL = ` + SELECT id, url, active, owner_tenant_id, events, site_filter, state_filter, + secret_preview, created_by, created_at, updated_at + FROM jetmon_webhooks` + +type rowScanner interface { + Scan(...any) error +} + +func scanWebhookRow(s rowScanner) (*Webhook, error) { + var ( + w Webhook + active uint8 + ownerTenantID sql.NullString + eventsJSON sql.NullString + siteFilterJSON sql.NullString + stateFilterJSON sql.NullString + ) + if err := s.Scan( + &w.ID, &w.URL, &active, &ownerTenantID, &eventsJSON, &siteFilterJSON, &stateFilterJSON, + &w.SecretPreview, &w.CreatedBy, &w.CreatedAt, &w.UpdatedAt, + ); err != nil { + return nil, err + } + w.Active = active == 1 + if ownerTenantID.Valid { + w.OwnerTenantID = &ownerTenantID.String + } + if eventsJSON.Valid && eventsJSON.String != "" { + _ = json.Unmarshal([]byte(eventsJSON.String), &w.Events) + } + if siteFilterJSON.Valid && siteFilterJSON.String != "" { + _ = json.Unmarshal([]byte(siteFilterJSON.String), &w.SiteFilter) + } + if stateFilterJSON.Valid && stateFilterJSON.String != "" { + _ = json.Unmarshal([]byte(stateFilterJSON.String), &w.StateFilter) + } + return &w, nil +} + +func boolToTinyint(b bool) int { + if b { + return 1 + } + return 0 +} + +func nullableString(s *string) any { + if s == nil { + return nil + } + return *s +} + +func contains(haystack []string, needle string) bool { + for _, s := range haystack { + if s == needle { + return true + } + } + return false +} + +func containsInt64(haystack []int64, needle int64) bool { + for _, v := range haystack { + if v == needle { + return true + } + } + return false +} diff --git a/internal/webhooks/webhooks_test.go b/internal/webhooks/webhooks_test.go new file mode 100644 index 00000000..05e5e873 --- /dev/null +++ b/internal/webhooks/webhooks_test.go @@ -0,0 +1,238 @@ +package webhooks + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/hex" + "strconv" + "strings" + "testing" + "time" +) + +func TestGenerateSecretShape(t *testing.T) { + raw, err := GenerateSecret() + if err != nil { + t.Fatalf("GenerateSecret: %v", err) + } + if !strings.HasPrefix(raw, SecretPrefix) { + t.Fatalf("missing prefix: %q", raw) + } + // 32 random bytes → 52 base32 chars (no padding) + len(SecretPrefix). + if len(raw) != len(SecretPrefix)+52 { + t.Errorf("raw length = %d, want %d", len(raw), len(SecretPrefix)+52) + } +} + +func TestGenerateSecretUnique(t *testing.T) { + a, _ := GenerateSecret() + b, _ := GenerateSecret() + if a == b { + t.Fatal("two generated secrets collided") + } +} + +func TestSignDeterministicWithSameInputs(t *testing.T) { + ts := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) + body := []byte(`{"event":"event.opened","id":42}`) + a := Sign(ts, body, "whsec_TESTSECRET") + b := Sign(ts, body, "whsec_TESTSECRET") + if a != b { + t.Errorf("Sign should be deterministic; got %q vs %q", a, b) + } +} + +func TestSignFormat(t *testing.T) { + ts := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) + body := []byte(`{"hello":"world"}`) + secret := "whsec_TESTSECRET" + got := Sign(ts, body, secret) + if !strings.HasPrefix(got, "t=") { + t.Errorf("signature = %q, want prefix t=", got) + } + if !strings.Contains(got, ",v1=") { + t.Errorf("signature = %q, want ,v1=", got) + } + // Compute the expected signature independently — same algorithm but with + // the timestamp pulled from ts so the test stays correct under any clock. 
+ tsStr := strconv.FormatInt(ts.Unix(), 10) + mac := hmac.New(sha256.New, []byte(secret)) + mac.Write([]byte(tsStr)) + mac.Write([]byte(".")) + mac.Write(body) + expected := "t=" + tsStr + ",v1=" + hex.EncodeToString(mac.Sum(nil)) + if got != expected { + t.Errorf("Sign computed unexpectedly\n got: %s\nwant: %s", got, expected) + } +} + +func TestSignDiffersOnTimestamp(t *testing.T) { + t1 := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) + t2 := t1.Add(1 * time.Second) + body := []byte(`{}`) + a := Sign(t1, body, "whsec_x") + b := Sign(t2, body, "whsec_x") + if a == b { + t.Errorf("signature should change with timestamp; both = %q", a) + } +} + +func TestSignDiffersOnSecret(t *testing.T) { + ts := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC) + body := []byte(`{}`) + if Sign(ts, body, "whsec_a") == Sign(ts, body, "whsec_b") { + t.Error("signature should differ between secrets") + } +} + +func TestEventTypeForReason(t *testing.T) { + cases := map[string]string{ + "opened": EventOpened, + "severity_escalation": EventSeverityChanged, + "severity_deescalation": EventSeverityChanged, + "state_change": EventStateChanged, + "verifier_confirmed": EventStateChanged, + "cause_linked": EventCauseLinked, + "cause_unlinked": EventCauseUnlinked, + "verifier_cleared": EventClosed, + "probe_cleared": EventClosed, + "false_alarm": EventClosed, + "manual_override": EventClosed, + "maintenance_swallowed": EventClosed, + "superseded": EventClosed, + "auto_timeout": EventClosed, + "unknown_reason": "", + "": "", + } + for reason, want := range cases { + got := EventTypeForReason(reason) + if got != want { + t.Errorf("EventTypeForReason(%q) = %q, want %q", reason, got, want) + } + } +} + +func TestWebhookMatchesAllFiltersEmpty(t *testing.T) { + // No filters set — webhook should match everything. + w := &Webhook{Active: true} + if !w.Matches(EventOpened, 12345, "Down") { + t.Error("empty filters should match all events") + } + if !w.Matches(EventClosed, 99999, "Up") { + t.Error("empty filters should match unrelated event/state") + } +} + +func TestWebhookMatchesInactive(t *testing.T) { + w := &Webhook{Active: false} + if w.Matches(EventOpened, 1, "Down") { + t.Error("inactive webhook should never match") + } +} + +func TestWebhookMatchesEventFilter(t *testing.T) { + w := &Webhook{ + Active: true, + Events: []string{EventOpened, EventClosed}, + } + if !w.Matches(EventOpened, 1, "Down") { + t.Error("event in filter should match") + } + if w.Matches(EventSeverityChanged, 1, "Down") { + t.Error("event not in filter should not match") + } +} + +func TestWebhookMatchesSiteFilter(t *testing.T) { + w := &Webhook{ + Active: true, + SiteFilter: SiteFilter{SiteIDs: []int64{101, 102}}, + } + if !w.Matches(EventOpened, 101, "Down") { + t.Error("site in filter should match") + } + if w.Matches(EventOpened, 999, "Down") { + t.Error("site not in filter should not match") + } +} + +func TestWebhookMatchesStateFilter(t *testing.T) { + w := &Webhook{ + Active: true, + StateFilter: StateFilter{States: []string{"Down", "Seems Down"}}, + } + if !w.Matches(EventOpened, 1, "Down") { + t.Error("state in filter should match") + } + if w.Matches(EventOpened, 1, "Warning") { + t.Error("state not in filter should not match") + } +} + +func TestWebhookMatchesAllDimensions(t *testing.T) { + // All three filters set — must AND across dimensions. 
+ w := &Webhook{ + Active: true, + Events: []string{EventOpened}, + SiteFilter: SiteFilter{SiteIDs: []int64{42}}, + StateFilter: StateFilter{States: []string{"Down"}}, + } + if !w.Matches(EventOpened, 42, "Down") { + t.Error("all three dimensions match → should fire") + } + if w.Matches(EventClosed, 42, "Down") { + t.Error("event mismatch → should not fire (AND semantics)") + } + if w.Matches(EventOpened, 99, "Down") { + t.Error("site mismatch → should not fire (AND semantics)") + } + if w.Matches(EventOpened, 42, "Up") { + t.Error("state mismatch → should not fire (AND semantics)") + } +} + +func TestPreviewOf(t *testing.T) { + if got := previewOf("whsec_LONG_SECRET_VALUE_XYZ"); got != "_XYZ" { + t.Errorf("previewOf long = %q, want _XYZ", got) + } + if got := previewOf("ab"); got != "ab" { + t.Errorf("previewOf short = %q, want ab", got) + } +} + +func TestValidateEventsRejectsUnknown(t *testing.T) { + if err := validateEvents([]string{EventOpened, "event.bogus"}); err == nil { + t.Error("unknown event type should be rejected") + } + if err := validateEvents([]string{EventOpened, EventClosed}); err != nil { + t.Errorf("known events rejected: %v", err) + } + if err := validateEvents(nil); err != nil { + t.Errorf("empty events list rejected: %v", err) + } +} + +func TestAllEventTypesIsCanonical(t *testing.T) { + all := AllEventTypes() + expected := []string{ + EventOpened, EventSeverityChanged, EventStateChanged, + EventCauseLinked, EventCauseUnlinked, EventClosed, + } + if len(all) != len(expected) { + t.Fatalf("AllEventTypes() len = %d, want %d", len(all), len(expected)) + } + for i, e := range expected { + if all[i] != e { + t.Errorf("AllEventTypes()[%d] = %q, want %q", i, all[i], e) + } + } +} + +func TestTruncate(t *testing.T) { + if got := truncate("hello", 10); got != "hello" { + t.Errorf("truncate(short) = %q", got) + } + if got := truncate("hello world", 5); got != "hello" { + t.Errorf("truncate(long) = %q", got) + } +} diff --git a/internal/webhooks/worker.go b/internal/webhooks/worker.go new file mode 100644 index 00000000..57e01ebb --- /dev/null +++ b/internal/webhooks/worker.go @@ -0,0 +1,464 @@ +package webhooks + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net" + "net/http" + "strconv" + "sync" + "time" +) + +// retrySchedule maps the *next* attempt number to its delay from the +// previous attempt. attempt 1 is the initial enqueue (immediate); attempts +// 2–6 retry at the documented intervals. +// +// After attempt 6 fails, the delivery is abandoned. Total elapsed time +// from first attempt to abandonment: ~7h36m. See API.md for rationale. +var retrySchedule = []time.Duration{ + 0, // attempt 1 — initial enqueue, no retry delay + 1 * time.Minute, // attempt 2 + 5 * time.Minute, // attempt 3 + 30 * time.Minute, // attempt 4 + 1 * time.Hour, // attempt 5 + 6 * time.Hour, // attempt 6 +} + +// maxAttempts is the highest attempt number we'll try. After attempt 6 +// fails, the row is marked abandoned. +const maxAttempts = 6 + +// nextRetryDelay returns the delay until the next attempt given the +// current attempt count (1-indexed: 1 is the first POST, 6 is the last). +// abandoned=true means there is no next attempt — the delivery should +// be marked abandoned. +func nextRetryDelay(currentAttempt int) (delay time.Duration, abandoned bool) { + next := currentAttempt + 1 + if next > maxAttempts { + return 0, true + } + return retrySchedule[next-1], false +} + +// WorkerConfig configures the delivery worker. 
Defaults are sensible for +// a single jetmon2 instance; multi-instance deployments should set +// InstanceID to a unique value per instance so each tracks its own +// dispatch progress. +type WorkerConfig struct { + DB *sql.DB + InstanceID string // key into jetmon_webhook_dispatch_progress + PollInterval time.Duration // default 1s + MaxConcurrent int // shared deliverer pool size; default 50 + PerWebhookCap int // per-webhook in-flight cap; default 3 + HTTPTimeout time.Duration // per-delivery HTTP timeout; default 30s + BatchSize int // dispatcher's transition fetch + deliverer's claim batch; default 200 +} + +func (c *WorkerConfig) applyDefaults() { + if c.PollInterval == 0 { + c.PollInterval = 1 * time.Second + } + if c.MaxConcurrent == 0 { + c.MaxConcurrent = 50 + } + if c.PerWebhookCap == 0 { + c.PerWebhookCap = 3 + } + if c.HTTPTimeout == 0 { + c.HTTPTimeout = 30 * time.Second + } + if c.BatchSize == 0 { + c.BatchSize = 200 + } + if c.InstanceID == "" { + c.InstanceID = "default" + } +} + +// Worker drives webhook delivery. Two background goroutines: +// +// - dispatcher: every PollInterval, polls jetmon_event_transitions for +// new rows since last_seen, matches each against active webhooks, +// and enqueues a delivery per match. +// - deliverer: every PollInterval, claims pending deliveries whose +// next_attempt_at has passed and POSTs them with HMAC signing. +// Successes mark delivered; failures schedule retries on the +// exponential backoff schedule until attempt 6, then abandon. +// +// Both goroutines run continuously until Stop is called. Stop blocks +// until both have exited cleanly. +type Worker struct { + cfg WorkerConfig + httpClient *http.Client + + inFlightMu sync.Mutex + inFlight map[int64]int // webhook_id → current in-flight count + + stop chan struct{} + done chan struct{} +} + +// NewWorker constructs a Worker. Call Start to launch the goroutines. +func NewWorker(cfg WorkerConfig) *Worker { + cfg.applyDefaults() + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 5 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 10, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 5 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + ForceAttemptHTTP2: true, + } + return &Worker{ + cfg: cfg, + httpClient: &http.Client{Transport: transport, Timeout: cfg.HTTPTimeout}, + inFlight: make(map[int64]int), + stop: make(chan struct{}), + done: make(chan struct{}), + } +} + +// Start launches the dispatcher and deliverer goroutines. Call Stop to +// signal shutdown. Start is non-blocking. +func (w *Worker) Start() { + go w.run() +} + +// Stop signals the goroutines to exit and waits for them. +func (w *Worker) Stop() { + close(w.stop) + <-w.done +} + +func (w *Worker) run() { + defer close(w.done) + + dispatcherDone := make(chan struct{}) + delivererDone := make(chan struct{}) + + go func() { + defer close(dispatcherDone) + w.dispatchLoop() + }() + go func() { + defer close(delivererDone) + w.deliverLoop() + }() + + <-dispatcherDone + <-delivererDone +} + +// dispatchLoop is the polling loop for the dispatcher. 
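+// Each tick is self-contained: a tick-level error is logged and dropped, and
+// since the high-water mark only advances after a successful pass, the next
+// tick re-reads the same transitions.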
+func (w *Worker) dispatchLoop() { + ticker := time.NewTicker(w.cfg.PollInterval) + defer ticker.Stop() + for { + select { + case <-w.stop: + return + case <-ticker.C: + if err := w.dispatchTick(); err != nil { + log.Printf("webhooks: dispatcher tick error: %v", err) + } + } + } +} + +// dispatchTick polls jetmon_event_transitions for new rows and creates +// deliveries for each match against an active webhook. +func (w *Worker) dispatchTick() error { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + lastID, err := w.loadProgress(ctx) + if err != nil { + return fmt.Errorf("load progress: %w", err) + } + + type transitionRow struct { + id int64 + eventID int64 + blogID int64 + stateAfter sql.NullString + reason string + changedAt time.Time + } + rows, err := w.cfg.DB.QueryContext(ctx, ` + SELECT id, event_id, blog_id, state_after, reason, changed_at + FROM jetmon_event_transitions + WHERE id > ? + ORDER BY id ASC + LIMIT ?`, lastID, w.cfg.BatchSize) + if err != nil { + return fmt.Errorf("query transitions: %w", err) + } + defer rows.Close() + + var transitions []transitionRow + for rows.Next() { + var t transitionRow + if err := rows.Scan(&t.id, &t.eventID, &t.blogID, &t.stateAfter, &t.reason, &t.changedAt); err != nil { + return fmt.Errorf("scan transition: %w", err) + } + transitions = append(transitions, t) + } + if err := rows.Err(); err != nil { + return fmt.Errorf("transitions iterate: %w", err) + } + if len(transitions) == 0 { + return nil + } + + hooks, err := ListActive(ctx, w.cfg.DB) + if err != nil { + return fmt.Errorf("list active webhooks: %w", err) + } + + for _, t := range transitions { + eventType := EventTypeForReason(t.reason) + if eventType == "" { + continue + } + state := "" + if t.stateAfter.Valid { + state = t.stateAfter.String + } + for i := range hooks { + h := &hooks[i] + if !h.Matches(eventType, t.blogID, state) { + continue + } + payload, err := w.buildPayload(eventType, t.id, t.eventID, t.blogID, t.reason, state, t.changedAt) + if err != nil { + log.Printf("webhooks: build payload event_id=%d transition_id=%d: %v", + t.eventID, t.id, err) + continue + } + if _, err := Enqueue(ctx, w.cfg.DB, EnqueueInput{ + WebhookID: h.ID, + TransitionID: t.id, + EventID: t.eventID, + EventType: eventType, + Payload: payload, + }); err != nil { + log.Printf("webhooks: enqueue webhook_id=%d transition_id=%d: %v", + h.ID, t.id, err) + continue + } + } + } + + if err := w.saveProgress(ctx, transitions[len(transitions)-1].id); err != nil { + return fmt.Errorf("save progress: %w", err) + } + return nil +} + +// buildPayload returns the JSON body that the consumer receives. Frozen at +// enqueue time — see API.md "frozen-at-fire-time" contract. +// +// Shape is flat: type, occurred_at, ids, and the relevant event/transition +// fields. Consumers who want full event detail call GET /events/{id}. +func (w *Worker) buildPayload(eventType string, transitionID, eventID, blogID int64, reason, state string, occurredAt time.Time) (json.RawMessage, error) { + body := map[string]any{ + "type": eventType, + "occurred_at": occurredAt.UTC().Format(time.RFC3339Nano), + "transition_id": transitionID, + "event_id": eventID, + "site_id": blogID, + "reason": reason, + "state": state, + } + return json.Marshal(body) +} + +// loadProgress reads the last_transition_id high-water mark for this +// instance from jetmon_webhook_dispatch_progress. Returns 0 if no row +// exists yet (first tick). 
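+//
+// The progress table holds one row per worker instance. A sketch of the shape
+// these two queries rely on (illustrative; column types here are assumptions,
+// not the canonical schema):
+//
+//	CREATE TABLE jetmon_webhook_dispatch_progress (
+//		instance_id        VARCHAR(64) NOT NULL PRIMARY KEY,
+//		last_transition_id BIGINT UNSIGNED NOT NULL
+//	);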
+func (w *Worker) loadProgress(ctx context.Context) (int64, error) { + var lastID int64 + err := w.cfg.DB.QueryRowContext(ctx, + `SELECT last_transition_id FROM jetmon_webhook_dispatch_progress WHERE instance_id = ?`, + w.cfg.InstanceID, + ).Scan(&lastID) + if errors.Is(err, sql.ErrNoRows) { + return 0, nil + } + if err != nil { + return 0, err + } + return lastID, nil +} + +// saveProgress upserts the last_transition_id high-water mark for this +// instance. Multi-instance: each instance has its own row keyed on +// instance_id, so they don't trample each other's progress. +func (w *Worker) saveProgress(ctx context.Context, lastID int64) error { + _, err := w.cfg.DB.ExecContext(ctx, ` + INSERT INTO jetmon_webhook_dispatch_progress (instance_id, last_transition_id) + VALUES (?, ?) + ON DUPLICATE KEY UPDATE last_transition_id = VALUES(last_transition_id)`, + w.cfg.InstanceID, lastID) + return err +} + +// deliverLoop is the polling loop for the deliverer. It pulls ready +// deliveries from the queue and dispatches each as a goroutine, subject +// to the per-webhook in-flight cap. +func (w *Worker) deliverLoop() { + ticker := time.NewTicker(w.cfg.PollInterval) + defer ticker.Stop() + for { + select { + case <-w.stop: + return + case <-ticker.C: + if err := w.deliverTick(); err != nil { + log.Printf("webhooks: deliverer tick error: %v", err) + } + } + } +} + +func (w *Worker) deliverTick() error { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + deliveries, err := ClaimReady(ctx, w.cfg.DB, w.cfg.MaxConcurrent) + if err != nil { + return err + } + for i := range deliveries { + d := deliveries[i] + if !w.acquireSlot(d.WebhookID) { + // Per-webhook cap reached; row stays pending and we'll pick + // it up next tick. + continue + } + go func(d Delivery) { + defer w.releaseSlot(d.WebhookID) + w.deliver(d) + }(d) + } + return nil +} + +// acquireSlot tries to reserve a per-webhook in-flight slot. Returns true +// if reserved, false if the webhook is already at its cap. +func (w *Worker) acquireSlot(webhookID int64) bool { + w.inFlightMu.Lock() + defer w.inFlightMu.Unlock() + if w.inFlight[webhookID] >= w.cfg.PerWebhookCap { + return false + } + w.inFlight[webhookID]++ + return true +} + +func (w *Worker) releaseSlot(webhookID int64) { + w.inFlightMu.Lock() + defer w.inFlightMu.Unlock() + w.inFlight[webhookID]-- + if w.inFlight[webhookID] <= 0 { + delete(w.inFlight, webhookID) + } +} + +// deliver runs one POST attempt against the consumer URL. Updates the +// delivery row with success/retry/abandon based on the response. +func (w *Worker) deliver(d Delivery) { + ctx, cancel := context.WithTimeout(context.Background(), w.cfg.HTTPTimeout+5*time.Second) + defer cancel() + + // Look up the URL and signing secret from the webhook row. Either may + // be missing if the webhook was deleted between dispatch and deliver, + // in which case we abandon the row (the delivery target is gone). + hook, err := Get(ctx, w.cfg.DB, d.WebhookID) + if err != nil { + w.handleResult(ctx, d, 0, fmt.Sprintf("webhook lookup: %v", err), true) + return + } + secret, err := LoadSecret(ctx, w.cfg.DB, d.WebhookID) + if err != nil { + w.handleResult(ctx, d, 0, fmt.Sprintf("secret lookup: %v", err), true) + return + } + if !hook.Active { + // Webhook was paused between dispatch and deliver. Abandon: the + // caller doesn't want this delivery anymore. 
+		w.handleResult(ctx, d, 0, "webhook is inactive", true)
+		return
+	}
+
+	timestamp := time.Now()
+	signature := Sign(timestamp, d.Payload, secret)
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, hook.URL, bytes.NewReader(d.Payload))
+	if err != nil {
+		w.handleResult(ctx, d, 0, fmt.Sprintf("build request: %v", err), false)
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("X-Jetmon-Event", d.EventType)
+	req.Header.Set("X-Jetmon-Delivery", strconv.FormatInt(d.ID, 10))
+	req.Header.Set("X-Jetmon-Signature", signature)
+
+	resp, err := w.httpClient.Do(req)
+	if err != nil {
+		// Network-level failure: connection refused, DNS, timeout, TLS.
+		// Record the error message as last_response and schedule retry.
+		w.handleResult(ctx, d, 0, "transport: "+err.Error(), false)
+		return
+	}
+	defer resp.Body.Close()
+
+	body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
+	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
+		if err := MarkDelivered(ctx, w.cfg.DB, d.ID, resp.StatusCode, string(body)); err != nil {
+			log.Printf("webhooks: mark delivered id=%d: %v", d.ID, err)
+		}
+		return
+	}
+	// Any non-2xx is retried. Some 4xx (404, 410) might warrant immediate
+	// abandonment, but for v1 we treat all non-2xx alike — consumers
+	// occasionally return 4xx during deploys, and a single 4xx shouldn't
+	// permanently fail an otherwise-recoverable webhook.
+	w.handleResult(ctx, d, resp.StatusCode, string(body), false)
+}
+
+// handleResult writes the delivery outcome to the database. forceAbandon
+// is true for the non-retryable failures seen in deliver (webhook row
+// missing, signing secret unavailable, webhook inactive); otherwise the
+// retry schedule decides whether to retry or abandon based on the attempt
+// count.
+func (w *Worker) handleResult(ctx context.Context, d Delivery, statusCode int, responseBody string, forceAbandon bool) {
+	currentAttempt := d.Attempt + 1 // we just completed this attempt
+	var (
+		next      time.Time
+		abandoned bool
+	)
+	if forceAbandon {
+		abandoned = true
+	} else {
+		delay, ab := nextRetryDelay(currentAttempt)
+		abandoned = ab
+		if !abandoned {
+			next = time.Now().Add(delay)
+		}
+	}
+	if err := ScheduleRetry(ctx, w.cfg.DB, d.ID, statusCode, responseBody, next, abandoned); err != nil {
+		log.Printf("webhooks: schedule retry id=%d: %v", d.ID, err)
+	}
+}
diff --git a/internal/webhooks/worker_test.go b/internal/webhooks/worker_test.go
new file mode 100644
index 00000000..09dddaaa
--- /dev/null
+++ b/internal/webhooks/worker_test.go
@@ -0,0 +1,306 @@
+package webhooks
+
+import (
+	"context"
+	"crypto/hmac"
+	"crypto/sha256"
+	"database/sql"
+	"encoding/hex"
+	"encoding/json"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+)
+
+func TestNextRetryDelayFollowsSchedule(t *testing.T) {
+	cases := []struct {
+		current   int
+		want      time.Duration
+		abandoned bool
+	}{
+		{1, 1 * time.Minute, false},
+		{2, 5 * time.Minute, false},
+		{3, 30 * time.Minute, false},
+		{4, 1 * time.Hour, false},
+		{5, 6 * time.Hour, false},
+		{6, 0, true}, // last attempt failed → abandon
+		{7, 0, true}, // beyond max → still abandon (defensive)
+	}
+	for _, c := range cases {
+		got, ab := nextRetryDelay(c.current)
+		if ab != c.abandoned {
+			t.Errorf("nextRetryDelay(%d).abandoned = %v, want %v", c.current, ab, c.abandoned)
+		}
+		if !c.abandoned && got != c.want {
+			t.Errorf("nextRetryDelay(%d).delay = %v, want %v", c.current, got, c.want)
+		}
+	}
+}
+
+// TestSignatureRoundTrip verifies that consumers can recompute and verify
+// the signature we send. This is the contract test — if it ever fails,
+// every consumer's signature verification breaks.
+func TestSignatureRoundTrip(t *testing.T) {
+	secret := "whsec_TEST_SECRET_VALUE"
+	body := []byte(`{"type":"event.opened","event_id":42}`)
+	timestamp := time.Date(2026, 4, 25, 12, 0, 0, 0, time.UTC)
+
+	signature := Sign(timestamp, body, secret)
+
+	// Parse the signature: t=<timestamp>,v1=<hex signature>
+	parts := strings.Split(signature, ",")
+	if len(parts) != 2 {
+		t.Fatalf("signature should have 2 parts, got %d: %s", len(parts), signature)
+	}
+	if !strings.HasPrefix(parts[0], "t=") {
+		t.Fatalf("part 0 should start with t=, got %s", parts[0])
+	}
+	if !strings.HasPrefix(parts[1], "v1=") {
+		t.Fatalf("part 1 should start with v1=, got %s", parts[1])
+	}
+	tsStr := strings.TrimPrefix(parts[0], "t=")
+	sigHex := strings.TrimPrefix(parts[1], "v1=")
+
+	// Recompute on the consumer side.
+	mac := hmac.New(sha256.New, []byte(secret))
+	mac.Write([]byte(tsStr))
+	mac.Write([]byte("."))
+	mac.Write(body)
+	expected := hex.EncodeToString(mac.Sum(nil))
+
+	if !hmac.Equal([]byte(sigHex), []byte(expected)) {
+		t.Errorf("signature mismatch:\n got %s\n want %s", sigHex, expected)
+	}
+
+	// Verify timestamp is parseable and matches what we sent.
+	ts, err := strconv.ParseInt(tsStr, 10, 64)
+	if err != nil {
+		t.Errorf("timestamp not parseable: %v", err)
+	}
+	if ts != timestamp.Unix() {
+		t.Errorf("timestamp = %d, want %d", ts, timestamp.Unix())
+	}
+}
+
+func TestApplyDefaults(t *testing.T) {
+	c := WorkerConfig{}
+	c.applyDefaults()
+	if c.PollInterval != 1*time.Second {
+		t.Errorf("PollInterval = %v, want 1s", c.PollInterval)
+	}
+	if c.MaxConcurrent != 50 {
+		t.Errorf("MaxConcurrent = %d, want 50", c.MaxConcurrent)
+	}
+	if c.PerWebhookCap != 3 {
+		t.Errorf("PerWebhookCap = %d, want 3", c.PerWebhookCap)
+	}
+	if c.HTTPTimeout != 30*time.Second {
+		t.Errorf("HTTPTimeout = %v, want 30s", c.HTTPTimeout)
+	}
+	if c.BatchSize != 200 {
+		t.Errorf("BatchSize = %d, want 200", c.BatchSize)
+	}
+	if c.InstanceID != "default" {
+		t.Errorf("InstanceID = %q, want default", c.InstanceID)
+	}
+}
+
+func TestApplyDefaultsPreservesExplicit(t *testing.T) {
+	c := WorkerConfig{
+		PollInterval:  5 * time.Second,
+		MaxConcurrent: 10,
+		InstanceID:    "host-a",
+	}
+	c.applyDefaults()
+	if c.PollInterval != 5*time.Second {
+		t.Errorf("PollInterval = %v, want 5s (explicit)", c.PollInterval)
+	}
+	if c.MaxConcurrent != 10 {
+		t.Errorf("MaxConcurrent = %d, want 10 (explicit)", c.MaxConcurrent)
+	}
+	if c.InstanceID != "host-a" {
+		t.Errorf("InstanceID = %q, want host-a (explicit)", c.InstanceID)
+	}
+	// Unset fields should still get defaults.
+ if c.PerWebhookCap != 3 { + t.Errorf("PerWebhookCap = %d, want 3 (default)", c.PerWebhookCap) + } +} + +func TestAcquireSlotRespectsCap(t *testing.T) { + w := &Worker{ + cfg: WorkerConfig{PerWebhookCap: 2}, + inFlight: make(map[int64]int), + } + if !w.acquireSlot(1) { + t.Fatal("first acquire should succeed") + } + if !w.acquireSlot(1) { + t.Fatal("second acquire should succeed (under cap)") + } + if w.acquireSlot(1) { + t.Fatal("third acquire should fail (cap=2)") + } + w.releaseSlot(1) + if !w.acquireSlot(1) { + t.Fatal("acquire after release should succeed") + } +} + +func TestAcquireSlotIsolatesWebhooks(t *testing.T) { + w := &Worker{ + cfg: WorkerConfig{PerWebhookCap: 1}, + inFlight: make(map[int64]int), + } + if !w.acquireSlot(1) { + t.Fatal("webhook 1 first acquire failed") + } + if w.acquireSlot(1) { + t.Fatal("webhook 1 second acquire should fail (cap=1)") + } + // Different webhook should be unaffected. + if !w.acquireSlot(2) { + t.Fatal("webhook 2 acquire should succeed even though webhook 1 is at cap") + } +} + +func TestReleaseSlotCleansUpZeroCounts(t *testing.T) { + w := &Worker{ + cfg: WorkerConfig{PerWebhookCap: 5}, + inFlight: make(map[int64]int), + } + w.acquireSlot(1) + w.releaseSlot(1) + if _, ok := w.inFlight[1]; ok { + t.Error("zero-count entry should be deleted from map") + } +} + +func TestNewWorkerInitializesRuntimeState(t *testing.T) { + w := NewWorker(WorkerConfig{InstanceID: "host-a", HTTPTimeout: 2 * time.Second}) + if w.cfg.InstanceID != "host-a" { + t.Fatalf("InstanceID = %q, want host-a", w.cfg.InstanceID) + } + if w.httpClient == nil || w.httpClient.Timeout != 2*time.Second { + t.Fatalf("httpClient = %+v", w.httpClient) + } + if w.inFlight == nil || w.stop == nil || w.done == nil { + t.Fatalf("worker runtime state not initialized: %+v", w) + } +} + +func TestWorkerStartStop(t *testing.T) { + w := NewWorker(WorkerConfig{PollInterval: time.Hour}) + w.Start() + w.Stop() +} + +func TestDeliverTickNoReadyDeliveries(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectBegin() + mock.ExpectQuery(selectClaimReadySQL).WithArgs(50). + WillReturnRows(sqlmock.NewRows(columnsClaimedDelivery)) + mock.ExpectCommit() + + w := NewWorker(WorkerConfig{DB: db}) + if err := w.deliverTick(); err != nil { + t.Fatalf("deliverTick: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestHandleResultSchedulesRetryAndForcedAbandon(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectExec("UPDATE jetmon_webhook_deliveries"). + WithArgs(503, "retry", sqlmock.AnyArg(), int64(1)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE jetmon_webhook_deliveries"). + WithArgs(0, "gone", int64(2)). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + + w := NewWorker(WorkerConfig{DB: db}) + w.handleResult(context.Background(), Delivery{ID: 1, Attempt: 0}, 503, "retry", false) + w.handleResult(context.Background(), Delivery{ID: 2, Attempt: 0}, 0, "gone", true) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} + +func TestBuildPayload(t *testing.T) { + occurredAt := time.Date(2026, 4, 27, 12, 0, 0, 123, time.UTC) + w := &Worker{} + payload, err := w.buildPayload(EventOpened, 10, 20, 30, "opened", "Seems Down", occurredAt) + if err != nil { + t.Fatalf("buildPayload: %v", err) + } + + var body map[string]any + if err := json.Unmarshal(payload, &body); err != nil { + t.Fatalf("Unmarshal: %v", err) + } + if body["type"] != EventOpened || body["reason"] != "opened" || body["state"] != "Seems Down" { + t.Fatalf("payload = %s", payload) + } + if body["transition_id"].(float64) != 10 || body["event_id"].(float64) != 20 || body["site_id"].(float64) != 30 { + t.Fatalf("payload ids = %s", payload) + } + if body["occurred_at"] != occurredAt.Format(time.RFC3339Nano) { + t.Fatalf("occurred_at = %v", body["occurred_at"]) + } +} + +func TestProgressLoadSave(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + w := &Worker{cfg: WorkerConfig{DB: db, InstanceID: "host-a"}} + mock.ExpectQuery("SELECT last_transition_id FROM jetmon_webhook_dispatch_progress"). + WithArgs("host-a"). + WillReturnError(sql.ErrNoRows) + mock.ExpectExec("INSERT INTO jetmon_webhook_dispatch_progress"). + WithArgs("host-a", int64(55)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery("SELECT last_transition_id FROM jetmon_webhook_dispatch_progress"). + WithArgs("host-a"). + WillReturnRows(sqlmock.NewRows([]string{"last_transition_id"}).AddRow(int64(55))) + + last, err := w.loadProgress(context.Background()) + if err != nil { + t.Fatalf("loadProgress empty: %v", err) + } + if last != 0 { + t.Fatalf("empty progress = %d, want 0", last) + } + if err := w.saveProgress(context.Background(), 55); err != nil { + t.Fatalf("saveProgress: %v", err) + } + last, err = w.loadProgress(context.Background()) + if err != nil { + t.Fatalf("loadProgress stored: %v", err) + } + if last != 55 { + t.Fatalf("stored progress = %d, want 55", last) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet sql expectations: %v", err) + } +} diff --git a/internal/wpcom/client.go b/internal/wpcom/client.go index 1fef3310..22abf9a6 100644 --- a/internal/wpcom/client.go +++ b/internal/wpcom/client.go @@ -20,7 +20,7 @@ const ( // CheckEntry represents a single check result included in a notification. 
type CheckEntry struct { - Type int `json:"type"` // 1=local, 2=veriflier + Type int `json:"type"` // 1=local, 2=veriflier Host string `json:"host"` Status int `json:"status"` RTT int64 `json:"rtt"` @@ -53,7 +53,7 @@ type Client struct { } type queuedNotification struct { - n Notification + n Notification queuedAt time.Time } diff --git a/systemd/jetmon-deliverer.service b/systemd/jetmon-deliverer.service new file mode 100644 index 00000000..b9e63e6e --- /dev/null +++ b/systemd/jetmon-deliverer.service @@ -0,0 +1,35 @@ +[Unit] +Description=Jetmon Delivery Workers +Documentation=https://github.com/Automattic/jetmon +After=network.target mysql.service +Wants=network.target +StartLimitIntervalSec=60s +StartLimitBurst=5 + +[Service] +Type=simple +User=jetmon +Group=jetmon +WorkingDirectory=/opt/jetmon2 +ExecStart=/opt/jetmon2/bin/jetmon-deliverer +Restart=on-failure +RestartSec=5s +TimeoutStopSec=35s + +# Resource limits. +MemoryMax=256M +LimitNOFILE=65536 +LimitNPROC=4096 + +# Hardening. +NoNewPrivileges=yes +PrivateTmp=yes +ProtectSystem=full +ProtectHome=yes + +# Environment. +EnvironmentFile=-/opt/jetmon2/config/jetmon2.env +Environment=JETMON_CONFIG=/opt/jetmon2/config/deliverer.json + +[Install] +WantedBy=multi-user.target diff --git a/systemd/jetmon2.service b/systemd/jetmon2.service index d82736fd..71ae837f 100644 --- a/systemd/jetmon2.service +++ b/systemd/jetmon2.service @@ -3,6 +3,8 @@ Description=Jetmon 2 — Jetpack Uptime Monitor Documentation=https://github.com/Automattic/jetmon After=network.target mysql.service Wants=network.target +StartLimitIntervalSec=60s +StartLimitBurst=5 [Service] Type=simple @@ -14,8 +16,6 @@ ExecReload=/bin/kill -HUP $MAINPID Restart=on-failure RestartSec=5s TimeoutStopSec=35s -StartLimitIntervalSec=60s -StartLimitBurst=5 # Resource limits. MemoryMax=512M diff --git a/veriflier2/cmd/main.go b/veriflier2/cmd/main.go index 22512df0..1e3c0397 100644 --- a/veriflier2/cmd/main.go +++ b/veriflier2/cmd/main.go @@ -3,21 +3,28 @@ package main import ( "context" "encoding/json" + "errors" "fmt" "log" + "net/http" "os" "os/signal" "syscall" + "time" "github.com/Automattic/jetmon/internal/checker" + "github.com/Automattic/jetmon/internal/metrics" "github.com/Automattic/jetmon/internal/veriflier" ) var version = "dev" +const shutdownGracePeriod = 30 * time.Second + type veriflierConfig struct { AuthToken string `json:"auth_token"` - GRPCPort string `json:"grpc_port"` + Port string `json:"port"` + GRPCPort string `json:"grpc_port"` // Deprecated alias for Port. } func main() { @@ -34,29 +41,56 @@ func main() { if v := os.Getenv("VERIFLIER_AUTH_TOKEN"); v != "" { cfg.AuthToken = v } - if v := os.Getenv("VERIFLIER_GRPC_PORT"); v != "" { - cfg.GRPCPort = v + if v := envOrDefault("VERIFLIER_PORT", ""); v != "" { + cfg.Port = v + } else if v := os.Getenv("VERIFLIER_GRPC_PORT"); v != "" { + cfg.Port = v } - if cfg.GRPCPort == "" { - log.Fatalf("VERIFLIER_GRPC_PORT is not set") + if cfg.TransportPort() == "" { + log.Fatalf("VERIFLIER_PORT is not set") + } + // Reject empty auth tokens at startup. The verifier's Bearer comparison + // would otherwise accept any request with the literal "Bearer " header + // (no token after the space) — a subtle auth bypass if a misconfigured + // deploy leaves the token blank. Better to fail loud at startup. + if cfg.AuthToken == "" { + log.Fatalf("VERIFLIER_AUTH_TOKEN is not set; refusing to start with no authentication") + } + addr := fmt.Sprintf(":%s", cfg.TransportPort()) + + // Optional StatsD metrics. 
STATSD_ADDR is unset in standalone deploys, + // "statsd:8125" in the docker compose stack. metrics.Init failure logs and + // continues — the verifier should still run with metrics disabled. + if statsdAddr := os.Getenv("STATSD_ADDR"); statsdAddr != "" { + if err := metrics.Init(statsdAddr, hostname); err != nil { + log.Printf("metrics: init failed (%v) — running without metrics", err) + } else { + log.Printf("metrics: sending to %s", statsdAddr) + } } - addr := fmt.Sprintf(":%s", cfg.GRPCPort) srv := veriflier.NewServer(addr, cfg.AuthToken, hostname, version, performCheck) + // Graceful shutdown: SIGINT/SIGTERM triggers Shutdown(ctx) with a drain + // budget so in-flight checks can complete before the listener closes. sigCh := make(chan os.Signal, 1) signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) go func() { - <-sigCh - log.Println("veriflier2: shutting down") - os.Exit(0) + sig := <-sigCh + log.Printf("veriflier2: %s received, draining (up to %s)", sig, shutdownGracePeriod) + ctx, cancel := context.WithTimeout(context.Background(), shutdownGracePeriod) + defer cancel() + if err := srv.Shutdown(ctx); err != nil { + log.Printf("veriflier2: shutdown error: %v", err) + } }() log.Printf("veriflier2 %s starting on %s", version, addr) - if err := srv.Listen(); err != nil { + if err := srv.Listen(); err != nil && !errors.Is(err, http.ErrServerClosed) { log.Fatalf("listen: %v", err) } + log.Println("veriflier2: shutdown complete") } // performCheck runs a single HTTP check and returns the result for the server. @@ -93,7 +127,7 @@ func loadConfig(path string) (*veriflierConfig, error) { // Fall back to environment-only config. return &veriflierConfig{ AuthToken: os.Getenv("VERIFLIER_AUTH_TOKEN"), - GRPCPort: envOrDefault("VERIFLIER_GRPC_PORT", "7803"), + Port: envOrDefault("VERIFLIER_PORT", envOrDefault("VERIFLIER_GRPC_PORT", "7803")), }, nil } defer f.Close() @@ -105,6 +139,13 @@ func loadConfig(path string) (*veriflierConfig, error) { return &cfg, nil } +func (c veriflierConfig) TransportPort() string { + if c.Port != "" { + return c.Port + } + return c.GRPCPort +} + func envOrDefault(key, def string) string { if v := os.Getenv(key); v != "" { return v diff --git a/veriflier2/cmd/main_test.go b/veriflier2/cmd/main_test.go new file mode 100644 index 00000000..8e53db4a --- /dev/null +++ b/veriflier2/cmd/main_test.go @@ -0,0 +1,148 @@ +package main + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + + "github.com/Automattic/jetmon/internal/checker" + "github.com/Automattic/jetmon/internal/veriflier" +) + +func TestEnvOrDefault(t *testing.T) { + const key = "VERIFLIER_TEST_ENV_OR_DEFAULT" + t.Setenv(key, "") + if got := envOrDefault(key, "fallback"); got != "fallback" { + t.Fatalf("envOrDefault(empty) = %q, want fallback", got) + } + + t.Setenv(key, "configured") + if got := envOrDefault(key, "fallback"); got != "configured" { + t.Fatalf("envOrDefault(set) = %q, want configured", got) + } +} + +func TestStringPtr(t *testing.T) { + if got := stringPtr(""); got != nil { + t.Fatalf("stringPtr(empty) = %v, want nil", got) + } + got := stringPtr("needle") + if got == nil || *got != "needle" { + t.Fatalf("stringPtr(non-empty) = %v, want pointer to needle", got) + } +} + +func TestLoadConfigFromFile(t *testing.T) { + path := filepath.Join(t.TempDir(), "veriflier.json") + if err := os.WriteFile(path, []byte(`{"auth_token":"secret","port":"7804"}`), 0644); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg, err := loadConfig(path) + if err != nil { + 
t.Fatalf("loadConfig: %v", err) + } + if cfg.AuthToken != "secret" || cfg.TransportPort() != "7804" { + t.Fatalf("config = %+v", cfg) + } +} + +func TestLoadConfigSupportsLegacyGRPCPort(t *testing.T) { + path := filepath.Join(t.TempDir(), "veriflier.json") + if err := os.WriteFile(path, []byte(`{"auth_token":"secret","grpc_port":"7805"}`), 0644); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg, err := loadConfig(path) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.TransportPort() != "7805" { + t.Fatalf("TransportPort() = %q, want 7805", cfg.TransportPort()) + } +} + +func TestLoadConfigFallsBackToEnvironment(t *testing.T) { + t.Setenv("VERIFLIER_AUTH_TOKEN", "env-secret") + t.Setenv("VERIFLIER_PORT", "7900") + + cfg, err := loadConfig(filepath.Join(t.TempDir(), "missing.json")) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.AuthToken != "env-secret" || cfg.TransportPort() != "7900" { + t.Fatalf("config = %+v", cfg) + } +} + +func TestLoadConfigFallsBackToLegacyPortEnvironment(t *testing.T) { + t.Setenv("VERIFLIER_AUTH_TOKEN", "env-secret") + t.Setenv("VERIFLIER_GRPC_PORT", "7901") + + cfg, err := loadConfig(filepath.Join(t.TempDir(), "missing.json")) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.TransportPort() != "7901" { + t.Fatalf("TransportPort() = %q, want 7901", cfg.TransportPort()) + } +} + +func TestLoadConfigRejectsMalformedJSON(t *testing.T) { + path := filepath.Join(t.TempDir(), "veriflier.json") + if err := os.WriteFile(path, []byte(`{"auth_token":`), 0644); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + if _, err := loadConfig(path); err == nil { + t.Fatal("loadConfig accepted malformed JSON") + } +} + +func TestPerformCheckSuccess(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("X-Test"); got != "present" { + t.Fatalf("X-Test header = %q, want present", got) + } + _, _ = w.Write([]byte("needle")) + })) + defer srv.Close() + + res := performCheck(veriflier.CheckRequest{ + BlogID: 42, + URL: srv.URL, + TimeoutSeconds: 2, + Keyword: "needle", + CustomHeaders: map[string]string{"X-Test": "present"}, + RedirectPolicy: string(checker.RedirectFollow), + }) + if !res.Success { + t.Fatalf("performCheck success = false; result=%+v", res) + } + if res.BlogID != 42 || res.HTTPCode != http.StatusOK { + t.Fatalf("performCheck result = %+v", res) + } +} + +func TestPerformCheckKeywordFailure(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte("different")) + })) + defer srv.Close() + + res := performCheck(veriflier.CheckRequest{ + BlogID: 43, + URL: srv.URL, + TimeoutSeconds: 2, + Keyword: "needle", + RedirectPolicy: string(checker.RedirectFollow), + }) + if res.Success { + t.Fatalf("performCheck success = true; result=%+v", res) + } + if res.ErrorCode != int32(checker.ErrorKeyword) { + t.Fatalf("error code = %d, want %d", res.ErrorCode, checker.ErrorKeyword) + } +} diff --git a/veriflier2/config/veriflier-sample.json b/veriflier2/config/veriflier-sample.json index 9912d166..c252c00f 100644 --- a/veriflier2/config/veriflier-sample.json +++ b/veriflier2/config/veriflier-sample.json @@ -1,4 +1,4 @@ { "auth_token" : "", - "grpc_port" : "" + "port" : "" }