13 changes: 13 additions & 0 deletions AGENTS.md
@@ -160,6 +160,9 @@ Copy `config/config-sample.json` to `config/config.json`. All keys from the orig
- `LEGACY_STATUS_PROJECTION_ENABLE`: Keep v1 `site_status` / `last_status_change` projection updated during shadow-v2-state migration
- `LOG_FORMAT`: `text` (default, drop-in compatible) or `json` (structured logging)
- `USE_VARIABLE_CHECK_INTERVALS`: Respect per-site `check_interval`; the scheduler uses a short idle poll, and maintained `next_check_at` timestamps control which sites are ready
- `DNS_MONITOR_ENABLE`: Enable the independent recursive DNS probe stream
- `DNS_MONITOR_INTERVAL_SEC`: Per-site DNS cadence; initial schedule rows are hash-jittered across this interval
- `DNS_MONITOR_BATCH_SIZE`, `DNS_MONITOR_MAX_WORKERS`, `DNS_MONITOR_SCHEDULE_BATCH_SIZE`: Optional DNS guardrails; 0 means auto-size from `NUM_WORKERS`
- `DASHBOARD_PORT`: Internal port for the operator dashboard (0 to disable)
- `DEBUG_PORT`: localhost-only pprof port, default 6060 (0 to disable; never exposed remotely)

@@ -206,6 +209,15 @@ Every HTTPS check inspects `tls.ConnectionState` for:
- TLS version — flags TLS 1.0/1.1 as deprecated
- Cipher suite — recorded in audit log

**DNS Monitoring:**
When `DNS_MONITOR_ENABLE` is true, Jetmon runs a separate recursive DNS probe
stream driven by `jetmon_dns_probe_state`. DNS schedules are spread over
`DNS_MONITOR_INTERVAL_SEC`, lookup workers and batches auto-size from
`NUM_WORKERS` by default, and failures open Degraded `dns` events with resolver
evidence (NXDOMAIN, SERVFAIL, timeout, resolver error). The first DNS rollout
slice is advisory: DNS events do not update the legacy HTTP `site_status`
projection and do not send WPCOM downtime notifications.
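
A minimal sketch of the hash-jittered initial schedule (the shape here is an
assumption for illustration; the shipped scheduler derives it internally):

```go
package main

import (
	"fmt"
	"hash/fnv"
	"time"
)

// initialDNSDue spreads first probes across the interval: the same hostname
// always lands at the same offset, so restarts do not reshuffle the schedule.
func initialDNSDue(hostname string, intervalSec int, now time.Time) time.Time {
	h := fnv.New32a()
	h.Write([]byte(hostname)) // stable, cheap hash; intervalSec must be > 0
	offset := time.Duration(h.Sum32()%uint32(intervalSec)) * time.Second
	return now.Add(offset)
}

func main() {
	fmt.Println(initialDNSDue("example.com", 900, time.Now()))
}
```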

**Downtime Verification:**
1. Local check fails → open a `Seems Down` event (severity 3) and enter the local retry queue. The event opens on the **first** failure so `started_at` reflects the actual incident start. Subsequent failures during retry are no-ops on the events table (idempotent dedup).
2. After `NUM_OF_CHECKS` local failures → dispatch to Verifliers (event stays Seems Down)
@@ -256,6 +268,7 @@ New tables introduced by Jetmon 2:
| `jetmon_event_transitions` | Append-only history of every mutation to `jetmon_events` (open, severity change, state change, cause link, close) |
| `jetmon_audit_log` | Operational trail — WPCOM notifications, retry dispatch, verifier RPCs, alert/maintenance suppression, config reloads. Site-state changes do **not** flow through here |
| `jetmon_check_history` | RTT and timing samples for trending |
| `jetmon_dns_probe_state` | Independent DNS probe schedule plus latest recursive resolver evidence |
| `jetmon_false_positives` | Veriflier non-confirmation events |

## Multi-Host Bucket Coordination
8 changes: 8 additions & 0 deletions config/config-sample.json
@@ -35,6 +35,14 @@
"KEYWORD_READ_MAX_MS" : 0,
"USE_VARIABLE_CHECK_INTERVALS" : true,

"DNS_MONITOR_ENABLE" : false,
"DNS_MONITOR_INTERVAL_SEC" : 900,
"DNS_MONITOR_TIMEOUT_MS" : 2000,
"DNS_MONITOR_BATCH_SIZE" : 0,
"DNS_MONITOR_MAX_WORKERS" : 0,
"DNS_MONITOR_SCHEDULE_BATCH_SIZE" : 0,
"DNS_MONITOR_RESOLVERS" : [],

"LOG_FORMAT" : "text",
"DASHBOARD_PORT" : 8080,
"DASHBOARD_BIND_ADDR" : "127.0.0.1",
34 changes: 34 additions & 0 deletions config/config.readme
@@ -116,6 +116,40 @@ follow-up when the normal interval is longer. Default in the sample config:
true. Minimal configs that omit the key retain the compatibility default of
false.

DNS_MONITOR_ENABLE
Set to true to run the independent recursive DNS monitor. DNS probes use their
own schedule table and bounded worker loop so DNS health checks do not add DNS
load to every HTTP probe. Default: false.

DNS_MONITOR_INTERVAL_SEC
Seconds between DNS probes for each active site when DNS monitoring is enabled.
Initial schedules are spread across this interval with stable hash jitter so
enabling DNS monitoring does not create a synchronized lookup wave. Default:
900.

DNS_MONITOR_TIMEOUT_MS
Per-hostname recursive lookup timeout in milliseconds. Default: 2000.

DNS_MONITOR_BATCH_SIZE
Maximum due DNS probes to process in one scheduler pass. Set to 0 for automatic
sizing based on NUM_WORKERS. Default: 0.

DNS_MONITOR_MAX_WORKERS
Maximum DNS lookup workers. Set to 0 for automatic sizing based on NUM_WORKERS.
Default: 0.

DNS_MONITOR_SCHEDULE_BATCH_SIZE
Maximum missing DNS schedule rows to backfill in one scheduler pass. Set to 0
for automatic sizing based on the DNS batch size. Default: 0.
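
A purely illustrative sketch of the auto-sizing shape shared by these three
knobs (the divisor is an assumption, not the shipped ratio; the real
derivation lives in the orchestrator):

    // hypothetical helper: an explicit value wins, otherwise derive a
    // bounded limit from the HTTP worker pool
    func effectiveDNSLimit(explicit, numWorkers int) int {
        if explicit > 0 {
            return explicit
        }
        if n := numWorkers / 4; n > 0 { // assumed ratio
            return n
        }
        return 1
    }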

DNS_MONITOR_RESOLVERS
Optional JSON array of recursive resolver addresses used by the DNS monitor
instead of the host's system resolver. Entries may be hostnames/IPs or
host:port values; port defaults to 53. Leave empty to use the system resolver.
This is primarily for controlled test environments and production deployments
that require a known recursive resolver path. Example:
["1.1.1.1:53", "8.8.8.8:53"].

LOG_FORMAT
Log output format. Set to "json" for structured logging (e.g. for log aggregators), or "text" for human-readable output. Default: "text".

1 change: 1 addition & 0 deletions docs/events.md
@@ -204,6 +204,7 @@ The split exists because the two trails have different consumers and different r
| `jetmon_events` + `jetmon_event_transitions` | Public API incident timelines, SLA reports | Long — 30/90 days at full fidelity, then rolled up |
| `jetmon_audit_log` | Operators investigating "why did the alert fire" | Short — aggressive pruning is fine once the incident is closed |
| `jetmon_check_history` | Response-time trending, baseline learning | Medium — granular timing is high volume |
| `jetmon_dns_probe_state` | DNS scheduler and latest recursive resolver evidence | Medium — one current row per monitored site |

## Causal links

25 changes: 25 additions & 0 deletions docs/operations-guide.md
@@ -27,6 +27,13 @@ Key settings:
| `BODY_READ_MAX_MS` | 250 | Post-header body-phase budget in milliseconds for budgeted reads (unknown/large responses) |
| `KEYWORD_READ_MAX_BYTES` | 1048576 | Max bytes scanned when keyword checks are enabled |
| `KEYWORD_READ_MAX_MS` | 0 | Keyword read budget in milliseconds, 0 inherits full request timeout envelope |
| `DNS_MONITOR_ENABLE` | false | Enable the independent recursive DNS monitor |
| `DNS_MONITOR_INTERVAL_SEC` | 900 | Per-site DNS probe cadence when DNS monitoring is enabled |
| `DNS_MONITOR_TIMEOUT_MS` | 2000 | Per-hostname recursive DNS lookup timeout |
| `DNS_MONITOR_BATCH_SIZE` | 0 | Due DNS probes per scheduler pass; 0 auto-sizes from `NUM_WORKERS` |
| `DNS_MONITOR_MAX_WORKERS` | 0 | DNS lookup worker cap; 0 auto-sizes from `NUM_WORKERS` |
| `DNS_MONITOR_SCHEDULE_BATCH_SIZE` | 0 | Missing DNS schedule rows to backfill per pass; 0 auto-sizes |
| `DNS_MONITOR_RESOLVERS` | empty | Optional recursive resolver list for the DNS monitor; empty uses the host system resolver |
| `PEER_OFFLINE_LIMIT` | 3 | Veriflier agreements required to confirm downtime |
| `WORKER_MAX_MEM_MB` | 0 | Optional Go runtime memory threshold that triggers worker-pool drain; 0 disables the artificial cap |
| `BUCKET_TOTAL` | 1000 | Total bucket range across all hosts |
@@ -65,6 +72,24 @@ Scheduler behavior:
reporting queries do not run on every short scheduler poll. Use
`scheduler.round.due_count_sampled.count` to distinguish sampled polls from
intentionally skipped reporting polls.
- DNS monitoring uses `jetmon_dns_probe_state` for its own schedule and latest
recursive resolver evidence. Initial schedule rows are jittered across
`DNS_MONITOR_INTERVAL_SEC`, and `DNS_MONITOR_BATCH_SIZE` /
`DNS_MONITOR_MAX_WORKERS` can stay at 0 so the orchestrator auto-sizes DNS
load from the HTTP worker pool. DNS failures open Degraded `dns` events with
resolver metadata but do not change the legacy `site_status` projection or
send WPCOM downtime notifications in this first rollout slice.
- `DNS_MONITOR_RESOLVERS` can pin DNS probes to a known recursive resolver path
for staging, uptime-bench, or production resolver policy. Use recursive
resolvers that can answer normal A/AAAA/CNAME lookups; pointing this at an
authoritative-only nameserver is only useful when that server is known to
answer every monitored name in the test. When several resolvers are listed,
Jetmon picks a stable resolver per hostname to distribute load without
synchronizing all sites onto one upstream; a sketch follows this list.
- DNS probe metrics include scheduler gauges such as `dns.selected.count` and
status counters such as `dns.check.ok.count`, `dns.check.nxdomain.count`,
and `dns.check.timeout.count`. These should appear under the normal
`com.jetpack.jetmon.<hostname>` StatsD prefix when StatsD is configured.
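
A minimal sketch of the two behaviors above — stable per-hostname resolver
choice and a pinned recursive lookup path — assuming a hash-based pick and
Go's `net.Resolver` with a custom dialer (both are assumptions about the
implementation, not the shipped code):

```go
package main

import (
	"context"
	"fmt"
	"hash/fnv"
	"net"
	"time"
)

// pickResolver is stable: one hostname always maps to the same upstream, so
// load spreads across resolvers without synchronizing all sites onto one.
func pickResolver(hostname string, resolvers []string) string {
	h := fnv.New32a()
	h.Write([]byte(hostname))
	return resolvers[int(h.Sum32()%uint32(len(resolvers)))]
}

// lookupVia resolves hostname through one pinned resolver address.
func lookupVia(resolverAddr, hostname string, timeout time.Duration) ([]string, error) {
	r := &net.Resolver{
		PreferGo: true, // required so the custom Dial below is honored
		Dial: func(ctx context.Context, network, _ string) (net.Conn, error) {
			var d net.Dialer
			return d.DialContext(ctx, network, resolverAddr)
		},
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	return r.LookupHost(ctx, hostname)
}

func main() {
	resolvers := []string{"1.1.1.1:53", "8.8.8.8:53"}
	addr := pickResolver("example.com", resolvers)
	ips, err := lookupVia(addr, "example.com", 2*time.Second)
	fmt.Println(addr, ips, err)
}
```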

See [../config/config.readme](../config/config.readme) for the full option
reference.
18 changes: 16 additions & 2 deletions docs/project.md
@@ -129,6 +129,15 @@ Add `maintenance_start` and `maintenance_end` (nullable `DATETIME`) columns to `
**Granular Timing Breakdown**
Go's `net/http/httptrace` provides discrete callbacks for DNS start/done, TCP connect start/done, TLS handshake start/done, request written, and first response byte. Each check records composite RTT plus DNS, TCP, TLS, and TTFB timings. The raw samples are stored in `jetmon_check_history` for response-time trending and API statistics; scheduler-level StatsD metrics report round/page phase timing and write volume.
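
For reference, a minimal standalone sketch of that callback wiring (Jetmon's
checker persists these phases to `jetmon_check_history`; this example only
prints them):

```go
package main

import (
	"crypto/tls"
	"fmt"
	"net/http"
	"net/http/httptrace"
	"time"
)

func main() {
	var start, dnsStart, connStart, tlsStart time.Time
	trace := &httptrace.ClientTrace{
		DNSStart:          func(httptrace.DNSStartInfo) { dnsStart = time.Now() },
		DNSDone:           func(httptrace.DNSDoneInfo) { fmt.Println("dns:", time.Since(dnsStart)) },
		ConnectStart:      func(network, addr string) { connStart = time.Now() },
		ConnectDone:       func(network, addr string, err error) { fmt.Println("tcp:", time.Since(connStart)) },
		TLSHandshakeStart: func() { tlsStart = time.Now() },
		TLSHandshakeDone:  func(_ tls.ConnectionState, _ error) { fmt.Println("tls:", time.Since(tlsStart)) },
		// time-to-first-byte, measured from request start
		GotFirstResponseByte: func() { fmt.Println("ttfb:", time.Since(start)) },
	}
	req, err := http.NewRequest(http.MethodGet, "https://example.com/", nil)
	if err != nil {
		panic(err)
	}
	req = req.WithContext(httptrace.WithClientTrace(req.Context(), trace))
	start = time.Now()
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	resp.Body.Close()
	fmt.Println("total:", time.Since(start), resp.Status)
}
```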

**Recursive DNS Monitoring**
Jetmon can run a separate recursive DNS probe stream on a staggered schedule.
DNS probes are stored in `jetmon_dns_probe_state`, auto-size their worker/batch
limits from the HTTP worker pool by default, and open Degraded `dns` events
with NXDOMAIN / SERVFAIL / timeout / resolver-error evidence when resolution
fails. The first rollout slice is intentionally advisory: DNS events do not
update the legacy HTTP `site_status` projection and do not send WPCOM downtime
notifications until product semantics for DNS-to-site rollup are finalized.
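
A hedged sketch of mapping Go resolver errors onto that evidence taxonomy.
Note the assumption: with Go's built-in resolver, SERVFAIL usually surfaces as
a generic "server misbehaving" `*net.DNSError`, so distinguishing it from other
resolver errors depends on the resolver in use:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"net"
	"time"
)

// classifyDNSFailure buckets a lookup error into the evidence labels above.
func classifyDNSFailure(err error) string {
	var dnsErr *net.DNSError
	if !errors.As(err, &dnsErr) {
		return "resolver_error"
	}
	switch {
	case dnsErr.IsNotFound: // authoritative "name does not exist"
		return "nxdomain"
	case dnsErr.IsTimeout:
		return "timeout"
	default: // SERVFAIL and friends land here with the built-in resolver
		return "resolver_error"
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	if _, err := net.DefaultResolver.LookupHost(ctx, "nxdomain.invalid"); err != nil {
		fmt.Println(classifyDNSFailure(err)) // expect "nxdomain"
	}
}
```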

When the HTTP probe fails during resolver lookup, Jetmon records structured DNS
diagnostics in event metadata when Go exposes them: NXDOMAIN, SERVFAIL, timeout,
or a generic resolver error, plus the queried name and resolver server when
@@ -366,8 +375,13 @@ Benefits over the current static configuration:

These are intentionally out of scope for the initial rewrite. They represent the path to making Jetmon 2 a fully competitive standalone monitoring platform rather than a reliable internal Jetpack service.

**DNS Monitoring**
Check that a domain resolves to expected IPs on a schedule, using Go's `net.LookupHost()`. Alert when the answer changes or when resolution fails. Particularly valuable for detecting DNS hijacking and nameserver misconfigurations before they cause HTTP failures. New monitor type stored as a separate DB table.
**Advanced DNS Monitoring**
Build on the recursive DNS probe stream with explicit DNS-record expectations,
DNSSEC checks, split-horizon checks, full CNAME-chain capture, authoritative
nameserver probes, and DNS latency baselines. These need product semantics for
which failures are advisory, which roll up to site-level downtime, and how
monitor-side resolver impairment is reported as Unknown rather than customer
site downtime.

**TCP Port Monitoring**
Attempt a TCP connection to an arbitrary host:port on a schedule. No HTTP layer — a successful connection is "up". Useful for database ports, SMTP, and custom application services. A small extension of the existing connection logic.
50 changes: 29 additions & 21 deletions docs/roadmap.md
@@ -108,27 +108,35 @@ No active candidate branch is queued here right now.
stable in production because dynamic WordPress pages need normalization,
training, approval/reset workflows, and operator-visible evidence before
Jetmon can safely alert on "content changed unexpectedly."
- [x] Improve DNS diagnostics on HTTP lookup failures before building explicit
DNS monitors. The v2 HTTP checker already records DNS timing and classifies
lookup failures as connect failures; event metadata now distinguishes
NXDOMAIN, SERVFAIL, timeout, and resolver errors where Go/runtime resolver
data can support it. This is the recommended near-term step because it helps
HEs explain failures without creating a new monitor type.
- [ ] Track DNS-specific benchmark scenarios separately from HTTP DNS failures.
Explicit DNS-record, DNSSEC, split-horizon, CNAME-chain, authoritative
nameserver, and DNS-latency monitors need a dedicated check type and event
taxonomy before they should be exposed as production uptime signals. Defer
this larger feature until the product semantics are designed: some DNS
failures should be `Warning` or `Degraded`, some should roll up to site-level
`Down`, and monitor-side resolver impairment must remain `Unknown`.
- [ ] Decide whether Jetmon should add an explicit DNS monitor that bypasses or
complements recursive resolver cache visibility. The 2026-05-05 all-services
gapfill run showed every service, including Jetmon v2, missing short
authoritative DNS failure windows, which is consistent with recursive cache
TTLs hiding the outage from HTTP probes. This needs product semantics before
implementation: direct authoritative checks can catch short DNS outages, but
they also increase query load and can report a failure that many end users do
not observe until caches expire.
- [x] Add the first explicit DNS monitor slice as an independent recursive
DNS probe stream. The implementation has its own schedule table, jittered
initial due times, auto-sized batch/worker guardrails, latest resolver
evidence, and `dns` events that do not mutate the legacy HTTP up/down
projection or send WPCOM downtime notifications yet.
- [x] Harden the DNS monitor after the focused uptime-bench smoke test:
configurable recursive resolvers for controlled test/prod resolver paths,
CNAME evidence preserved on address-lookup failures, DNS status counters, and
causal links from active HTTP events to DNS root-cause events when both are
open for the same site.
- [x] Improve DNS diagnostics on HTTP lookup failures. The v2 HTTP checker
already records DNS timing and classifies lookup failures as connect failures;
event metadata now distinguishes NXDOMAIN, SERVFAIL, timeout, and resolver
errors where Go/runtime resolver data can support it. This remains useful
even with explicit DNS probes because it ties a failed HTTP check directly to
the resolver failure seen on that request path.
- [ ] Expand DNS-specific benchmark coverage beyond the first recursive probe
stream. DNS-record expectation checks, DNSSEC, split-horizon, full CNAME-chain
capture, authoritative nameserver probes, and DNS-latency monitors need
product semantics before they should be exposed as production uptime signals:
some DNS failures should be `Warning` or `Degraded`, some should roll up to
site-level `Down`, and monitor-side resolver impairment must remain `Unknown`.
- [ ] Decide whether Jetmon should later add authoritative DNS probes that
bypass or complement recursive resolver cache visibility. The 2026-05-05
all-services gapfill run showed every service, including Jetmon v2, missing
short authoritative DNS failure windows, which is consistent with recursive
cache TTLs hiding the outage from HTTP probes. Direct authoritative checks can
catch short DNS outages, but they also increase query load and can report a
failure that many end users do not observe until caches expire.
- [ ] Validate geo-scoped benchmark assumptions before changing Jetmon
production behavior for `http-geo-503`. Confirm the probe source ranges,
intended Jetmon region semantics, and support story for partial regional
Expand Down
65 changes: 65 additions & 0 deletions internal/config/config.go
@@ -4,7 +4,9 @@ import (
"encoding/json"
"fmt"
"log"
"net"
"os"
"strconv"
"strings"
"sync"
)
@@ -87,6 +89,17 @@ type Config struct {
KeywordReadMaxMS int `json:"KEYWORD_READ_MAX_MS"`
UseVariableCheckIntervals bool `json:"USE_VARIABLE_CHECK_INTERVALS"`

// DNS monitoring is a separate scheduled probe stream. Batch/worker values
// default to 0, which lets the orchestrator derive bounded values from the
// HTTP worker count instead of requiring per-host tuning.
DNSMonitorEnable bool `json:"DNS_MONITOR_ENABLE"`
DNSMonitorIntervalSec int `json:"DNS_MONITOR_INTERVAL_SEC"`
DNSMonitorTimeoutMS int `json:"DNS_MONITOR_TIMEOUT_MS"`
DNSMonitorBatchSize int `json:"DNS_MONITOR_BATCH_SIZE"`
DNSMonitorMaxWorkers int `json:"DNS_MONITOR_MAX_WORKERS"`
DNSMonitorScheduleBatchSize int `json:"DNS_MONITOR_SCHEDULE_BATCH_SIZE"`
DNSMonitorResolvers []string `json:"DNS_MONITOR_RESOLVERS"`

LogFormat string `json:"LOG_FORMAT"`
DashboardPort int `json:"DASHBOARD_PORT"`
DashboardBindAddr string `json:"DASHBOARD_BIND_ADDR"`
@@ -221,6 +234,8 @@ func defaults() *Config {
BodyReadMaxMS: 250,
KeywordReadMaxBytes: 1048576,
KeywordReadMaxMS: 0,
DNSMonitorIntervalSec: 900,
DNSMonitorTimeoutMS: 2000,
LogFormat: "text",
DashboardPort: 8080,
DashboardBindAddr: "127.0.0.1",
@@ -304,6 +319,26 @@ func validate(cfg *Config) error {
if cfg.KeywordReadMaxMS < 0 {
return fmt.Errorf("KEYWORD_READ_MAX_MS must be >= 0")
}
if cfg.DNSMonitorEnable && cfg.DNSMonitorIntervalSec <= 0 {
return fmt.Errorf("DNS_MONITOR_INTERVAL_SEC must be > 0 when DNS_MONITOR_ENABLE is true")
}
if cfg.DNSMonitorEnable && cfg.DNSMonitorTimeoutMS <= 0 {
return fmt.Errorf("DNS_MONITOR_TIMEOUT_MS must be > 0 when DNS_MONITOR_ENABLE is true")
}
if cfg.DNSMonitorBatchSize < 0 {
return fmt.Errorf("DNS_MONITOR_BATCH_SIZE must be >= 0")
}
if cfg.DNSMonitorMaxWorkers < 0 {
return fmt.Errorf("DNS_MONITOR_MAX_WORKERS must be >= 0")
}
if cfg.DNSMonitorScheduleBatchSize < 0 {
return fmt.Errorf("DNS_MONITOR_SCHEDULE_BATCH_SIZE must be >= 0")
}
for i, resolver := range cfg.DNSMonitorResolvers {
if err := validateDNSResolverAddr(resolver); err != nil {
return fmt.Errorf("DNS_MONITOR_RESOLVERS[%d]: %w", i, err)
}
}
if cfg.MinTimeBetweenRoundsSec < 0 {
return fmt.Errorf("MIN_TIME_BETWEEN_ROUNDS_SEC must be >= 0")
}
@@ -376,6 +411,36 @@ func validatePinnedBucketRange(cfg *Config) error {
return nil
}

// validateDNSResolverAddr accepts a bare host or a host:port pair, defaulting
// the port to 53 (bare IPv6 literals are bracketed first), and rejects empty
// hosts and ports outside 1-65535.
func validateDNSResolverAddr(addr string) error {
addr = strings.TrimSpace(addr)
if addr == "" {
return fmt.Errorf("resolver address must not be empty")
}
normalized := addr
if _, _, err := net.SplitHostPort(normalized); err != nil {
if strings.Contains(normalized, ":") {
normalized = net.JoinHostPort(strings.Trim(normalized, "[]"), "53")
} else {
normalized = net.JoinHostPort(normalized, "53")
}
}
host, port, err := net.SplitHostPort(normalized)
if err != nil {
return fmt.Errorf("resolver address must be host or host:port")
}
if strings.TrimSpace(host) == "" {
return fmt.Errorf("resolver host must not be empty")
}
if strings.TrimSpace(port) == "" {
return fmt.Errorf("resolver port must not be empty")
}
portNum, err := strconv.Atoi(port)
if err != nil || portNum < 1 || portNum > 65535 {
return fmt.Errorf("resolver port must be a number between 1 and 65535")
}
return nil
}
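
// Illustrative only, not part of this change set: inputs the validator above
// is expected to accept or reject, given the port-defaulting behavior shown.
//
//	validateDNSResolverAddr("1.1.1.1")       // nil: port defaults to 53
//	validateDNSResolverAddr("8.8.8.8:53")    // nil
//	validateDNSResolverAddr("2001:db8::1")   // nil: normalized to [2001:db8::1]:53
//	validateDNSResolverAddr("")              // error: empty address
//	validateDNSResolverAddr("1.1.1.1:99999") // error: port out of range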

func displayName(v VerifierConfig, i int) string {
if v.Name != "" {
return v.Name