From e993b26256dfe25d07b2be9762b6d21343ff8a53 Mon Sep 17 00:00:00 2001 From: Nikhil Sonti Date: Fri, 30 Jan 2026 08:39:33 -0800 Subject: [PATCH] feat: remove watchdog --- apps/server/src/api/routes/health.ts | 12 +- apps/server/src/api/server.ts | 4 +- apps/server/src/api/types.ts | 5 +- apps/server/src/lib/health-watchdog.ts | 104 --------- apps/server/src/main.ts | 16 +- apps/server/tests/api/routes/health.test.ts | 59 +---- apps/server/tests/lib/health-watchdog.test.ts | 201 ------------------ bun.lock | 4 +- 8 files changed, 16 insertions(+), 389 deletions(-) delete mode 100644 apps/server/src/lib/health-watchdog.ts delete mode 100644 apps/server/tests/lib/health-watchdog.test.ts diff --git a/apps/server/src/api/routes/health.ts b/apps/server/src/api/routes/health.ts index 638a074c..61471ee2 100644 --- a/apps/server/src/api/routes/health.ts +++ b/apps/server/src/api/routes/health.ts @@ -5,19 +5,9 @@ */ import { Hono } from 'hono' -import type { HealthWatchdog } from '../../lib/health-watchdog' -interface HealthRouteConfig { - watchdog?: HealthWatchdog -} - -/** - * Health check route group. - * Records health check timestamps for the watchdog (if enabled). - */ -export function createHealthRoute(config: HealthRouteConfig = {}) { +export function createHealthRoute() { return new Hono().get('/', (c) => { - config.watchdog?.recordHealthCheck() return c.json({ status: 'ok' }) }) } diff --git a/apps/server/src/api/server.ts b/apps/server/src/api/server.ts index 75b1f74d..2d5259c5 100644 --- a/apps/server/src/api/server.ts +++ b/apps/server/src/api/server.ts @@ -50,12 +50,12 @@ export async function createHttpServer(config: HttpServerConfig) { allowRemote, } = config - const { healthWatchdog, onShutdown } = config + const { onShutdown } = config // DECLARATIVE route composition - chain .route() calls for type inference const app = new Hono() .use('/*', cors(defaultCorsConfig)) - .route('/health', createHealthRoute({ watchdog: healthWatchdog })) + .route('/health', createHealthRoute()) .route( '/shutdown', createShutdownRoute({ onShutdown: onShutdown ?? (() => {}) }), diff --git a/apps/server/src/api/types.ts b/apps/server/src/api/types.ts index d4dc5781..c5f034a6 100644 --- a/apps/server/src/api/types.ts +++ b/apps/server/src/api/types.ts @@ -17,7 +17,7 @@ import { z } from 'zod' import { VercelAIConfigSchema } from '../agent/provider-adapter/types' import type { McpContext } from '../browser/cdp/context' import type { ControllerContext } from '../browser/extension/context' -import type { HealthWatchdog } from '../lib/health-watchdog' + import type { MutexPool } from '../lib/mutex' import type { RateLimiter } from '../lib/rate-limiter/rate-limiter' import type { ToolDefinition } from '../tools/types/tool-definition' @@ -82,9 +82,6 @@ export interface HttpServerConfig { // For Graph routes codegenServiceUrl?: string - // For health monitoring - healthWatchdog?: HealthWatchdog - // For shutdown route onShutdown?: () => void } diff --git a/apps/server/src/lib/health-watchdog.ts b/apps/server/src/lib/health-watchdog.ts deleted file mode 100644 index f1a220b7..00000000 --- a/apps/server/src/lib/health-watchdog.ts +++ /dev/null @@ -1,104 +0,0 @@ -/** - * @license - * Copyright 2025 BrowserOS - * SPDX-License-Identifier: AGPL-3.0-or-later - * - * Health Watchdog - * - * Self-terminates the server if no health checks are received from Chromium - * within the timeout period. This prevents orphaned server processes when - * Chrome crashes or multiple server instances exist. - * - * Only enabled when launched by Chrome (instanceInstallId present). - * In dev mode, the watchdog is disabled. - */ - -import { EXIT_CODES } from '@browseros/shared/constants/exit-codes' -import { TIMEOUTS } from '@browseros/shared/constants/timeouts' -import type { LoggerInterface } from '@browseros/shared/types/logger' - -export interface HealthWatchdogConfig { - logger: LoggerInterface - checkIntervalMs?: number - timeoutMs?: number -} - -export class HealthWatchdog { - private lastHealthCheckAt: number - private timer: ReturnType | null = null - private readonly logger: LoggerInterface - private readonly checkIntervalMs: number - private readonly timeoutMs: number - - constructor(config: HealthWatchdogConfig) { - this.logger = config.logger - this.checkIntervalMs = - config.checkIntervalMs ?? TIMEOUTS.HEALTH_WATCHDOG_CHECK_INTERVAL - this.timeoutMs = config.timeoutMs ?? TIMEOUTS.HEALTH_WATCHDOG_TIMEOUT - this.lastHealthCheckAt = Date.now() - } - - /** - * Start the watchdog timer. - * Call this after the HTTP server is ready. - */ - start(): void { - if (this.timer) { - return - } - - this.logger.info('Health watchdog started', { - checkIntervalMs: this.checkIntervalMs, - timeoutMs: this.timeoutMs, - }) - - this.timer = setInterval(() => this.check(), this.checkIntervalMs) - } - - /** - * Stop the watchdog timer. - * Call this during graceful shutdown. - */ - stop(): void { - if (this.timer) { - clearInterval(this.timer) - this.timer = null - this.logger.info('Health watchdog stopped') - } - } - - /** - * Record that a health check was received. - * Call this from the /health endpoint handler. - */ - recordHealthCheck(): void { - this.lastHealthCheckAt = Date.now() - } - - /** - * Check if health checks have been received within the timeout. - * If not, self-terminate with GENERAL_ERROR so Chrome restarts us - * (if Chrome is still alive) or we just die (if orphaned). - */ - private check(): void { - const elapsed = Date.now() - this.lastHealthCheckAt - - if (elapsed > this.timeoutMs) { - this.logger.warn( - 'No health checks received within timeout, self-terminating', - { - elapsedMs: elapsed, - timeoutMs: this.timeoutMs, - }, - ) - process.exit(EXIT_CODES.GENERAL_ERROR) - } - } - - /** - * Get time since last health check (for debugging/monitoring). - */ - getTimeSinceLastCheck(): number { - return Date.now() - this.lastHealthCheckAt - } -} diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts index 59d4edc0..a085f83e 100644 --- a/apps/server/src/main.ts +++ b/apps/server/src/main.ts @@ -20,7 +20,7 @@ import { ControllerContext } from './browser/extension/context' import type { ServerConfig } from './config' import { INLINED_ENV } from './env' import { initializeDb } from './lib/db' -import { HealthWatchdog } from './lib/health-watchdog' + import { identity } from './lib/identity' import { logger } from './lib/logger' import { metrics } from './lib/metrics' @@ -35,7 +35,6 @@ import { VERSION } from './version' export class Application { private config: ServerConfig private db: Database | null = null - private healthWatchdog: HealthWatchdog | null = null constructor(config: ServerConfig) { this.config = config @@ -72,12 +71,6 @@ export class Application { const tools = createToolRegistry(cdpContext, controllerContext) const mutexPool = new MutexPool() - const isDev = process.env.NODE_ENV === 'development' - if (!isDev) { - this.healthWatchdog = new HealthWatchdog({ logger }) - logger.info('Health watchdog enabled') - } - try { await createHttpServer({ port: this.config.serverPort, @@ -92,7 +85,7 @@ export class Application { executionDir: this.config.executionDir, rateLimiter: new RateLimiter(this.getDb(), dailyRateLimit), codegenServiceUrl: this.config.codegenServiceUrl, - healthWatchdog: this.healthWatchdog ?? undefined, + onShutdown: () => this.stop(), }) } catch (error) { @@ -106,9 +99,6 @@ export class Application { `Health endpoint: http://127.0.0.1:${this.config.serverPort}/health`, ) - // Start the watchdog after HTTP server is ready - this.healthWatchdog?.start() - this.logStartupSummary() metrics.log('http_server.started', { version: VERSION }) @@ -116,7 +106,7 @@ export class Application { stop(): void { logger.info('Shutting down server...') - this.healthWatchdog?.stop() + // Immediate exit without graceful shutdown. Chromium may kill us on update/restart, // and we need to free the port instantly so the HTTP port doesn't keep switching. process.exit(EXIT_CODES.SUCCESS) diff --git a/apps/server/tests/api/routes/health.test.ts b/apps/server/tests/api/routes/health.test.ts index 9f5802e1..56cdd537 100644 --- a/apps/server/tests/api/routes/health.test.ts +++ b/apps/server/tests/api/routes/health.test.ts @@ -3,63 +3,18 @@ * Copyright 2025 BrowserOS */ -import { describe, it, mock } from 'bun:test' +import { describe, it } from 'bun:test' import assert from 'node:assert' import { createHealthRoute } from '../../../src/api/routes/health' -import type { HealthWatchdog } from '../../../src/lib/health-watchdog' describe('createHealthRoute', () => { - describe('without watchdog', () => { - it('returns status ok', async () => { - const route = createHealthRoute() - const response = await route.request('/') + it('returns status ok', async () => { + const route = createHealthRoute() + const response = await route.request('/') - assert.strictEqual(response.status, 200) - const body = await response.json() - assert.deepStrictEqual(body, { status: 'ok' }) - }) - }) - - describe('with watchdog', () => { - it('returns status ok and records health check', async () => { - const mockRecordHealthCheck = mock(() => {}) - const mockWatchdog = { - recordHealthCheck: mockRecordHealthCheck, - } as unknown as HealthWatchdog - - const route = createHealthRoute({ watchdog: mockWatchdog }) - const response = await route.request('/') - - assert.strictEqual(response.status, 200) - const body = await response.json() - assert.deepStrictEqual(body, { status: 'ok' }) - - // Verify watchdog was notified - assert.strictEqual( - mockRecordHealthCheck.mock.calls.length, - 1, - 'recordHealthCheck should be called once', - ) - }) - - it('records health check on every request', async () => { - const mockRecordHealthCheck = mock(() => {}) - const mockWatchdog = { - recordHealthCheck: mockRecordHealthCheck, - } as unknown as HealthWatchdog - - const route = createHealthRoute({ watchdog: mockWatchdog }) - - await route.request('/') - await route.request('/') - await route.request('/') - - assert.strictEqual( - mockRecordHealthCheck.mock.calls.length, - 3, - 'recordHealthCheck should be called for each request', - ) - }) + assert.strictEqual(response.status, 200) + const body = await response.json() + assert.deepStrictEqual(body, { status: 'ok' }) }) }) diff --git a/apps/server/tests/lib/health-watchdog.test.ts b/apps/server/tests/lib/health-watchdog.test.ts deleted file mode 100644 index 39963b8d..00000000 --- a/apps/server/tests/lib/health-watchdog.test.ts +++ /dev/null @@ -1,201 +0,0 @@ -/** - * @license - * Copyright 2025 BrowserOS - */ - -import { afterEach, beforeEach, describe, it, mock, spyOn } from 'bun:test' -import assert from 'node:assert' - -import { EXIT_CODES } from '@browseros/shared/constants/exit-codes' -import type { LoggerInterface } from '@browseros/shared/types/logger' - -import { HealthWatchdog } from '../../src/lib/health-watchdog' - -function createMockLogger(): LoggerInterface { - return { - debug: mock(() => {}), - info: mock(() => {}), - warn: mock(() => {}), - error: mock(() => {}), - } -} - -describe('HealthWatchdog', () => { - let mockLogger: LoggerInterface - let exitSpy: ReturnType - let activeWatchdog: HealthWatchdog | null = null - - beforeEach(() => { - mockLogger = createMockLogger() - exitSpy = spyOn(process, 'exit').mockImplementation((() => {}) as never) - }) - - afterEach(() => { - // Always stop watchdog to prevent timer leaks - activeWatchdog?.stop() - activeWatchdog = null - exitSpy.mockRestore() - }) - - describe('constructor', () => { - it('initializes with default timeout values', () => { - activeWatchdog = new HealthWatchdog({ logger: mockLogger }) - - // Should have a recent lastHealthCheckAt (within 100ms of now) - const timeSinceLastCheck = activeWatchdog.getTimeSinceLastCheck() - assert.ok(timeSinceLastCheck < 100, 'Initial time should be recent') - }) - - it('accepts custom timeout values', () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 1000, - timeoutMs: 5000, - }) - - // Watchdog created successfully with custom values - assert.ok(activeWatchdog.getTimeSinceLastCheck() < 100) - }) - }) - - describe('recordHealthCheck', () => { - it('resets the last health check timestamp', async () => { - activeWatchdog = new HealthWatchdog({ logger: mockLogger }) - - // Wait a bit - await new Promise((resolve) => setTimeout(resolve, 50)) - - const beforeRecord = activeWatchdog.getTimeSinceLastCheck() - assert.ok(beforeRecord >= 45, 'Should have elapsed time') - - activeWatchdog.recordHealthCheck() - - const afterRecord = activeWatchdog.getTimeSinceLastCheck() - assert.ok(afterRecord < 15, 'Should be reset to near-zero') - }) - }) - - describe('start/stop', () => { - it('starts and stops the timer without error', () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 10000, // Long interval to avoid triggering - timeoutMs: 20000, - }) - - activeWatchdog.start() - assert.ok(mockLogger.info, 'Should log start message') - - activeWatchdog.stop() - // No error = success - }) - - it('start is idempotent', () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 10000, - timeoutMs: 20000, - }) - - activeWatchdog.start() - activeWatchdog.start() // Should not create duplicate timers - activeWatchdog.stop() - }) - - it('stop is idempotent', () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 10000, - timeoutMs: 20000, - }) - - activeWatchdog.start() - activeWatchdog.stop() - activeWatchdog.stop() // Should not error - }) - }) - - describe('timeout behavior', () => { - it('calls process.exit when no health check within timeout', async () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 10, // Check every 10ms - timeoutMs: 30, // Timeout after 30ms - }) - - activeWatchdog.start() - - // Wait for timeout to trigger - await new Promise((resolve) => setTimeout(resolve, 100)) - - activeWatchdog.stop() - - // Verify exit was called with GENERAL_ERROR - assert.ok(exitSpy.mock.calls.length > 0, 'process.exit should be called') - assert.strictEqual( - exitSpy.mock.calls[0][0], - EXIT_CODES.GENERAL_ERROR, - 'Should exit with GENERAL_ERROR', - ) - }) - - it('does not exit when health checks are received', async () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 20, // Check every 20ms - timeoutMs: 50, // Timeout after 50ms - }) - - activeWatchdog.start() - - // Send health checks faster than timeout - const interval = setInterval(() => { - activeWatchdog?.recordHealthCheck() - }, 15) - - // Wait longer than timeout - await new Promise((resolve) => setTimeout(resolve, 120)) - - clearInterval(interval) - activeWatchdog.stop() - - // Should NOT have called exit - assert.strictEqual( - exitSpy.mock.calls.length, - 0, - 'process.exit should not be called when health checks are received', - ) - }) - }) - - describe('getTimeSinceLastCheck', () => { - it('returns elapsed time since last health check', async () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 10000, // Long interval to avoid triggering during test - timeoutMs: 20000, - }) - - await new Promise((resolve) => setTimeout(resolve, 50)) - - const elapsed = activeWatchdog.getTimeSinceLastCheck() - assert.ok(elapsed >= 45, 'Should return at least 45ms') - assert.ok(elapsed < 150, 'Should not be too much more than 50ms') - }) - - it('resets after recordHealthCheck', async () => { - activeWatchdog = new HealthWatchdog({ - logger: mockLogger, - checkIntervalMs: 10000, // Long interval to avoid triggering during test - timeoutMs: 20000, - }) - - await new Promise((resolve) => setTimeout(resolve, 50)) - - activeWatchdog.recordHealthCheck() - - const elapsed = activeWatchdog.getTimeSinceLastCheck() - assert.ok(elapsed < 20, 'Should be near zero after recording') - }) - }) -}) diff --git a/bun.lock b/bun.lock index 3ba7f246..06828a23 100644 --- a/bun.lock +++ b/bun.lock @@ -124,7 +124,7 @@ }, "apps/server": { "name": "@browseros/server", - "version": "0.0.47", + "version": "0.0.48", "bin": { "browseros-server": "./src/index.ts", }, @@ -1516,7 +1516,7 @@ "chroma-js": ["chroma-js@3.2.0", "", {}, "sha512-os/OippSlX1RlWWr+QDPcGUZs0uoqr32urfxESG9U93lhUfbnlyckte84Q8P1UQY/qth983AS1JONKmLS4T0nw=="], - "chrome-devtools-mcp": ["chrome-devtools-mcp@0.14.0", "", { "bin": { "chrome-devtools-mcp": "build/src/index.js" } }, "sha512-JsnA8tApxOZHAUwduMsGFk0Mc3aQF0MX58fo9LoPxJFkyKdq34QonGPGNG38rWXJVQ2X70eI8wosJbOrXN79dQ=="], + "chrome-devtools-mcp": ["chrome-devtools-mcp@0.15.1", "", { "bin": { "chrome-devtools-mcp": "build/src/index.js" } }, "sha512-tuVDg+uNtjXuCUiI+lA+SFxJ7OaXmdq7n+Ug/XNEra1Y/FVRfLaqNGKJJgHeJHWYdonm7HXAmdyBbg9Rd7uzmQ=="], "chrome-launcher": ["chrome-launcher@1.2.0", "", { "dependencies": { "@types/node": "*", "escape-string-regexp": "^4.0.0", "is-wsl": "^2.2.0", "lighthouse-logger": "^2.0.1" }, "bin": { "print-chrome-path": "bin/print-chrome-path.cjs" } }, "sha512-JbuGuBNss258bvGil7FT4HKdC3SC2K7UAEUqiPy3ACS3Yxo3hAW6bvFpCu2HsIJLgTqxgEX6BkujvzZfLpUD0Q=="],