diff --git a/000-docs/240-PP-RMAP-enterprise-architecture-roadmap.md b/000-docs/240-PP-RMAP-enterprise-architecture-roadmap.md new file mode 100644 index 0000000..87e4d7f --- /dev/null +++ b/000-docs/240-PP-RMAP-enterprise-architecture-roadmap.md @@ -0,0 +1,303 @@ +# 240-PP-RMAP: Enterprise Architecture Roadmap + +| Field | Value | +|-------|-------| +| Document ID | 240-PP-RMAP | +| Category | PP (Project Plan) | +| Type | RMAP (Roadmap) | +| Version | 1.0 | +| Status | Draft | +| Author | Intent Solutions Engineering | +| Date | 2026-02-17 | +| Filing Standard | 6767-a-DR-STND v4.2 | + +--- + +## 1. Executive Summary + +This roadmap defines the enterprise architecture enhancements planned for Git With Intent (GWI). The platform currently runs on GCP with 5 Cloud Run services, Firestore, Pub/Sub, and OpenTofu IaC. This document maps the gap between the current state and a full 8-layer GCP enterprise reference architecture, prioritizing additions that deliver the highest value for GWI's use case. + +### Target Outcome + +Transform GWI from an alpha-stage developer tool into an enterprise-grade SaaS platform with defense-in-depth networking, scalable analytics, and enterprise identity management. + +### Phases Overview + +| Phase | Timeline | Focus | +|-------|----------|-------| +| 6 | Q2-Q3 2026 | Enterprise Infrastructure (WAF, LB, Redis, API Gateway) | +| 7 | Q3-Q4 2026 | Analytics and Scale (BigQuery, CQRS, SLO) | +| 8 | Q4 2026 - Q1 2027 | Enterprise Identity (SCIM, App Integration, RBAC) | + +--- + +## 2. Reference Architecture (8-Layer GCP Enterprise Pattern) + +The target architecture follows the GCP enterprise foundation blueprint: + +| Layer | Components | Purpose | +|-------|------------|---------| +| 1. Network & Edge | Cloud Armor, External LB, Cloud CDN | DDoS protection, traffic routing, caching | +| 2. API Management | API Gateway, Apigee (optional) | Rate limiting, API keys, developer portal | +| 3. 
Compute | Cloud Run, GKE (future) | Stateless service hosting | +| 4. Data & Storage | Firestore, BigQuery, Memorystore | Operational DB, analytics, caching | +| 5. Messaging | Pub/Sub, Eventarc | Async communication, event routing | +| 6. Identity | IAM, WIF, SCIM 2.0 | Authentication, authorization, provisioning | +| 7. Observability | Cloud Monitoring, Trace, Logging | SLOs, distributed tracing, alerting | +| 8. Security | Secret Manager, VPC-SC, KMS | Secret management, network perimeter, encryption | + +--- + +## 3. Current State Inventory + +### Infrastructure Files (infra/) + +| File | Resources | Status | +|------|-----------|--------| +| `main.tf` | Project, providers | Active | +| `provider.tf` | GCP provider config | Active | +| `variables.tf` | Input variables (~50) | Active | +| `versions.tf` | Provider version constraints | Active | +| `outputs.tf` | Stack outputs | Active | +| `cloud_run.tf` | 5 Cloud Run services (api, gateway, webhook, worker, mcp-server) | Active | +| `service_topology.tf` | Per-service CPU/memory/scaling config | Active | +| `service_auth.tf` | Service-to-service IAM bindings | Active | +| `iam.tf` | Service accounts, WIF, roles | Active | +| `network.tf` | VPC, subnets, firewall rules | Active | +| `pubsub.tf` | Topics and subscriptions | Active | +| `storage.tf` | GCS buckets | Active | +| `firestore-backup.tf` | Scheduled Firestore exports | Active | +| `artifact_registry.tf` | Docker image registry | Active | +| `monitoring.tf` | Alert policies, notification channels | Active | +| `scheduler.tf` | Cloud Scheduler jobs | Active | +| `agent_engine.tf` | Vertex AI Agent Engine | Active | +| `webhook_receiver.tf` | Webhook receiver Cloud Run | Active | + +### Deployed Services + +| Service | Type | Description | +|---------|------|-------------| +| gwi-api | Cloud Run | REST API | +| gwi-gateway | Cloud Run | A2A agent coordination | +| gwi-webhook | Cloud Run | GitHub webhook handler | +| gwi-worker | Cloud Run | Background job 
processor | +| gwi-mcp-server | Cloud Run | MCP tool server | + +--- + +## 4. Gap Analysis Matrix + +| Layer | Component | Current Status | Gap | Priority | +|-------|-----------|---------------|-----|----------| +| 1. Network | Cloud Armor WAF | Missing | No DDoS/WAF protection on Cloud Run endpoints | High | +| 1. Network | External HTTPS LB | Missing | Cloud Run uses default ingress, no centralized LB | High | +| 1. Network | Cloud CDN | Missing | No edge caching for static assets | Medium | +| 2. API Mgmt | API Gateway | Missing | No centralized rate limiting or API key management | High | +| 2. API Mgmt | Apigee | Missing | No developer portal or advanced API analytics | Low (skip) | +| 3. Compute | Cloud Run | Has (5 services) | Sufficient for current scale | N/A | +| 3. Compute | GKE | Missing | Not needed unless scale exceeds Cloud Run limits | Low (skip) | +| 4. Data | Firestore | Has | Operational database in place | N/A | +| 4. Data | BigQuery | Missing | No analytics warehouse for DORA/usage metrics | High | +| 4. Data | Memorystore Redis | Missing | No distributed cache, session store, or rate limit backend | High | +| 5. Messaging | Pub/Sub | Has | Topics and subscriptions configured | N/A | +| 5. Messaging | Eventarc | Partial | Some triggers, not fully wired | Low | +| 6. Identity | IAM + WIF | Has | Service accounts and workload identity in place | N/A | +| 6. Identity | SCIM 2.0 | Missing | No automated user provisioning from IdPs | Medium | +| 6. Identity | Advanced RBAC | Partial | Basic roles exist, no fine-grained permissions | Medium | +| 7. Observability | Monitoring | Has | Alert policies configured | N/A | +| 7. Observability | SLO Monitoring | Missing | No formal SLI/SLO definitions in IaC | High | +| 7. Observability | Tracing | Partial | Some spans, not comprehensive | Medium | +| 8. Security | Secret Manager | Has | Secrets stored and referenced | N/A | +| 8. 
Security | VPC Service Controls | Missing | No service perimeter for Firestore/BigQuery | Medium | +| 8. Security | KMS (CMEK) | Missing | Using Google-managed keys, not customer-managed | Low | + +--- + +## 5. Implementation Roadmap + +### Phase 6: Enterprise Infrastructure (Q2-Q3 2026) + +**Goal:** Defense-in-depth networking and performance caching. + +#### 6.1 Cloud Armor WAF + +| Item | Value | +|------|-------| +| OpenTofu file | `infra/cloud_armor.tf` (new) | +| Resource | `google_compute_security_policy.gwi_waf` | +| Variables | `var.waf_enabled`, `var.waf_rules` | +| Rules | OWASP CRS 3.3, rate limiting (100 req/min default), geo-blocking (optional) | +| Binding | Attach to External LB backend services | + +#### 6.2 External HTTPS Load Balancer + +| Item | Value | +|------|-------| +| OpenTofu file | `infra/load_balancer.tf` (new) | +| Resources | `google_compute_global_address`, `google_compute_managed_ssl_certificate`, `google_compute_url_map`, `google_compute_target_https_proxy`, `google_compute_global_forwarding_rule` | +| Variables | `var.lb_enabled`, `var.lb_domain`, `var.lb_ssl_domains` | +| Backend services | One per Cloud Run service (serverless NEG) | +| Path routing | `/api/*` -> gwi-api, `/gateway/*` -> gwi-gateway, `/mcp/*` -> gwi-mcp-server | + +#### 6.3 Memorystore Redis + +| Item | Value | +|------|-------| +| OpenTofu file | `infra/memorystore.tf` (new) | +| Resource | `google_redis_instance.gwi_cache` | +| Variables | `var.redis_enabled`, `var.redis_tier` (BASIC/STANDARD), `var.redis_memory_size_gb` | +| Use cases | Session cache, rate limit counters, agent result caching | +| VPC connector | Requires Serverless VPC Access connector for Cloud Run | + +#### 6.4 API Gateway + +| Item | Value | +|------|-------| +| OpenTofu file | `infra/api_gateway.tf` (new) | +| Resources | `google_api_gateway_api`, `google_api_gateway_api_config`, `google_api_gateway_gateway` | +| Variables | `var.api_gateway_enabled`, `var.api_gateway_spec_path` | +| 
Features | Rate limiting, API key validation, request transformation | +| OpenAPI spec | `infra/api-gateway-spec.yaml` (new) | + + +### Phase 7: Analytics and Scale (Q3-Q4 2026) + +**Goal:** Data-driven insights and event-sourced architecture. + +#### 7.1 BigQuery Analytics Pipeline + +| Item | Value | +|------|-------| +| OpenTofu file | `infra/bigquery.tf` (new) | +| Resources | `google_bigquery_dataset.gwi_analytics`, tables for runs, agents, billing, DORA metrics | +| Variables | `var.bigquery_enabled`, `var.bigquery_location`, `var.bigquery_retention_days` | +| Pipeline | Firestore -> Pub/Sub -> Cloud Run (ETL) -> BigQuery | +| Datasets | `gwi_analytics` (run metrics), `gwi_billing` (cost tracking) | + +#### 7.2 CQRS Event Store + +| Item | Value | +|------|-------| +| OpenTofu file | `infra/eventstore.tf` (new) | +| Architecture | Pub/Sub topics as event bus, Firestore subcollections as event log | +| Topics | `gwi-events-run`, `gwi-events-agent`, `gwi-events-approval` | +| Consumers | BigQuery sink, notification service, audit logger | +| Variables | `var.cqrs_enabled`, `var.event_retention_days` | + +#### 7.3 SLO Monitoring + +| Item | Value | +|------|-------| +| OpenTofu file | Update `infra/monitoring.tf` | +| Resources | `google_monitoring_slo` per service, `google_monitoring_service` | +| SLIs | Availability (99.9%), latency (p95 < 2s), error rate (< 0.1%) | +| Variables | `var.slo_enabled`, `var.slo_targets` (map of service -> target) | +| Burn rate alerts | Fast burn (1h window), slow burn (24h window) | + + +### Phase 8: Enterprise Identity (Q4 2026 - Q1 2027) + +**Goal:** Automated user lifecycle and fine-grained access control. 
+ +#### 8.1 SCIM 2.0 Provisioning + +| Item | Value | +|------|-------| +| Implementation | `apps/api/src/routes/scim/` (new) | +| Endpoints | `/scim/v2/Users`, `/scim/v2/Groups`, `/scim/v2/ServiceProviderConfig` | +| IdP support | Okta, Azure AD, Google Workspace | +| Storage | Firestore `tenants/{id}/scimUsers` collection | +| Variables | `var.scim_enabled` | + +#### 8.2 Application Integration + +| Item | Value | +|------|-------| +| Scope | Webhook-based integrations with enterprise tools | +| Integrations | PagerDuty (incidents), Datadog (metrics export), ServiceNow (tickets) | +| Implementation | `packages/integrations/src/enterprise/` (new) | +| Configuration | Per-tenant integration settings in Firestore | + +#### 8.3 Advanced RBAC + +| Item | Value | +|------|-------| +| Implementation | `packages/core/src/auth/rbac/` (enhance existing) | +| Roles | org_admin, project_admin, developer, reviewer, auditor, read_only | +| Permissions | Fine-grained: `runs.create`, `runs.approve`, `agents.configure`, `audit.export` | +| Storage | Firestore `tenants/{id}/roles` and `tenants/{id}/permissions` | +| Variables | `var.rbac_mode` (basic/advanced) | + +--- + +## 6. Dependency Graph + +``` +Phase 6.2 (External LB) + └── Phase 6.1 (Cloud Armor WAF) -- WAF attaches to LB + └── Phase 6.3 (Memorystore) -- needs VPC connector shared with LB +Phase 6.4 (API Gateway) -- independent, can parallel with 6.1-6.3 + +Phase 7.1 (BigQuery) + └── Phase 7.2 (CQRS Event Store) -- events feed into BigQuery + └── Phase 7.3 (SLO Monitoring) -- SLO data stored in BigQuery + +Phase 8.1 (SCIM) -- independent +Phase 8.2 (App Integration) -- independent +Phase 8.3 (Advanced RBAC) -- depends on 8.1 for user/group sync +``` + +### Critical Path + +1. External LB (6.2) -> Cloud Armor (6.1); API Gateway (6.4) runs in parallel (no dependency on 6.1-6.3) +2. BigQuery (7.1) -> CQRS (7.2) -> SLO Monitoring (7.3) +3. SCIM (8.1) -> Advanced RBAC (8.3) + +--- + +## 7. 
Cost Estimates + +| Component | Monthly Cost (Dev) | Monthly Cost (Prod) | Notes | +|-----------|--------------------|---------------------|-------| +| Cloud Armor | $5 (policy) | $5 + $0.75/M requests | Per-policy + per-request pricing | +| External LB | $18 | $18 + data processing | Forwarding rule + backend | +| Memorystore Redis | $37 (1GB Basic) | $146 (2GB Standard HA) | Persistent HA for prod | +| API Gateway | Free tier | ~$3/M calls | First 2M calls/month free | +| BigQuery | Free tier (1TB) | ~$25/month | On-demand query pricing | +| SLO Monitoring | Free | Free | Included with Cloud Monitoring | +| **Phase 6 Total** | **~$60/mo** | **~$190/mo** | Infrastructure layer | +| **Phase 7 Total** | **~$5/mo** | **~$30/mo** | Analytics layer | +| **Phase 8 Total** | **$0** | **$0** | Application-layer (code only) | +| **Grand Total** | **~$65/mo** | **~$220/mo** | All three phases | + +--- + +## 8. Skip List (Not Needed for GWI) + +| Component | Reason | +|-----------|--------| +| Apigee | Overkill for internal API management; API Gateway covers rate limiting and key management | +| GKE | Cloud Run handles current scale; GKE adds operational overhead with no benefit at current traffic | +| Cloud CDN | GWI is API-first with minimal static assets; Firebase Hosting covers web dashboard | +| KMS (CMEK) | Google-managed encryption sufficient for current threat model; CMEK adds key rotation overhead | +| VPC Service Controls | Adds complexity; revisit if handling PII or regulated data beyond current scope | +| Anthos Service Mesh | No multi-cluster or hybrid needs; Cloud Run service-to-service auth is sufficient | +| Cloud NAT | Cloud Run has built-in egress; NAT only needed for GCE/GKE workloads | + +--- + +## 9. 
Success Criteria + +| Phase | Metric | Target | +|-------|--------|--------| +| 6 | WAF blocking malicious requests | >99% OWASP attack prevention | +| 6 | Cache hit rate (Redis) | >80% for repeated agent queries | +| 6 | API Gateway latency overhead | <10ms p99 added latency | +| 7 | Analytics query time | <5s for 30-day DORA dashboards | +| 7 | SLO burn rate alert latency | <5min for fast burn detection | +| 8 | SCIM provisioning time | <30s user creation from IdP | +| 8 | RBAC permission check | <5ms per authorization check | + +--- + +*Document follows 6767-a-DR-STND-document-filing-system-standard-v4-2 naming convention.* diff --git a/LICENSE b/LICENSE index 83afc3b..32b92d8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,70 @@ -MIT License - -Copyright (c) 2025 Intent Solutions LLC / Jeremy Longshore - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
+Business Source License 1.1 + +Parameters + +Licensor: Intent Solutions LLC +Licensed Work: Git With Intent v0.9.0 + The Licensed Work is (c) 2025-2026 Intent Solutions LLC. +Additional Use Grant: You may make use of the Licensed Work, provided that you + may not use the Licensed Work for a Competing SaaS Offering. + A "Competing SaaS Offering" is a commercial offering that + provides substantially the same functionality as the Licensed + Work to third parties as a managed service. + Personal and non-commercial use is always permitted. +Change Date: 2030-02-17 +Change License: Apache License, Version 2.0 + +For information about alternative licensing arrangements, contact +jeremy@intentsolutions.io. + +Notice + +The Business Source License (this document, or the "License") is not an Open +Source license. However, the Licensed Work will eventually be made available +under an Open Source License, as stated in this License. + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +"Business Source License" is a trademark of MariaDB Corporation Ab. + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. + +The Licensor may make an Additional Use Grant, above, permitting limited +production use. + +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. + +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. 
+ +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may +vary for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. + +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). + +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. diff --git a/README.md b/README.md index c43c021..c18b7eb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ CLI tool that automates PR workflows. Resolves merge conflicts, creates PRs from issues, reviews code, runs in full autopilot with approval gating. -**Version:** 0.8.0 | **Status:** Active development +**Version:** 0.9.0 | **Status:** Active development --- @@ -18,7 +18,7 @@ CLI tool that automates PR workflows. 
Resolves merge conflicts, creates PRs from - Need AI governance policies with approval workflows **Not for teams that:** -- Need a web dashboard first (CLI-only for now) +- Want a polished web UI (dashboard exists but CLI is primary interface) - Want AI to push directly without approval (intentionally blocked) - Need GitLab/Bitbucket support today (GitHub only for now) @@ -89,7 +89,6 @@ flowchart LR flowchart TB subgraph "Developer Interface" CLI[gwi CLI] - IDE[IDE Plugins] Dashboard[Web Dashboard] end @@ -114,7 +113,6 @@ flowchart TB end CLI --> Agents - IDE --> Agents Dashboard --> Engine Agents --> Engine Engine --> RAG @@ -128,7 +126,7 @@ flowchart TB Analytics --> Observability ``` -**Platform capabilities (v0.8.0):** +**Platform capabilities (v0.9.0):** | Category | Features | |----------|----------| @@ -294,11 +292,15 @@ flowchart TB approval[Approval Gate] end - subgraph Agents["AI Agents"] + subgraph Agents["AI Agents (8)"] + orchestrator[Orchestrator - Gemini Flash] + foreman[Foreman - Workflow Coordinator] triage[Triage - Gemini Flash] - coder[Coder - Claude Sonnet] + coder[Coder - Claude Sonnet/Opus] resolver[Resolver - Claude Sonnet/Opus] - reviewer[Reviewer - Claude Sonnet] + reviewer[Reviewer - Dynamic Model] + slop[Slop Detector - Quality Gate] + infra_agent[Infra - Infrastructure Agent] end subgraph Storage["Storage"] @@ -361,6 +363,7 @@ git-with-intent/ │ ├── api/ # REST API (Cloud Run) │ ├── gateway/ # A2A agent coordination │ ├── github-webhook/ # Webhook handler +│ ├── mcp-server/ # MCP tool server (Cloud Run) │ ├── webhook-receiver/ # Generic webhook receiver │ ├── worker/ # Background jobs │ ├── registry/ # Workflow template registry @@ -372,6 +375,7 @@ git-with-intent/ │ ├── integrations/ # GitHub/GitLab connectors │ ├── connectors/ # Airbyte-style data connectors │ ├── forecasting/ # TimeGPT integration +│ ├── sandbox/ # Secure execution sandbox │ └── sdk/ # TypeScript SDK └── infra/ # OpenTofu (GCP infrastructure) ``` @@ -526,7 +530,7 @@ 
gwi explain . gwi explain HEAD~3 ``` -> **Roadmap:** `gwi simulate` (world model simulation) is planned for Phase 35. +> **Available:** `gwi simulate` runs world model simulation against a run to predict outcomes before committing. --- @@ -681,7 +685,7 @@ npm run arv:smoke # Boot test | **Compliance & Analytics** | ✅ Complete | SOC2/SOX tracking (EPIC 021), DORA metrics (EPIC 022) | | **Developer Experience** | ✅ Complete | DevEx dashboard (EPIC 009), onboarding automation (EPIC 010) | | **Integrations** | ✅ Complete | GitHub, Jira, Linear, Slack connectors | -| **Knowledge & AI** | ✅ Complete | RAG search (EPIC 023), documentation generation (EPIC 012) | +| **Knowledge & AI** | 🚧 In Progress | RAG search (EPIC 023 — specification complete), documentation generation (EPIC 012) | | **Infrastructure** | ✅ Complete | CI/CD golden paths (EPIC 007), observability (EPIC 014), cost optimization (EPIC 013) | ```mermaid @@ -754,6 +758,7 @@ flowchart LR - `gwi-gateway` - A2A coordination (Cloud Run) - `gwi-webhook` - GitHub webhooks (Cloud Run) - `gwi-worker` - Background jobs (Cloud Run) +- `gwi-mcp-server` - MCP tool server (Cloud Run) - Firestore - Operational database - Firebase Hosting - Web dashboard @@ -803,6 +808,19 @@ gantt section Phase 5: Enterprise Knowledge Base/RAG :active, p5, 2026-02, 2026-04 Enterprise Features :milestone, p5b, 2026-04, 2026-06 + section Phase 6: Enterprise Infra + Cloud Armor WAF :p6a, 2026-04, 2026-06 + External LB :p6b, 2026-05, 2026-07 + Memorystore Redis :p6c, 2026-05, 2026-08 + API Gateway :p6d, 2026-06, 2026-09 + section Phase 7: Analytics & Scale + BigQuery Analytics :p7a, 2026-07, 2026-09 + CQRS Event Store :p7b, 2026-08, 2026-10 + SLO Monitoring :p7c, 2026-09, 2026-11 + section Phase 8: Enterprise Identity + SCIM 2.0 Provisioning :p8a, 2026-10, 2026-12 + App Integration :p8b, 2026-11, 2027-01 + Advanced RBAC :p8c, 2026-12, 2027-03 ``` | Phase | Status | Features | @@ -812,6 +830,9 @@ gantt | 3. 
Local Development | ✅ Complete | Local review, pre-commit hooks, `gwi gate` | | 4. Platform Services | ✅ Complete | Security scanning, AI governance, compliance, analytics | | 5. Enterprise | 🚧 In Progress | RAG knowledge base, enterprise auth, advanced reporting | +| 6. Enterprise Infrastructure | 📋 Planned | Cloud Armor WAF, External LB, Memorystore Redis, API Gateway | +| 7. Analytics & Scale | 📋 Planned | BigQuery analytics pipeline, CQRS event store, SLO monitoring | +| 8. Enterprise Identity | 📋 Planned | SCIM 2.0 provisioning, application integration, advanced RBAC | --- @@ -831,14 +852,14 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines. [Security Policy](SECURITY.md) -- Security audit completed Dec 2025 -- Pre-alpha software - not production-ready +- Security audit completed Feb 2026 +- Alpha software — approaching GA with readiness gates - Security issues: security@intentsolutions.io --- ## License -MIT License - Copyright (c) 2025-2026 Intent Solutions LLC +Business Source License 1.1 - Copyright (c) 2025-2026 Intent Solutions LLC -Open source CLI. Hosted service (when available) is commercial. +Source-available. Personal and non-commercial use permitted. Production use in a competing SaaS offering requires a commercial license. Converts to Apache License 2.0 on 2030-02-17. See [LICENSE](LICENSE) for details. 
diff --git a/package.json b/package.json index aadb678..5cd1ca7 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,7 @@ "automation" ], "author": "Intent Solutions ", - "license": "SEE LICENSE IN LICENSE", + "license": "BUSL-1.1", "engines": { "node": ">=20.0.0" }, diff --git a/packages/engine/src/hooks/__tests__/budget-management-hook.test.ts b/packages/engine/src/hooks/__tests__/budget-management-hook.test.ts new file mode 100644 index 0000000..65f6282 --- /dev/null +++ b/packages/engine/src/hooks/__tests__/budget-management-hook.test.ts @@ -0,0 +1,312 @@ +/** + * Budget Management Hook Tests: token/cost tracking, warning levels, injected warnings, blocking mode, run cleanup, isEnabled + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { BudgetManagementHook, BudgetExceededError } from '../budget-management-hook.js'; +import type { AgentRunContext } from '../types.js'; + +function makeCtx(overrides?: Partial<AgentRunContext>): AgentRunContext { + return { + runId: 'test-run-1', + runType: 'autopilot', + stepId: 'step-1', + agentRole: 'CODER', + stepStatus: 'completed', + timestamp: new Date().toISOString(), + metadata: {}, + ...overrides, // caller-supplied fields replace the defaults above (shallow spread) + }; +} + +describe('BudgetManagementHook', () => { + let hook: BudgetManagementHook; + + beforeEach(() => { + hook = new BudgetManagementHook(); + }); + + describe('budget tracking', () => { + it('should track token usage across steps', async () => { + const ctx = makeCtx(); + await hook.onRunStart(ctx); + + // Step 1: 1000 input, 500 output + const step1 = makeCtx({ + stepId: 'step-1', + tokensUsed: { input: 1000, output: 500 }, + }); + await hook.onAfterStep(step1); + + const status1 = step1.metadata?.budgetStatus as any; + expect(status1.totalInputTokens).toBe(1000); + expect(status1.totalOutputTokens).toBe(500); + expect(status1.stepsCompleted).toBe(1); + + // Step 2: 2000 input, 1000 output + const step2 = makeCtx({ + stepId: 'step-2', + tokensUsed: { input: 2000, output: 1000 }, + }); + await hook.onAfterStep(step2); + + const status2 = step2.metadata?.budgetStatus as any; + 
expect(status2.totalInputTokens).toBe(3000); + expect(status2.totalOutputTokens).toBe(1500); + expect(status2.stepsCompleted).toBe(2); + }); + + it('should calculate cost estimate', async () => { + const ctx = makeCtx(); + await hook.onRunStart(ctx); + + const step = makeCtx({ + tokensUsed: { input: 10000, output: 5000 }, + }); + await hook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + // 10K input * $0.003/1K = $0.03 + // 5K output * $0.015/1K = $0.075 + // Total = $0.105 + expect(status.estimatedCostUsd).toBeCloseTo(0.105, 3); + }); + }); + + describe('warning levels', () => { + it('should show no warning under threshold', async () => { + const ctx = makeCtx(); + await hook.onRunStart(ctx); + + const step = makeCtx({ tokensUsed: { input: 1000, output: 500 } }); + await hook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + expect(status.warningLevel).toBe('none'); + }); + + it('should show approaching warning at 75%', async () => { + const smallBudgetHook = new BudgetManagementHook({ maxTotalTokens: 1000 }); + const ctx = makeCtx(); + await smallBudgetHook.onRunStart(ctx); + + const step = makeCtx({ tokensUsed: { input: 400, output: 400 } }); + await smallBudgetHook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + expect(status.warningLevel).toBe('approaching'); + }); + + it('should show critical warning at 90%', async () => { + const smallBudgetHook = new BudgetManagementHook({ maxTotalTokens: 1000 }); + const ctx = makeCtx(); + await smallBudgetHook.onRunStart(ctx); + + const step = makeCtx({ tokensUsed: { input: 500, output: 450 } }); + await smallBudgetHook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + expect(status.warningLevel).toBe('critical'); + }); + + it('should show exceeded at 100%', async () => { + const smallBudgetHook = new BudgetManagementHook({ maxTotalTokens: 1000 }); + const ctx = makeCtx(); + await smallBudgetHook.onRunStart(ctx); + 
+ const step = makeCtx({ tokensUsed: { input: 600, output: 500 } }); + await smallBudgetHook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + expect(status.warningLevel).toBe('exceeded'); + }); + }); + + describe('budget warnings in context', () => { + it('should inject budget warning before step when approaching', async () => { + const smallBudgetHook = new BudgetManagementHook({ maxTotalTokens: 1000 }); + const ctx = makeCtx(); + await smallBudgetHook.onRunStart(ctx); + + // Consume 80% of budget + const step1 = makeCtx({ tokensUsed: { input: 400, output: 400 } }); + await smallBudgetHook.onAfterStep(step1); + + // Check warning before next step + const step2 = makeCtx({ stepId: 'step-2' }); + await smallBudgetHook.onBeforeStep(step2); + + expect(step2.metadata?.budgetWarning).toBeTruthy(); + expect(step2.metadata?.budgetWarning).toContain('consumed'); + }); + + it('should inject critical warning', async () => { + const smallBudgetHook = new BudgetManagementHook({ maxTotalTokens: 1000 }); + const ctx = makeCtx(); + await smallBudgetHook.onRunStart(ctx); + + // Consume 95% of budget + const step1 = makeCtx({ tokensUsed: { input: 500, output: 450 } }); + await smallBudgetHook.onAfterStep(step1); + + const step2 = makeCtx({ stepId: 'step-2' }); + await smallBudgetHook.onBeforeStep(step2); + + expect(step2.metadata?.budgetWarning).toContain('CRITICAL'); + }); + }); + + describe('blocking mode', () => { + it('should throw BudgetExceededError when budget exceeded', async () => { + const blockingHook = new BudgetManagementHook({ + maxTotalTokens: 1000, + enforceBlocking: true, + }); + + const ctx = makeCtx(); + await blockingHook.onRunStart(ctx); + + // Exceed budget + const step1 = makeCtx({ tokensUsed: { input: 600, output: 500 } }); + await blockingHook.onAfterStep(step1); + + // Next step should be blocked + const step2 = makeCtx({ stepId: 'step-2' }); + await expect(blockingHook.onBeforeStep(step2)).rejects.toThrow(BudgetExceededError); + 
}); + + it('should call onBudgetExceeded callback', async () => { + const onExceeded = vi.fn(); + const blockingHook = new BudgetManagementHook({ + maxTotalTokens: 1000, + enforceBlocking: true, + onBudgetExceeded: onExceeded, + }); + + const ctx = makeCtx(); + await blockingHook.onRunStart(ctx); + + const step1 = makeCtx({ tokensUsed: { input: 600, output: 500 } }); + await blockingHook.onAfterStep(step1); + + const step2 = makeCtx({ stepId: 'step-2' }); + await expect(blockingHook.onBeforeStep(step2)).rejects.toThrow(); + expect(onExceeded).toHaveBeenCalledOnce(); + }); + + it('should not throw when under budget', async () => { + const blockingHook = new BudgetManagementHook({ + maxTotalTokens: 1000000, + enforceBlocking: true, + }); + + const ctx = makeCtx(); + await blockingHook.onRunStart(ctx); + + const step1 = makeCtx({ tokensUsed: { input: 100, output: 50 } }); + await blockingHook.onAfterStep(step1); + + const step2 = makeCtx({ stepId: 'step-2' }); + await expect(blockingHook.onBeforeStep(step2)).resolves.toBeUndefined(); + }); + }); + + describe('cost-based budget', () => { + it('should trigger on cost exceeding budget', async () => { + const costHook = new BudgetManagementHook({ + maxTotalTokens: 10_000_000, // Very high token limit + maxCostUsd: 0.01, // Very low cost limit + costPer1kInputTokens: 0.003, + costPer1kOutputTokens: 0.015, + }); + + const ctx = makeCtx(); + await costHook.onRunStart(ctx); + + // 10K output tokens * $0.015/1K = $0.15 (far exceeds $0.01) + const step = makeCtx({ tokensUsed: { input: 1000, output: 10000 } }); + await costHook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + expect(status.warningLevel).toBe('exceeded'); + }); + }); + + describe('onRunEnd', () => { + it('should clean up and not error for unknown runs', async () => { + await expect(hook.onRunEnd(makeCtx({ runId: 'unknown' }), true)) + .resolves.toBeUndefined(); + }); + + it('should clean up tracking data', async () => { + const ctx = 
makeCtx(); + await hook.onRunStart(ctx); + + const step = makeCtx({ tokensUsed: { input: 100, output: 50 } }); + await hook.onAfterStep(step); + + await hook.onRunEnd(ctx, true); + + // After cleanup, before-step should not have budget info + const lateStep = makeCtx({ stepId: 'step-late' }); + await hook.onBeforeStep(lateStep); + expect(lateStep.metadata?.budgetStatus).toBeUndefined(); + }); + }); + + describe('idempotent onRunStart', () => { + it('should not reset budget on duplicate onRunStart calls', async () => { + const ctx = makeCtx(); + await hook.onRunStart(ctx); + + // Consume some tokens + const step1 = makeCtx({ tokensUsed: { input: 1000, output: 500 } }); + await hook.onAfterStep(step1); + + // Second onRunStart with same runId should NOT reset + await hook.onRunStart(makeCtx()); + + const step2 = makeCtx({ + stepId: 'step-2', + tokensUsed: { input: 500, output: 250 }, + }); + await hook.onAfterStep(step2); + + const status = step2.metadata?.budgetStatus as any; + expect(status.totalInputTokens).toBe(1500); + expect(status.totalOutputTokens).toBe(750); + expect(status.stepsCompleted).toBe(2); + }); + }); + + describe('no tokens used', () => { + it('should handle steps without token info', async () => { + const ctx = makeCtx(); + await hook.onRunStart(ctx); + + const step = makeCtx(); // No tokensUsed + await hook.onAfterStep(step); + + const status = step.metadata?.budgetStatus as any; + expect(status.totalInputTokens).toBe(0); + expect(status.totalOutputTokens).toBe(0); + }); + }); + + describe('isEnabled', () => { + it('should be enabled by default', async () => { + expect(await hook.isEnabled()).toBe(true); + }); + + it('should be disabled when env var is false', async () => { + const orig = process.env.GWI_BUDGET_MANAGEMENT_ENABLED; + process.env.GWI_BUDGET_MANAGEMENT_ENABLED = 'false'; + try { + expect(await hook.isEnabled()).toBe(false); + } finally { + if (orig === undefined) delete process.env.GWI_BUDGET_MANAGEMENT_ENABLED; + else 
process.env.GWI_BUDGET_MANAGEMENT_ENABLED = orig; + } + }); + }); +}); diff --git a/packages/engine/src/hooks/__tests__/environment-onboarding-hook.test.ts b/packages/engine/src/hooks/__tests__/environment-onboarding-hook.test.ts new file mode 100644 index 0000000..89304e9 --- /dev/null +++ b/packages/engine/src/hooks/__tests__/environment-onboarding-hook.test.ts @@ -0,0 +1,224 @@ +/** + * Environment Onboarding Hook Tests + */ + +import { describe, it, expect, beforeEach } from 'vitest'; +import { EnvironmentOnboardingHook } from '../environment-onboarding-hook.js'; +import type { AgentRunContext } from '../types.js'; + +function makeCtx(overrides?: Partial<AgentRunContext>): AgentRunContext { + return { + runId: 'test-run-1', + runType: 'autopilot', + stepId: 'step-1', + agentRole: 'FOREMAN', + stepStatus: 'completed', + timestamp: new Date().toISOString(), + metadata: {}, + ...overrides, // caller-supplied fields replace the defaults above (shallow spread) + }; +} + +describe('EnvironmentOnboardingHook', () => { + let hook: EnvironmentOnboardingHook; + + beforeEach(() => { + hook = new EnvironmentOnboardingHook(); + }); + + describe('onRunStart', () => { + it('should profile the environment from file list', async () => { + const ctx = makeCtx({ + metadata: { + files: [ + 'package.json', + 'tsconfig.json', + 'vitest.config.ts', + 'turbo.json', + 'src/index.ts', + 'src/utils.ts', + ], + }, + }); + + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + expect(profile).toBeDefined(); + expect(profile.languages).toContain('typescript'); + expect(profile.testRunners).toContain('vitest'); + expect(profile.buildTools).toContain('turbo'); + expect(profile.packageManagers).toContain('npm'); + expect(profile.isMonorepo).toBe(true); + }); + + it('should detect Python projects', async () => { + const ctx = makeCtx({ + metadata: { + files: [ + 'pyproject.toml', + 'conftest.py', + 'src/main.py', + 'tests/test_main.py', + ], + }, + }); + + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as 
any; + expect(profile.languages).toContain('python'); + expect(profile.testRunners).toContain('pytest'); + }); + + it('should detect Go projects', async () => { + const ctx = makeCtx({ + metadata: { + files: ['go.mod', 'go.sum', 'main.go', 'handler_test.go'], + }, + }); + + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + expect(profile.languages).toContain('go'); + expect(profile.packageManagers).toContain('go-modules'); + }); + + it('should detect Rust projects', async () => { + const ctx = makeCtx({ + metadata: { + files: ['Cargo.toml', 'Cargo.lock', 'src/main.rs'], + }, + }); + + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + expect(profile.languages).toContain('rust'); + expect(profile.packageManagers).toContain('cargo'); + }); + + it('should detect monorepo from directory structure', async () => { + const ctx = makeCtx({ + metadata: { + files: [ + 'packages/core/index.ts', + 'packages/cli/index.ts', + 'apps/web/index.ts', + ], + }, + }); + + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + expect(profile.isMonorepo).toBe(true); + }); + + it('should handle empty file list', async () => { + const ctx = makeCtx({ metadata: { files: [] } }); + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + expect(profile).toBeDefined(); + expect(profile.languages).toEqual([]); + expect(profile.isMonorepo).toBe(false); + }); + + it('should handle missing files metadata', async () => { + const ctx = makeCtx({ metadata: {} }); + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + expect(profile).toBeDefined(); + expect(profile.languages).toEqual([]); + }); + }); + + describe('onBeforeStep', () => { + it('should inject profile into step context', async () => { + const runCtx = makeCtx({ + metadata: { + files: ['package.json', 'src/index.ts'], + }, + }); + await 
hook.onRunStart(runCtx); + + const stepCtx = makeCtx({ stepId: 'step-2', agentRole: 'CODER' }); + await hook.onBeforeStep(stepCtx); + + expect(stepCtx.metadata?.environmentProfile).toBeDefined(); + }); + + it('should not inject when injectIntoSteps is false', async () => { + const noInjectHook = new EnvironmentOnboardingHook({ injectIntoSteps: false }); + + const runCtx = makeCtx({ + metadata: { files: ['package.json'] }, + }); + await noInjectHook.onRunStart(runCtx); + + const stepCtx = makeCtx({ stepId: 'step-2' }); + await noInjectHook.onBeforeStep(stepCtx); + + expect(stepCtx.metadata?.environmentProfile).toBeUndefined(); + }); + }); + + describe('onRunEnd', () => { + it('should clean up run profile', async () => { + const runCtx = makeCtx({ + metadata: { files: ['package.json'] }, + }); + await hook.onRunStart(runCtx); + + await hook.onRunEnd(runCtx, true); + + // After cleanup, step should not get profile + const stepCtx = makeCtx({ runId: 'test-run-1', stepId: 'step-late' }); + await hook.onBeforeStep(stepCtx); + expect(stepCtx.metadata?.environmentProfile).toBeUndefined(); + }); + }); + + describe('isEnabled', () => { + it('should be enabled by default', async () => { + expect(await hook.isEnabled()).toBe(true); + }); + + it('should be disabled when env var is false', async () => { + const orig = process.env.GWI_ENVIRONMENT_ONBOARDING_ENABLED; + process.env.GWI_ENVIRONMENT_ONBOARDING_ENABLED = 'false'; + try { + expect(await hook.isEnabled()).toBe(false); + } finally { + if (orig === undefined) delete process.env.GWI_ENVIRONMENT_ONBOARDING_ENABLED; + else process.env.GWI_ENVIRONMENT_ONBOARDING_ENABLED = orig; + } + }); + }); + + describe('configFiles', () => { + it('should track discovered config files', async () => { + const ctx = makeCtx({ + metadata: { + files: [ + 'package.json', + 'tsconfig.json', + 'Dockerfile', + 'src/index.ts', + ], + }, + }); + + await hook.onRunStart(ctx); + + const profile = ctx.metadata?.environmentProfile as any; + 
expect(profile.configFiles).toContain('package.json'); + expect(profile.configFiles).toContain('tsconfig.json'); + expect(profile.configFiles).toContain('Dockerfile'); + expect(profile.configFiles).not.toContain('src/index.ts'); + }); + }); +}); diff --git a/packages/engine/src/hooks/__tests__/hook-config.test.ts b/packages/engine/src/hooks/__tests__/hook-config.test.ts index ccf63a2..92ee205 100644 --- a/packages/engine/src/hooks/__tests__/hook-config.test.ts +++ b/packages/engine/src/hooks/__tests__/hook-config.test.ts @@ -100,14 +100,19 @@ describe('Hook Configuration', () => { expect(hookNames).not.toContain('code-quality'); }); - it('registers all three hooks when all enabled', async () => { + it('registers all hooks when all enabled', async () => { process.env.GWI_DECISION_TRACE_ENABLED = 'true'; const runner = await buildDefaultHookRunner(); const hookNames = runner.getRegisteredHooks(); expect(hookNames).toContain('decision-trace'); expect(hookNames).toContain('risk-enforcement'); expect(hookNames).toContain('code-quality'); - expect(hookNames.length).toBe(3); + expect(hookNames).toContain('trace-analysis'); + expect(hookNames).toContain('self-test'); + expect(hookNames).toContain('environment-onboarding'); + expect(hookNames).toContain('loop-detection'); + expect(hookNames).toContain('budget-management'); + expect(hookNames.length).toBe(8); }); }); }); diff --git a/packages/engine/src/hooks/__tests__/loop-detection-hook.test.ts b/packages/engine/src/hooks/__tests__/loop-detection-hook.test.ts new file mode 100644 index 0000000..237b2bd --- /dev/null +++ b/packages/engine/src/hooks/__tests__/loop-detection-hook.test.ts @@ -0,0 +1,264 @@ +/** + * Loop Detection Hook Tests + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { + LoopDetectionHook, + LoopDetectionError, + calculateSimilarity, +} from '../loop-detection-hook.js'; +import type { AgentRunContext } from '../types.js'; + +function makeCtx(overrides?: Partial): 
AgentRunContext { + return { + runId: 'test-run-1', + runType: 'autopilot', + stepId: 'step-1', + agentRole: 'CODER', + stepStatus: 'completed', + timestamp: new Date().toISOString(), + metadata: {}, + ...overrides, + }; +} + +describe('calculateSimilarity', () => { + it('should return 1 for identical strings', () => { + expect(calculateSimilarity('hello world', 'hello world')).toBe(1); + }); + + it('should return 0 for completely different strings', () => { + const similarity = calculateSimilarity('abc', 'xyz'); + expect(similarity).toBe(0); + }); + + it('should return high similarity for near-identical strings', () => { + const a = 'function validateEmail(email: string): boolean { return true; }'; + const b = 'function validateEmail(email: string): boolean { return false; }'; + expect(calculateSimilarity(a, b)).toBeGreaterThan(0.8); + }); + + it('should return low similarity for different code', () => { + const a = 'function validateEmail(email: string): boolean { return true; }'; + const b = 'class UserService { async getUser(id: number): Promise { return db.find(id); } }'; + expect(calculateSimilarity(a, b)).toBeLessThan(0.5); + }); + + it('should handle empty strings', () => { + // Two empty strings are identical + expect(calculateSimilarity('', '')).toBe(1); + expect(calculateSimilarity('abc', '')).toBe(0); + expect(calculateSimilarity('', 'abc')).toBe(0); + }); + + it('should normalize whitespace', () => { + expect(calculateSimilarity('hello world', 'hello world')).toBe(1); + }); +}); + +describe('LoopDetectionHook', () => { + let hook: LoopDetectionHook; + + beforeEach(() => { + hook = new LoopDetectionHook(); + }); + + describe('role filtering', () => { + it('should only track CODER role', async () => { + const ctx = makeCtx({ agentRole: 'REVIEWER', outputSummary: 'Review output' }); + await hook.onAfterStep(ctx); + expect(ctx.metadata?.loopDetection).toBeUndefined(); + }); + + it('should track CODER role', async () => { + const ctx = makeCtx({ 
outputSummary: 'Generated code' }); + await hook.onAfterStep(ctx); + // First output — no loop, but detection result is still attached + const detection = ctx.metadata?.loopDetection as any; + expect(detection).toBeDefined(); + expect(detection.loopDetected).toBe(false); + expect(detection.similarCount).toBe(1); + }); + }); + + describe('loop detection', () => { + it('should not detect loop with different outputs', async () => { + const outputs = [ + 'Generated email validator function', + 'Built user authentication service', + 'Created database migration script', + ]; + + for (const output of outputs) { + const ctx = makeCtx({ outputSummary: output }); + await hook.onAfterStep(ctx); + } + + // Last ctx would have loopDetection only if loop found + const lastCtx = makeCtx({ outputSummary: 'Another unique output' }); + await hook.onAfterStep(lastCtx); + const detection = lastCtx.metadata?.loopDetection as any; + if (detection) { + expect(detection.loopDetected).toBe(false); + } + }); + + it('should detect loop with identical outputs', async () => { + const repeatedOutput = 'Generated email validator with regex pattern for email validation'; + + // Submit 3 identical outputs (default maxSimilarOutputs=3) + for (let i = 0; i < 3; i++) { + const ctx = makeCtx({ + stepId: `step-${i}`, + outputSummary: repeatedOutput, + }); + await hook.onAfterStep(ctx); + + if (i === 2) { + const detection = ctx.metadata?.loopDetection as any; + expect(detection).toBeDefined(); + expect(detection.loopDetected).toBe(true); + expect(detection.similarCount).toBe(3); + expect(detection.suggestion).toBeTruthy(); + expect(ctx.metadata?.loopNudge).toBeTruthy(); + } + } + }); + + it('should detect loop with near-identical outputs', async () => { + const outputs = [ + 'Generated email validator with regex pattern for user@domain.com validation', + 'Generated email validator with regex pattern for user@domain.com validation check', + 'Generated email validator with regex pattern for 
user@domain.com validation test', + ]; + + let lastCtx: AgentRunContext | undefined; + for (let i = 0; i < outputs.length; i++) { + lastCtx = makeCtx({ stepId: `step-${i}`, outputSummary: outputs[i] }); + await hook.onAfterStep(lastCtx); + } + + const detection = lastCtx!.metadata?.loopDetection as any; + expect(detection).toBeDefined(); + expect(detection.loopDetected).toBe(true); + }); + }); + + describe('blocking mode', () => { + it('should throw LoopDetectionError in blocking mode', async () => { + const blockingHook = new LoopDetectionHook({ enforceBlocking: true }); + const output = 'Identical generated code output for loop test'; + + for (let i = 0; i < 2; i++) { + const ctx = makeCtx({ stepId: `step-${i}`, outputSummary: output }); + await blockingHook.onAfterStep(ctx); + } + + const ctx = makeCtx({ stepId: 'step-2', outputSummary: output }); + await expect(blockingHook.onAfterStep(ctx)).rejects.toThrow(LoopDetectionError); + }); + + it('should not throw when loop is not detected', async () => { + const blockingHook = new LoopDetectionHook({ enforceBlocking: true }); + + const ctx1 = makeCtx({ stepId: 'step-0', outputSummary: 'Output A' }); + await expect(blockingHook.onAfterStep(ctx1)).resolves.toBeUndefined(); + + const ctx2 = makeCtx({ stepId: 'step-1', outputSummary: 'Completely different output B' }); + await expect(blockingHook.onAfterStep(ctx2)).resolves.toBeUndefined(); + }); + }); + + describe('onLoopDetected callback', () => { + it('should call callback when loop detected', async () => { + const onLoop = vi.fn(); + const cbHook = new LoopDetectionHook({ onLoopDetected: onLoop }); + const output = 'Same output repeated multiple times in agent loop'; + + for (let i = 0; i < 3; i++) { + const ctx = makeCtx({ stepId: `step-${i}`, outputSummary: output }); + await cbHook.onAfterStep(ctx); + } + + expect(onLoop).toHaveBeenCalledOnce(); + }); + }); + + describe('onRunEnd cleanup', () => { + it('should clean up run state', async () => { + const ctx = 
makeCtx({ outputSummary: 'Some output' }); + await hook.onAfterStep(ctx); + + await hook.onRunEnd(makeCtx(), true); + + // After cleanup, same runId should start fresh + const newCtx = makeCtx({ outputSummary: 'Some output' }); + await hook.onAfterStep(newCtx); + // History was cleared, so this is first output — no loop + const detection = newCtx.metadata?.loopDetection as any; + expect(detection).toBeDefined(); + expect(detection.loopDetected).toBe(false); + expect(detection.similarCount).toBe(1); + }); + }); + + describe('config', () => { + it('should respect custom similarity threshold', async () => { + // With very high threshold, similar-but-not-identical should not trigger + const strictHook = new LoopDetectionHook({ similarityThreshold: 0.99 }); + + const outputs = [ + 'Generated email validator version 1', + 'Generated email validator version 2', + 'Generated email validator version 3', + ]; + + let lastCtx: AgentRunContext | undefined; + for (let i = 0; i < outputs.length; i++) { + lastCtx = makeCtx({ stepId: `step-${i}`, outputSummary: outputs[i] }); + await strictHook.onAfterStep(lastCtx); + } + + const detection = lastCtx!.metadata?.loopDetection as any; + if (detection) { + expect(detection.loopDetected).toBe(false); + } + }); + + it('should respect maxTrackedOutputs', async () => { + const smallHook = new LoopDetectionHook({ maxTrackedOutputs: 2 }); + + // Add 5 different outputs — only last 2 should be tracked + for (let i = 0; i < 5; i++) { + const ctx = makeCtx({ stepId: `step-${i}`, outputSummary: `Unique output ${i}` }); + await smallHook.onAfterStep(ctx); + } + + // No loop should be detected since all outputs are unique + const ctx = makeCtx({ stepId: 'step-5', outputSummary: 'Another unique output' }); + await smallHook.onAfterStep(ctx); + const detection = ctx.metadata?.loopDetection as any; + if (detection) { + expect(detection.loopDetected).toBe(false); + } + }); + }); + + describe('isEnabled', () => { + it('should be enabled by default', 
async () => { + expect(await hook.isEnabled()).toBe(true); + }); + + it('should be disabled when env var is false', async () => { + const orig = process.env.GWI_LOOP_DETECTION_ENABLED; + process.env.GWI_LOOP_DETECTION_ENABLED = 'false'; + try { + expect(await hook.isEnabled()).toBe(false); + } finally { + if (orig === undefined) delete process.env.GWI_LOOP_DETECTION_ENABLED; + else process.env.GWI_LOOP_DETECTION_ENABLED = orig; + } + }); + }); +}); diff --git a/packages/engine/src/hooks/__tests__/self-test-hook.test.ts b/packages/engine/src/hooks/__tests__/self-test-hook.test.ts new file mode 100644 index 0000000..9adc3df --- /dev/null +++ b/packages/engine/src/hooks/__tests__/self-test-hook.test.ts @@ -0,0 +1,277 @@ +/** + * Self-Test Hook Tests + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { SelfTestHook, SelfTestError } from '../self-test-hook.js'; +import type { AgentRunContext } from '../types.js'; + +function makeCtx(overrides?: Partial): AgentRunContext { + return { + runId: 'test-run-1', + runType: 'autopilot', + stepId: 'step-1', + agentRole: 'CODER', + stepStatus: 'completed', + timestamp: new Date().toISOString(), + metadata: {}, + ...overrides, + }; +} + +const SOURCE_FILES = [ + { + path: 'src/utils/validator.ts', + content: 'export function validateEmail(email: string): boolean {\n return /^[^@]+@[^@]+\\.[^@]+$/.test(email);\n}', + action: 'create' as const, + explanation: 'Email validation', + }, +]; + +const TEST_FILES = [ + { + path: 'src/utils/__tests__/validator.test.ts', + content: 'import { describe, it, expect } from "vitest";\nimport { validateEmail } from "../validator.js";\n\ndescribe("validateEmail", () => {\n it("should validate email", () => {\n expect(validateEmail("a@b.c")).toBe(true);\n });\n});', + action: 'create' as const, + explanation: 'Tests for email validation', + }, +]; + +const MIXED_FILES = [...SOURCE_FILES, ...TEST_FILES]; + +describe('SelfTestHook', () => { + let hook: SelfTestHook; + + 
beforeEach(() => { + hook = new SelfTestHook(); + }); + + describe('role filtering', () => { + it('should only activate for CODER role', async () => { + const ctx = makeCtx({ + agentRole: 'REVIEWER', + metadata: { generatedFiles: SOURCE_FILES }, + }); + await hook.onAfterStep(ctx); + expect(ctx.metadata?.selfTestValidation).toBeUndefined(); + }); + + it('should activate for CODER role', async () => { + const ctx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await hook.onAfterStep(ctx); + expect(ctx.metadata?.selfTestValidation).toBeDefined(); + }); + }); + + describe('test detection', () => { + it('should detect test files', async () => { + const ctx = makeCtx({ metadata: { generatedFiles: MIXED_FILES } }); + await hook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.hasTests).toBe(true); + expect(validation.testFileCount).toBe(1); + expect(validation.sourceFileCount).toBe(1); + expect(validation.testRatio).toBe(1); + }); + + it('should detect absence of tests', async () => { + const ctx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await hook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.hasTests).toBe(false); + expect(validation.testFileCount).toBe(0); + }); + + it('should detect vitest framework', async () => { + const ctx = makeCtx({ metadata: { generatedFiles: MIXED_FILES } }); + await hook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.detectedFrameworks).toContain('vitest'); + }); + + it('should detect assertions', async () => { + const ctx = makeCtx({ metadata: { generatedFiles: MIXED_FILES } }); + await hook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.hasAssertions).toBe(true); + }); + }); + + describe('requireTests mode', () => { + it('should pass when requireTests is false (default)', async () => { + const 
ctx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await hook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.passed).toBe(true); + }); + + it('should fail when requireTests is true and no tests provided', async () => { + const strictHook = new SelfTestHook({ requireTests: true }); + const ctx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await strictHook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.passed).toBe(false); + expect(validation.failureReasons).toHaveLength(1); + }); + + it('should pass when requireTests is true and tests are provided', async () => { + const strictHook = new SelfTestHook({ requireTests: true }); + const ctx = makeCtx({ metadata: { generatedFiles: MIXED_FILES } }); + await strictHook.onAfterStep(ctx); + + const validation = ctx.metadata!.selfTestValidation as any; + expect(validation.passed).toBe(true); + }); + }); + + describe('blocking mode', () => { + it('should throw SelfTestError on next onBeforeStep after failed validation', async () => { + const blockingHook = new SelfTestHook({ + requireTests: true, + enforceBlocking: true, + }); + // onAfterStep stores the failure (does NOT throw — runner swallows afterStep errors) + const afterCtx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await blockingHook.onAfterStep(afterCtx); + + // onBeforeStep blocks the next step + const beforeCtx = makeCtx({ stepId: 'step-2' }); + await expect(blockingHook.onBeforeStep(beforeCtx)).rejects.toThrow(SelfTestError); + }); + + it('should not throw when passing in blocking mode', async () => { + const blockingHook = new SelfTestHook({ + requireTests: true, + enforceBlocking: true, + }); + const afterCtx = makeCtx({ metadata: { generatedFiles: MIXED_FILES } }); + await blockingHook.onAfterStep(afterCtx); + + const beforeCtx = makeCtx({ stepId: 'step-2' }); + await 
expect(blockingHook.onBeforeStep(beforeCtx)).resolves.toBeUndefined(); + }); + + it('should call onBlocked callback on onBeforeStep', async () => { + const onBlocked = vi.fn(); + const blockingHook = new SelfTestHook({ + requireTests: true, + enforceBlocking: true, + onBlocked, + }); + const afterCtx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await blockingHook.onAfterStep(afterCtx); + + const beforeCtx = makeCtx({ stepId: 'step-2' }); + await expect(blockingHook.onBeforeStep(beforeCtx)).rejects.toThrow(); + expect(onBlocked).toHaveBeenCalledOnce(); + }); + + it('should not block when enforceBlocking is false', async () => { + const nonBlockingHook = new SelfTestHook({ requireTests: true }); + const afterCtx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await nonBlockingHook.onAfterStep(afterCtx); + + const beforeCtx = makeCtx({ stepId: 'step-2' }); + await expect(nonBlockingHook.onBeforeStep(beforeCtx)).resolves.toBeUndefined(); + }); + + it('should clean up validation state on run end', async () => { + const blockingHook = new SelfTestHook({ + requireTests: true, + enforceBlocking: true, + }); + const afterCtx = makeCtx({ metadata: { generatedFiles: SOURCE_FILES } }); + await blockingHook.onAfterStep(afterCtx); + + // Clean up + await blockingHook.onRunEnd(makeCtx(), true); + + // Should not block after cleanup + const beforeCtx = makeCtx({ stepId: 'step-3' }); + await expect(blockingHook.onBeforeStep(beforeCtx)).resolves.toBeUndefined(); + }); + }); + + describe('minTestRatio', () => { + it('should fail when test ratio is below minimum', async () => { + const ratioHook = new SelfTestHook({ minTestRatio: 0.5 }); + // 2 source files, 0 test files → ratio 0 + const files = [ + ...SOURCE_FILES, + { path: 'src/other.ts', content: 'export const x = 1;', action: 'create' as const, explanation: 'Other' }, + ]; + const ctx = makeCtx({ metadata: { generatedFiles: files } }); + await ratioHook.onAfterStep(ctx); + + const validation = 
ctx.metadata!.selfTestValidation as any; + expect(validation.passed).toBe(false); + }); + }); + + describe('missing metadata', () => { + it('should handle missing metadata', async () => { + const ctx = makeCtx({ metadata: undefined }); + await expect(hook.onAfterStep(ctx)).resolves.toBeUndefined(); + }); + + it('should handle empty files', async () => { + const ctx = makeCtx({ metadata: { generatedFiles: [] } }); + await expect(hook.onAfterStep(ctx)).resolves.toBeUndefined(); + }); + }); + + describe('isEnabled', () => { + it('should be enabled by default', async () => { + expect(await hook.isEnabled()).toBe(true); + }); + + it('should be disabled when env var is false', async () => { + const orig = process.env.GWI_SELF_TEST_HOOK_ENABLED; + process.env.GWI_SELF_TEST_HOOK_ENABLED = 'false'; + try { + expect(await hook.isEnabled()).toBe(false); + } finally { + if (orig === undefined) delete process.env.GWI_SELF_TEST_HOOK_ENABLED; + else process.env.GWI_SELF_TEST_HOOK_ENABLED = orig; + } + }); + }); + + describe('test pattern matching', () => { + it('should recognize .test.ts files', () => { + const files = [{ path: 'foo.test.ts', content: '', action: 'create' as const, explanation: '' }]; + const result = hook.validate(files); + expect(result.testFileCount).toBe(1); + }); + + it('should recognize .spec.js files', () => { + const files = [{ path: 'foo.spec.js', content: '', action: 'create' as const, explanation: '' }]; + const result = hook.validate(files); + expect(result.testFileCount).toBe(1); + }); + + it('should recognize __tests__/ directory', () => { + const files = [{ path: '__tests__/foo.ts', content: '', action: 'create' as const, explanation: '' }]; + const result = hook.validate(files); + expect(result.testFileCount).toBe(1); + }); + + it('should recognize Python test files', () => { + const files = [{ path: 'test_foo.py', content: '', action: 'create' as const, explanation: '' }]; + const result = hook.validate(files); + 
expect(result.testFileCount).toBe(1); + }); + + it('should recognize Go test files', () => { + const files = [{ path: 'foo_test.go', content: '', action: 'create' as const, explanation: '' }]; + const result = hook.validate(files); + expect(result.testFileCount).toBe(1); + }); + }); +}); diff --git a/packages/engine/src/hooks/__tests__/trace-analysis-hook.test.ts b/packages/engine/src/hooks/__tests__/trace-analysis-hook.test.ts new file mode 100644 index 0000000..bdfeef4 --- /dev/null +++ b/packages/engine/src/hooks/__tests__/trace-analysis-hook.test.ts @@ -0,0 +1,256 @@ +/** + * Trace Analysis Hook Tests + */ + +import { describe, it, expect, vi } from 'vitest'; +import { TraceAnalysisHook } from '../trace-analysis-hook.js'; +import type { AgentRunContext } from '../types.js'; +import type { AgentDecisionTrace, DecisionTraceStore } from '@gwi/core'; + +function makeCtx(overrides?: Partial): AgentRunContext { + return { + runId: 'test-run-1', + runType: 'autopilot', + stepId: 'step-1', + agentRole: 'FOREMAN', + stepStatus: 'completed', + timestamp: new Date().toISOString(), + metadata: {}, + ...overrides, + }; +} + +function makeTrace(overrides?: Partial): AgentDecisionTrace { + return { + id: `trace-${Math.random().toString(36).slice(2)}`, + runId: 'test-run-1', + stepId: 'step-1', + agentType: 'coder', + timestamp: new Date(), + tenantId: 'test-tenant', + inputs: { + prompt: 'Test input', + contextWindow: [], + previousSteps: [], + }, + decision: { + action: 'generate_code', + reasoning: 'Generated code based on input', + confidence: 0.8, + alternatives: [], + }, + outcome: { + result: 'success', + determinedAt: new Date(), + }, + metadata: { + durationMs: 5000, + tokensUsed: { input: 1000, output: 500 }, + }, + ...overrides, + }; +} + +function makeMockStore(traces: AgentDecisionTrace[]): DecisionTraceStore { + return { + saveTrace: vi.fn(), + getTrace: vi.fn(), + listTraces: vi.fn().mockResolvedValue(traces), + getTracesForRun: 
vi.fn().mockResolvedValue(traces), + updateOutcome: vi.fn(), + addFeedback: vi.fn(), + findSimilar: vi.fn().mockResolvedValue([]), + getOverriddenTraces: vi.fn().mockResolvedValue([]), + deleteTrace: vi.fn(), + }; +} + +describe('TraceAnalysisHook', () => { + let hook: TraceAnalysisHook; + let store: DecisionTraceStore; + + describe('onAfterStep', () => { + it('should be a no-op', async () => { + store = makeMockStore([]); + hook = new TraceAnalysisHook({}, store); + const ctx = makeCtx(); + await expect(hook.onAfterStep(ctx)).resolves.toBeUndefined(); + }); + }); + + describe('onRunEnd', () => { + it('should analyze traces on run end', async () => { + const traces = [ + makeTrace(), + makeTrace({ agentType: 'reviewer' }), + ]; + store = makeMockStore(traces); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx(); + await hook.onRunEnd(ctx, true); + + expect(store.getTracesForRun).toHaveBeenCalledWith('test-run-1'); + expect(ctx.metadata?.traceAnalysis).toBeDefined(); + const analysis = ctx.metadata!.traceAnalysis as any; + expect(analysis.totalTraces).toBe(2); + expect(analysis.failedTraces).toBe(0); + }); + + it('should detect repeated failures', async () => { + const traces = [ + makeTrace({ agentType: 'coder', outcome: { result: 'failure', determinedAt: new Date() } }), + makeTrace({ agentType: 'coder', outcome: { result: 'failure', determinedAt: new Date() } }), + makeTrace({ agentType: 'coder', outcome: { result: 'failure', determinedAt: new Date() } }), + ]; + store = makeMockStore(traces); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx(); + await hook.onRunEnd(ctx, false); + + const analysis = ctx.metadata!.traceAnalysis as any; + expect(analysis.failedTraces).toBe(3); + expect(analysis.patterns.length).toBeGreaterThan(0); + expect(analysis.patterns[0].category).toBe('repeated_failure'); + expect(analysis.patterns[0].agentType).toBe('coder'); + }); + + it('should detect low confidence 
failures', async () => { + const traces = [ + makeTrace({ + agentType: 'coder', + decision: { action: 'generate_code', reasoning: 'Unsure', confidence: 0.2, alternatives: [] }, + outcome: { result: 'failure', determinedAt: new Date() }, + }), + makeTrace({ + agentType: 'coder', + decision: { action: 'generate_code', reasoning: 'Not confident', confidence: 0.3, alternatives: [] }, + outcome: { result: 'failure', determinedAt: new Date() }, + }), + ]; + store = makeMockStore(traces); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx(); + await hook.onRunEnd(ctx, false); + + const analysis = ctx.metadata!.traceAnalysis as any; + const lowConfPattern = analysis.patterns.find((p: any) => p.category === 'low_confidence_failure'); + expect(lowConfPattern).toBeDefined(); + expect(lowConfPattern.count).toBe(2); + }); + + it('should skip analysis when no traces exist', async () => { + store = makeMockStore([]); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx(); + await hook.onRunEnd(ctx, true); + + expect(ctx.metadata?.traceAnalysis).toBeUndefined(); + }); + + it('should call onAnalysis callback', async () => { + const onAnalysis = vi.fn(); + const traces = [makeTrace()]; + store = makeMockStore(traces); + hook = new TraceAnalysisHook({ logResults: false, onAnalysis }, store); + + await hook.onRunEnd(makeCtx(), true); + + expect(onAnalysis).toHaveBeenCalledOnce(); + expect(onAnalysis).toHaveBeenCalledWith(expect.objectContaining({ totalTraces: 1 })); + }); + + it('should handle store errors gracefully', async () => { + store = makeMockStore([]); + (store.getTracesForRun as any).mockRejectedValue(new Error('Store unavailable')); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx(); + await expect(hook.onRunEnd(ctx, true)).resolves.toBeUndefined(); + }); + }); + + describe('analyzeTraces', () => { + it('should calculate agent failure rates', () => { + store = 
makeMockStore([]); + hook = new TraceAnalysisHook({}, store); + + const traces = [ + makeTrace({ agentType: 'coder', outcome: { result: 'success', determinedAt: new Date() } }), + makeTrace({ agentType: 'coder', outcome: { result: 'failure', determinedAt: new Date() } }), + makeTrace({ agentType: 'reviewer', outcome: { result: 'success', determinedAt: new Date() } }), + ]; + + const result = hook.analyzeTraces('run-1', traces); + expect(result.agentFailureRates.coder).toEqual({ total: 2, failed: 1, rate: 0.5 }); + expect(result.agentFailureRates.reviewer).toEqual({ total: 1, failed: 0, rate: 0 }); + }); + + it('should aggregate token usage', () => { + store = makeMockStore([]); + hook = new TraceAnalysisHook({}, store); + + const traces = [ + makeTrace({ metadata: { durationMs: 1000, tokensUsed: { input: 100, output: 50 } } }), + makeTrace({ metadata: { durationMs: 2000, tokensUsed: { input: 200, output: 100 } } }), + ]; + + const result = hook.analyzeTraces('run-1', traces); + expect(result.totalTokens).toEqual({ input: 300, output: 150 }); + expect(result.avgDurationMs).toBe(1500); + }); + }); + + describe('tenant isolation', () => { + it('should use listTraces with tenantId when tenantId is present', async () => { + const traces = [makeTrace()]; + store = makeMockStore(traces); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx({ tenantId: 'tenant-123' }); + await hook.onRunEnd(ctx, true); + + expect(store.listTraces).toHaveBeenCalledWith({ + runId: 'test-run-1', + tenantId: 'tenant-123', + }); + expect(store.getTracesForRun).not.toHaveBeenCalled(); + }); + + it('should fall back to getTracesForRun when no tenantId', async () => { + const traces = [makeTrace()]; + store = makeMockStore(traces); + hook = new TraceAnalysisHook({ logResults: false }, store); + + const ctx = makeCtx(); // no tenantId + await hook.onRunEnd(ctx, true); + + expect(store.getTracesForRun).toHaveBeenCalledWith('test-run-1'); + 
expect(store.listTraces).not.toHaveBeenCalled(); + }); + }); + + describe('isEnabled', () => { + it('should be enabled by default', async () => { + store = makeMockStore([]); + hook = new TraceAnalysisHook({}, store); + expect(await hook.isEnabled()).toBe(true); + }); + + it('should be disabled when env var is false', async () => { + const orig = process.env.GWI_TRACE_ANALYSIS_ENABLED; + process.env.GWI_TRACE_ANALYSIS_ENABLED = 'false'; + try { + store = makeMockStore([]); + hook = new TraceAnalysisHook({}, store); + expect(await hook.isEnabled()).toBe(false); + } finally { + if (orig === undefined) delete process.env.GWI_TRACE_ANALYSIS_ENABLED; + else process.env.GWI_TRACE_ANALYSIS_ENABLED = orig; + } + }); + }); +}); diff --git a/packages/engine/src/hooks/budget-management-hook.ts b/packages/engine/src/hooks/budget-management-hook.ts new file mode 100644 index 0000000..d5f2c79 --- /dev/null +++ b/packages/engine/src/hooks/budget-management-hook.ts @@ -0,0 +1,281 @@ +/** + * Budget Management Hook + * + * Harness Engineering Pattern 6: Time/Token Budgeting + * + * Tracks cumulative token usage and cost per run. Injects budget + * warnings into context metadata when approaching limits. For autopilot + * mode, triggers early verification when budget is nearly spent. 
// =============================================================================
// Types
// =============================================================================

/**
 * Snapshot of the cumulative resource consumption of a single run.
 */
export interface BudgetStatus {
  /** Total input tokens consumed */
  totalInputTokens: number;
  /** Total output tokens consumed */
  totalOutputTokens: number;
  /** Estimated cost in USD */
  estimatedCostUsd: number;
  /** Percentage of token budget consumed (0 = nothing used, 100 = limit reached; may exceed 100) */
  tokenBudgetPercent: number;
  /** Percentage of cost budget consumed (0 = nothing used, 100 = limit reached; may exceed 100) */
  costBudgetPercent: number;
  /** Number of steps completed */
  stepsCompleted: number;
  /** Total duration so far in ms */
  totalDurationMs: number;
  /** Budget warning level, derived from the larger of the two percentages */
  warningLevel: 'none' | 'approaching' | 'critical' | 'exceeded';
}

/**
 * Configuration for budget management
 */
export interface BudgetManagementConfig {
  /** Maximum total tokens (input + output) per run. @default 500000 */
  maxTotalTokens: number;
  /** Maximum cost in USD per run. @default 5.0 */
  maxCostUsd: number;
  /** Warning threshold as percentage (0-100). @default 75 */
  warningThresholdPercent: number;
  /** Critical threshold as percentage (0-100). @default 90 */
  criticalThresholdPercent: number;
  /** Block on budget exceeded. @default false */
  enforceBlocking: boolean;
  /** Approximate cost per 1K input tokens in USD. @default 0.003 */
  costPer1kInputTokens: number;
  /** Approximate cost per 1K output tokens in USD. @default 0.015 */
  costPer1kOutputTokens: number;
  /**
   * Callback when budget is exceeded. Awaited immediately before
   * BudgetExceededError is thrown, i.e. only when enforceBlocking is true.
   */
  onBudgetExceeded?: (ctx: AgentRunContext, status: BudgetStatus) => Promise<void>;
}

/**
 * Default configuration
 */
export const DEFAULT_BUDGET_CONFIG: BudgetManagementConfig = {
  maxTotalTokens: 500_000,
  maxCostUsd: 5.0,
  warningThresholdPercent: 75,
  criticalThresholdPercent: 90,
  enforceBlocking: false,
  costPer1kInputTokens: 0.003,
  costPer1kOutputTokens: 0.015,
};

// =============================================================================
// Error
// =============================================================================

/**
 * Error thrown when budget is exceeded in blocking mode
 */
export class BudgetExceededError extends Error {
  /**
   * @param message - Human-readable description of the overrun
   * @param budgetStatus - Status snapshot taken when the overrun was detected
   */
  constructor(
    message: string,
    public readonly budgetStatus: BudgetStatus,
  ) {
    super(message);
    this.name = 'BudgetExceededError';
  }
}

// =============================================================================
// Hook Implementation
// =============================================================================

/**
 * Raw per-run counters from which a BudgetStatus is derived.
 */
interface RunBudget {
  inputTokens: number;
  outputTokens: number;
  stepsCompleted: number;
  /** Epoch milliseconds at which tracking for the run started */
  startTime: number;
}

/**
 * Budget Management Hook
 *
 * Accumulates token usage and cost across steps in a run.
 * Injects warning signals when approaching budget limits.
 */
export class BudgetManagementHook implements AgentHook {
  readonly name = 'budget-management';
  private config: BudgetManagementConfig;

  /** Map of runId → accumulated usage counters */
  private runBudgets = new Map<string, RunBudget>();

  /**
   * @param config - Overrides merged on top of DEFAULT_BUDGET_CONFIG
   */
  constructor(config?: Partial<BudgetManagementConfig>) {
    this.config = { ...DEFAULT_BUDGET_CONFIG, ...config };
  }

  /**
   * Initialize budget tracking on run start (idempotent per runId)
   */
  async onRunStart(ctx: AgentRunContext): Promise<void> {
    // Guard: autopilot calls runStart multiple times per runId
    if (this.runBudgets.has(ctx.runId)) return;

    this.runBudgets.set(ctx.runId, {
      inputTokens: 0,
      outputTokens: 0,
      stepsCompleted: 0,
      startTime: Date.now(),
    });
  }

  /**
   * Check budget before step and inject warnings
   *
   * @throws BudgetExceededError when the budget is exceeded and
   *   enforceBlocking is enabled
   */
  async onBeforeStep(ctx: AgentRunContext): Promise<void> {
    const budget = this.runBudgets.get(ctx.runId);
    if (!budget) return;

    const status = this.calculateStatus(budget);

    // Inject budget status (and a warning, if any) into metadata
    if (ctx.metadata) {
      ctx.metadata.budgetStatus = status;

      const warning = this.buildWarning(status);
      if (warning !== undefined) {
        ctx.metadata.budgetWarning = warning;
      }
    }

    // Block if exceeded and enforcement is on
    if (status.warningLevel === 'exceeded' && this.config.enforceBlocking) {
      await this.config.onBudgetExceeded?.(ctx, status);
      throw new BudgetExceededError(
        `Run budget exceeded: ${status.tokenBudgetPercent.toFixed(0)}% tokens, $${status.estimatedCostUsd.toFixed(4)} cost`,
        status,
      );
    }
  }

  /**
   * Accumulate usage after each step
   */
  async onAfterStep(ctx: AgentRunContext): Promise<void> {
    const budget = this.runBudgets.get(ctx.runId);
    if (!budget) return;

    // Accumulate token usage
    if (ctx.tokensUsed) {
      budget.inputTokens += ctx.tokensUsed.input;
      budget.outputTokens += ctx.tokensUsed.output;
    }
    budget.stepsCompleted++;

    const status = this.calculateStatus(budget);

    // Attach status to metadata
    if (ctx.metadata) {
      ctx.metadata.budgetStatus = status;
    }

    if (status.warningLevel !== 'none') {
      logger.warn('Budget warning', {
        runId: ctx.runId,
        level: status.warningLevel,
        tokenPercent: status.tokenBudgetPercent.toFixed(1),
        costUsd: status.estimatedCostUsd.toFixed(4),
        stepsCompleted: status.stepsCompleted,
      });
    }
  }

  /**
   * Log a summary and clean up on run end
   */
  async onRunEnd(ctx: AgentRunContext, _success: boolean): Promise<void> {
    const budget = this.runBudgets.get(ctx.runId);
    if (budget) {
      const status = this.calculateStatus(budget);
      logger.info('Run budget summary', {
        runId: ctx.runId,
        totalInputTokens: status.totalInputTokens,
        totalOutputTokens: status.totalOutputTokens,
        estimatedCostUsd: status.estimatedCostUsd.toFixed(4),
        stepsCompleted: status.stepsCompleted,
        durationMs: status.totalDurationMs,
      });
    }
    this.runBudgets.delete(ctx.runId);
  }

  /**
   * Check if this hook is enabled (opt-out via GWI_BUDGET_MANAGEMENT_ENABLED=false)
   */
  async isEnabled(): Promise<boolean> {
    return process.env.GWI_BUDGET_MANAGEMENT_ENABLED !== 'false';
  }

  /**
   * Build the agent-facing warning text for a status, or undefined when
   * no warning applies.
   *
   * The reported percentage is the larger of the token and cost percentages,
   * because that is the value the warning level is derived from. Reporting
   * only the token percentage would understate a cost-driven warning.
   */
  private buildWarning(status: BudgetStatus): string | undefined {
    const consumed = Math.max(status.tokenBudgetPercent, status.costBudgetPercent).toFixed(0);

    switch (status.warningLevel) {
      case 'approaching':
        return (
          `Budget ${consumed}% consumed. ` +
          `Prioritize completing current work and verifying output.`
        );
      case 'critical':
        return (
          `Budget ${consumed}% consumed (CRITICAL). ` +
          `Wrap up immediately. Skip non-essential work. Verify and finalize.`
        );
      case 'exceeded':
        // Without this branch a non-blocking run silently loses its warning
        // the moment it crosses 100%.
        return (
          `Budget ${consumed}% consumed (EXCEEDED). ` +
          `Stop starting new work. Verify and finalize now.`
        );
      default:
        return undefined;
    }
  }

  /**
   * Calculate current budget status
   *
   * @param budget - Raw counters accumulated for the run
   * @returns Derived totals, percentages and warning level
   */
  private calculateStatus(budget: RunBudget): BudgetStatus {
    const totalTokens = budget.inputTokens + budget.outputTokens;
    const estimatedCostUsd =
      (budget.inputTokens / 1000) * this.config.costPer1kInputTokens +
      (budget.outputTokens / 1000) * this.config.costPer1kOutputTokens;

    const tokenBudgetPercent = (totalTokens / this.config.maxTotalTokens) * 100;
    const costBudgetPercent = (estimatedCostUsd / this.config.maxCostUsd) * 100;

    // Whichever budget is closer to its limit drives the warning level
    const maxPercent = Math.max(tokenBudgetPercent, costBudgetPercent);

    let warningLevel: BudgetStatus['warningLevel'] = 'none';
    if (maxPercent >= 100) {
      warningLevel = 'exceeded';
    } else if (maxPercent >= this.config.criticalThresholdPercent) {
      warningLevel = 'critical';
    } else if (maxPercent >= this.config.warningThresholdPercent) {
      warningLevel = 'approaching';
    }

    return {
      totalInputTokens: budget.inputTokens,
      totalOutputTokens: budget.outputTokens,
      estimatedCostUsd,
      tokenBudgetPercent,
      costBudgetPercent,
      stepsCompleted: budget.stepsCompleted,
      totalDurationMs: Date.now() - budget.startTime,
      warningLevel,
    };
  }
}

// =============================================================================
// Factory
// =============================================================================

/**
 * Create a budget management hook
 *
 * @param config - Overrides merged on top of DEFAULT_BUDGET_CONFIG
 */
export function createBudgetManagementHook(
  config?: Partial<BudgetManagementConfig>,
): BudgetManagementHook {
  return new BudgetManagementHook(config);
}
(default: true) + * - GWI_SELF_TEST_HOOK_ENABLED: Enable self-test validation hook (default: true) + * - GWI_ENVIRONMENT_ONBOARDING_ENABLED: Enable environment onboarding hook (default: true) + * - GWI_LOOP_DETECTION_ENABLED: Enable loop detection hook (default: true) + * - GWI_BUDGET_MANAGEMENT_ENABLED: Enable budget management hook (default: true) * * @module @gwi/engine/hooks */ @@ -22,6 +27,11 @@ import { AgentHookRunner } from './runner.js'; import { DecisionTraceHook } from './decision-trace-hook.js'; import { CodeQualityHook } from './code-quality-hook.js'; import { RiskEnforcementHook } from './risk-enforcement-hook.js'; +import { TraceAnalysisHook } from './trace-analysis-hook.js'; +import { SelfTestHook } from './self-test-hook.js'; +import { EnvironmentOnboardingHook } from './environment-onboarding-hook.js'; +import { LoopDetectionHook } from './loop-detection-hook.js'; +import { BudgetManagementHook } from './budget-management-hook.js'; const logger = getLogger('hooks'); @@ -85,6 +95,24 @@ export async function buildDefaultHookRunner(): Promise { } } + // Register harness engineering hooks (all default ON, opt-out via env) + const harnessHooks: Array<{ envVar: string; create: () => AgentHook; label: string }> = [ + { envVar: 'GWI_TRACE_ANALYSIS_ENABLED', create: () => new TraceAnalysisHook(), label: 'Trace analysis' }, + { envVar: 'GWI_SELF_TEST_HOOK_ENABLED', create: () => new SelfTestHook(), label: 'Self-test' }, + { envVar: 'GWI_ENVIRONMENT_ONBOARDING_ENABLED', create: () => new EnvironmentOnboardingHook(), label: 'Environment onboarding' }, + { envVar: 'GWI_LOOP_DETECTION_ENABLED', create: () => new LoopDetectionHook(), label: 'Loop detection' }, + { envVar: 'GWI_BUDGET_MANAGEMENT_ENABLED', create: () => new BudgetManagementHook(), label: 'Budget management' }, + ]; + + for (const { envVar, create, label } of harnessHooks) { + if (process.env[envVar] !== 'false') { + runner.register(create()); + if (config.debug) { + logger.debug(`${label} hook 
// =============================================================================
// Types
// =============================================================================

/**
 * Discovered environment information
 */
export interface EnvironmentProfile {
  /** Primary language(s) detected */
  languages: string[];
  /** Frameworks/libraries detected */
  frameworks: string[];
  /** Package managers detected */
  packageManagers: string[];
  /** Test runners detected */
  testRunners: string[];
  /** Build tools detected */
  buildTools: string[];
  /** Whether the project is a monorepo */
  isMonorepo: boolean;
  /** Key config files found (paths as they appeared in the file list) */
  configFiles: string[];
  /** Repository root (if available) */
  repoRoot?: string;
}

/**
 * Configuration for environment onboarding
 */
export interface EnvironmentOnboardingConfig {
  /** Whether to inject profile into all step contexts. @default true */
  injectIntoSteps: boolean;
  /** Callback when environment is profiled */
  onProfiled?: (profile: EnvironmentProfile) => Promise<void>;
}

/**
 * Default configuration
 */
export const DEFAULT_ENV_ONBOARDING_CONFIG: EnvironmentOnboardingConfig = {
  injectIntoSteps: true,
};

// =============================================================================
// Detection Rules
// =============================================================================

/**
 * What the presence of one well-known file says about a project.
 */
interface ConfigFileSignal {
  /** Exact base name of the file (no directory part) */
  file: string;
  language?: string;
  framework?: string;
  packageManager?: string;
  testRunner?: string;
  buildTool?: string;
  monorepo?: boolean;
}

/**
 * Map of config files to what they indicate
 */
const CONFIG_FILE_SIGNALS: ConfigFileSignal[] = [
  // JavaScript/TypeScript ecosystem
  { file: 'package.json', language: 'typescript', packageManager: 'npm' },
  { file: 'package-lock.json', packageManager: 'npm' },
  { file: 'yarn.lock', packageManager: 'yarn' },
  { file: 'pnpm-lock.yaml', packageManager: 'pnpm' },
  { file: 'bun.lockb', packageManager: 'bun' },
  { file: 'tsconfig.json', language: 'typescript' },
  { file: 'vitest.config.ts', testRunner: 'vitest' },
  { file: 'vitest.config.js', testRunner: 'vitest' },
  { file: 'jest.config.ts', testRunner: 'jest' },
  { file: 'jest.config.js', testRunner: 'jest' },
  { file: '.eslintrc.json', buildTool: 'eslint' },
  { file: 'eslint.config.js', buildTool: 'eslint' },
  { file: 'turbo.json', buildTool: 'turbo', monorepo: true },
  { file: 'nx.json', buildTool: 'nx', monorepo: true },
  { file: 'lerna.json', monorepo: true },
  { file: 'next.config.js', framework: 'nextjs' },
  { file: 'next.config.ts', framework: 'nextjs' },
  { file: 'vite.config.ts', buildTool: 'vite' },
  { file: 'webpack.config.js', buildTool: 'webpack' },

  // Python ecosystem
  { file: 'pyproject.toml', language: 'python', packageManager: 'pip' },
  { file: 'setup.py', language: 'python', packageManager: 'pip' },
  { file: 'requirements.txt', language: 'python', packageManager: 'pip' },
  { file: 'Pipfile', language: 'python', packageManager: 'pipenv' },
  { file: 'poetry.lock', language: 'python', packageManager: 'poetry' },
  { file: 'pytest.ini', testRunner: 'pytest' },
  { file: 'conftest.py', testRunner: 'pytest' },

  // Go ecosystem
  { file: 'go.mod', language: 'go', packageManager: 'go-modules' },
  { file: 'go.sum', language: 'go' },

  // Rust ecosystem
  { file: 'Cargo.toml', language: 'rust', packageManager: 'cargo' },
  { file: 'Cargo.lock', language: 'rust' },

  // Java/JVM ecosystem
  { file: 'pom.xml', language: 'java', buildTool: 'maven' },
  { file: 'build.gradle', language: 'java', buildTool: 'gradle' },
  { file: 'build.gradle.kts', language: 'kotlin', buildTool: 'gradle' },

  // Infrastructure
  { file: 'Dockerfile', buildTool: 'docker' },
  { file: 'docker-compose.yml', buildTool: 'docker-compose' },
  { file: 'terraform.tf', buildTool: 'terraform' },
  { file: 'main.tf', buildTool: 'terraform' },
];

/**
 * CONFIG_FILE_SIGNALS indexed by base name, so each scanned path costs one
 * lookup instead of a pass over the whole table. File names in the table are
 * unique, so the index loses nothing.
 */
const SIGNALS_BY_FILE_NAME = new Map<string, ConfigFileSignal>(
  CONFIG_FILE_SIGNALS.map((signal): [string, ConfigFileSignal] => [signal.file, signal]),
);

/**
 * File extension (text after the last dot, case-sensitive) → language label.
 * JavaScript extensions are reported under the same 'typescript' label as
 * TypeScript ones.
 */
const LANGUAGE_BY_EXTENSION = new Map<string, string>([
  ['js', 'typescript'],
  ['jsx', 'typescript'],
  ['ts', 'typescript'],
  ['tsx', 'typescript'],
  ['py', 'python'],
  ['go', 'go'],
  ['rs', 'rust'],
  ['java', 'java'],
  ['kt', 'kotlin'],
  ['rb', 'ruby'],
]);

/** Top-level directory names treated as a monorepo layout signal */
const MONOREPO_DIR_PATTERN = /^(packages|apps|libs|modules)\//;

/** Upper bound on how many file paths are scanned per profile */
const MAX_SCANNED_FILES = 10_000;

// =============================================================================
// Hook Implementation
// =============================================================================

/**
 * Environment Onboarding Hook
 *
 * On run start, analyzes context metadata to build an environment profile.
 * The profile is injected into subsequent step contexts to give agents
 * immediate awareness of the project's tech stack.
 */
export class EnvironmentOnboardingHook implements AgentHook {
  readonly name = 'environment-onboarding';
  private config: EnvironmentOnboardingConfig;

  /** Map of runId → profile computed at run start */
  private runProfiles = new Map<string, EnvironmentProfile>();

  /**
   * @param config - Overrides merged on top of DEFAULT_ENV_ONBOARDING_CONFIG
   */
  constructor(config?: Partial<EnvironmentOnboardingConfig>) {
    this.config = { ...DEFAULT_ENV_ONBOARDING_CONFIG, ...config };
  }

  /**
   * Profile the environment on run start
   *
   * Failures (including a rejecting onProfiled callback) are logged and
   * swallowed so that profiling never aborts a run.
   */
  async onRunStart(ctx: AgentRunContext): Promise<void> {
    try {
      const profile = this.profileFromContext(ctx);
      this.runProfiles.set(ctx.runId, profile);

      // Inject into current context
      if (ctx.metadata) {
        ctx.metadata.environmentProfile = profile;
      }

      logger.info('Environment profiled', {
        runId: ctx.runId,
        languages: profile.languages,
        frameworks: profile.frameworks,
        testRunners: profile.testRunners,
        isMonorepo: profile.isMonorepo,
      });

      await this.config.onProfiled?.(profile);
    } catch (error) {
      logger.error('Environment profiling failed', {
        runId: ctx.runId,
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }

  /**
   * No-op — profile injection happens in onBeforeStep only
   */
  async onAfterStep(_ctx: AgentRunContext): Promise<void> {
    // Profile is injected via onBeforeStep; no post-step action needed
  }

  /**
   * Inject profile before each step (skipped when injectIntoSteps is false)
   */
  async onBeforeStep(ctx: AgentRunContext): Promise<void> {
    if (!this.config.injectIntoSteps) return;

    const profile = this.runProfiles.get(ctx.runId);
    if (profile && ctx.metadata) {
      ctx.metadata.environmentProfile = profile;
    }
  }

  /**
   * Clean up on run end
   */
  async onRunEnd(ctx: AgentRunContext, _success: boolean): Promise<void> {
    this.runProfiles.delete(ctx.runId);
  }

  /**
   * Check if this hook is enabled (opt-out via GWI_ENVIRONMENT_ONBOARDING_ENABLED=false)
   */
  async isEnabled(): Promise<boolean> {
    return process.env.GWI_ENVIRONMENT_ONBOARDING_ENABLED !== 'false';
  }

  /**
   * Build an environment profile from context metadata
   *
   * Reads `metadata.files` (only string entries, at most the first 10,000 of
   * them) and `metadata.repoRoot` (only if it is a string). Performs no
   * filesystem access.
   *
   * @param ctx - Run context whose metadata carries the file list
   * @returns The detected profile; all lists are empty when no usable file
   *   list is present
   */
  profileFromContext(ctx: AgentRunContext): EnvironmentProfile {
    const languages = new Set<string>();
    const frameworks = new Set<string>();
    const packageManagers = new Set<string>();
    const testRunners = new Set<string>();
    const buildTools = new Set<string>();
    const configFiles: string[] = [];
    let isMonorepo = false;

    // Extract and validate file list from metadata (populated by triage or previous steps)
    const rawFiles = ctx.metadata?.files;
    const files: string[] = Array.isArray(rawFiles)
      ? rawFiles.filter((f): f is string => typeof f === 'string').slice(0, MAX_SCANNED_FILES)
      : [];
    const rawRoot = ctx.metadata?.repoRoot;
    const repoRoot = typeof rawRoot === 'string' ? rawRoot : undefined;

    for (const filePath of files) {
      // Match the base name against known config signals
      const fileName = filePath.split('/').pop() ?? filePath;
      const signal = SIGNALS_BY_FILE_NAME.get(fileName);
      if (signal) {
        configFiles.push(filePath);
        if (signal.language) languages.add(signal.language);
        if (signal.framework) frameworks.add(signal.framework);
        if (signal.packageManager) packageManagers.add(signal.packageManager);
        if (signal.testRunner) testRunners.add(signal.testRunner);
        if (signal.buildTool) buildTools.add(signal.buildTool);
        if (signal.monorepo) isMonorepo = true;
      }

      // Infer language from the file extension
      const dotIndex = filePath.lastIndexOf('.');
      if (dotIndex >= 0) {
        const language = LANGUAGE_BY_EXTENSION.get(filePath.slice(dotIndex + 1));
        if (language) languages.add(language);
      }
    }

    // Check for monorepo signals in directory structure
    if (files.some((f) => MONOREPO_DIR_PATTERN.test(f))) {
      isMonorepo = true;
    }

    return {
      languages: [...languages],
      frameworks: [...frameworks],
      packageManagers: [...packageManagers],
      testRunners: [...testRunners],
      buildTools: [...buildTools],
      isMonorepo,
      configFiles,
      repoRoot,
    };
  }
}

// =============================================================================
// Factory
// =============================================================================

/**
 * Create an environment onboarding hook
 *
 * @param config - Overrides merged on top of DEFAULT_ENV_ONBOARDING_CONFIG
 */
export function createEnvironmentOnboardingHook(
  config?: Partial<EnvironmentOnboardingConfig>,
): EnvironmentOnboardingHook {
  return new EnvironmentOnboardingHook(config);
}
// =============================================================================
// Types
// =============================================================================

/**
 * Loop detection result attached to context metadata
 */
export interface LoopDetectionResult {
  /** Whether a loop was detected */
  loopDetected: boolean;
  /** Number of similar consecutive outputs */
  similarCount: number;
  /** Similarity score of last two outputs (0-1) */
  lastSimilarity: number;
  /** Suggested action */
  suggestion?: string;
}

/**
 * Configuration for loop detection
 */
export interface LoopDetectionConfig {
  /** Similarity threshold to consider outputs "the same" (0-1). @default 0.8 */
  similarityThreshold: number;
  /** Number of similar outputs before flagging. @default 3 */
  maxSimilarOutputs: number;
  /** Maximum outputs to track per run. @default 10 */
  maxTrackedOutputs: number;
  /** Block on loop detection (throw) or just warn. @default false */
  enforceBlocking: boolean;
  /** Callback when loop is detected */
  onLoopDetected?: (ctx: AgentRunContext, result: LoopDetectionResult) => Promise<void>;
}

/**
 * Default configuration
 */
export const DEFAULT_LOOP_DETECTION_CONFIG: LoopDetectionConfig = {
  similarityThreshold: 0.8,
  maxSimilarOutputs: 3,
  maxTrackedOutputs: 10,
  enforceBlocking: false,
};

// =============================================================================
// Error
// =============================================================================

/**
 * Error thrown when a loop is detected in blocking mode
 */
export class LoopDetectionError extends Error {
  /**
   * @param message - Human-readable description of the detected loop
   * @param detection - The detection result that triggered the error
   */
  constructor(
    message: string,
    public readonly detection: LoopDetectionResult,
  ) {
    super(message);
    this.name = 'LoopDetectionError';
  }
}
/**
 * Calculate bigram-based similarity between two strings.
 *
 * Computes Dice's coefficient over character bigrams, counting repeated
 * bigrams (multiset semantics). Runs of whitespace are collapsed to a single
 * space and the ends trimmed before comparison, so formatting-only
 * differences score as identical.
 *
 * @param a - First string
 * @param b - Second string
 * @returns A value between 0 (no shared bigrams) and 1 (identical). Returns 0
 *   when either input is empty, or when either normalized string is shorter
 *   than two characters and the two are not equal.
 */
export function calculateSimilarity(a: string, b: string): number {
  if (a === b) return 1;
  if (a.length === 0 || b.length === 0) return 0;

  // Normalize whitespace for comparison
  const normA = a.replace(/\s+/g, ' ').trim();
  const normB = b.replace(/\s+/g, ' ').trim();

  if (normA === normB) return 1;

  const bigramsA = getBigrams(normA);
  const bigramsB = getBigrams(normB);

  // A string with fewer than two characters has no bigrams to compare
  if (bigramsA.size === 0 || bigramsB.size === 0) return 0;

  // Multiset intersection: a bigram counts as often as it occurs in both
  let intersectionSize = 0;
  for (const [bigram, countA] of bigramsA) {
    const countB = bigramsB.get(bigram) ?? 0;
    intersectionSize += Math.min(countA, countB);
  }

  const totalSize = sumValues(bigramsA) + sumValues(bigramsB);
  return (2 * intersectionSize) / totalSize;
}

/**
 * Count every overlapping two-character window of `str`.
 *
 * @returns Map of bigram → number of occurrences (empty for strings shorter
 *   than two characters)
 */
function getBigrams(str: string): Map<string, number> {
  const bigrams = new Map<string, number>();
  for (let i = 0; i < str.length - 1; i++) {
    const bigram = str.substring(i, i + 2);
    bigrams.set(bigram, (bigrams.get(bigram) ?? 0) + 1);
  }
  return bigrams;
}

/**
 * Sum all counts stored in a bigram map.
 */
function sumValues(map: Map<string, number>): number {
  let sum = 0;
  for (const v of map.values()) sum += v;
  return sum;
}
/**
 * Loop Detection Hook
 *
 * Tracks CODER outputs per run and detects semantic loops
 * where the agent produces near-identical code repeatedly.
 */
export class LoopDetectionHook implements AgentHook {
  readonly name = 'loop-detection';
  private config: LoopDetectionConfig;

  /** Map of runId → list of output summaries for that run (oldest first) */
  private runOutputs = new Map<string, string[]>();

  /**
   * @param config - Overrides merged on top of DEFAULT_LOOP_DETECTION_CONFIG
   */
  constructor(config?: Partial<LoopDetectionConfig>) {
    this.config = { ...DEFAULT_LOOP_DETECTION_CONFIG, ...config };
  }

  /**
   * Track CODER outputs and detect loops
   *
   * Steps from other roles, and CODER steps with an empty or missing
   * outputSummary, are ignored and leave metadata untouched.
   *
   * @throws LoopDetectionError when a loop is detected and enforceBlocking is
   *   enabled
   */
  async onAfterStep(ctx: AgentRunContext): Promise<void> {
    // Only track CODER role outputs
    if (ctx.agentRole !== 'CODER') {
      return;
    }

    const output = ctx.outputSummary ?? '';
    if (!output) return;

    // Get or create output history for this run
    let outputs = this.runOutputs.get(ctx.runId);
    if (!outputs) {
      outputs = [];
      this.runOutputs.set(ctx.runId, outputs);
    }

    // Walk history newest → oldest, counting the unbroken streak of outputs
    // similar to the current one
    let consecutiveSimilar = 0;
    let lastSimilarity = 0;

    for (let i = outputs.length - 1; i >= 0; i--) {
      const similarity = calculateSimilarity(output, outputs[i]);
      if (i === outputs.length - 1) {
        lastSimilarity = similarity;
      }
      if (similarity >= this.config.similarityThreshold) {
        consecutiveSimilar++;
      } else {
        break; // Stop at first dissimilar output
      }
    }

    // Add current output to history
    outputs.push(output);

    // Trim history to max tracked
    if (outputs.length > this.config.maxTrackedOutputs) {
      outputs.splice(0, outputs.length - this.config.maxTrackedOutputs);
    }

    // The current output is part of the streak, hence the "- 1"
    const loopDetected = consecutiveSimilar >= this.config.maxSimilarOutputs - 1;

    const result: LoopDetectionResult = {
      loopDetected,
      similarCount: consecutiveSimilar + 1, // Include current output
      lastSimilarity,
      suggestion: loopDetected
        ? 'Agent appears stuck in a loop producing similar outputs. Consider a fundamentally different approach, different model, or simplified input.'
        : undefined,
    };

    // Attach to metadata
    if (ctx.metadata) {
      ctx.metadata.loopDetection = result;
      if (loopDetected) {
        // Inject nudge for the agent's next iteration
        ctx.metadata.loopNudge =
          'WARNING: You have produced similar output multiple times. ' +
          'Try a fundamentally different approach. Consider: (1) breaking the problem into smaller pieces, ' +
          '(2) using a different algorithm or pattern, (3) simplifying your solution.';
      } else {
        // The CODER has broken out of the loop: drop any nudge left over from
        // an earlier step so a stale warning does not keep steering the agent
        // if the metadata object is reused across steps.
        delete ctx.metadata.loopNudge;
      }
    }

    if (loopDetected) {
      logger.warn('Loop detected in CODER output', {
        runId: ctx.runId,
        similarCount: result.similarCount,
        lastSimilarity: result.lastSimilarity.toFixed(3),
      });

      await this.config.onLoopDetected?.(ctx, result);

      if (this.config.enforceBlocking) {
        // NOTE(review): SelfTestHook in this package documents that
        // onBeforeStep errors propagate and treats onAfterStep as
        // observational. Confirm the hook runner actually surfaces an error
        // thrown from onAfterStep; if it does not, blocking must move to
        // onBeforeStep as SelfTestHook does.
        throw new LoopDetectionError(
          `Loop detected: ${result.similarCount} similar CODER outputs (similarity: ${result.lastSimilarity.toFixed(3)})`,
          result,
        );
      }
    }
  }

  /**
   * Clean up on run end
   */
  async onRunEnd(ctx: AgentRunContext, _success: boolean): Promise<void> {
    this.runOutputs.delete(ctx.runId);
  }

  /**
   * Check if this hook is enabled (opt-out via GWI_LOOP_DETECTION_ENABLED=false)
   */
  async isEnabled(): Promise<boolean> {
    return process.env.GWI_LOOP_DETECTION_ENABLED !== 'false';
  }
}

// =============================================================================
// Factory
// =============================================================================

/**
 * Create a loop detection hook
 *
 * @param config - Overrides merged on top of DEFAULT_LOOP_DETECTION_CONFIG
 */
export function createLoopDetectionHook(
  config?: Partial<LoopDetectionConfig>,
): LoopDetectionHook {
  return new LoopDetectionHook(config);
}
// =============================================================================
// Types
// =============================================================================

/**
 * Result of self-test validation
 */
export interface SelfTestValidation {
  /** Whether tests were included with the generated code */
  hasTests: boolean;
  /** Number of test files found */
  testFileCount: number;
  /** Number of source files found */
  sourceFileCount: number;
  /** Test-to-source ratio */
  testRatio: number;
  /** Detected test framework patterns */
  detectedFrameworks: string[];
  /** Whether code contains assertion-like patterns */
  hasAssertions: boolean;
  /** Validation passed */
  passed: boolean;
  /** Reasons for failure */
  failureReasons: string[];
}

/**
 * Configuration for self-test hook
 */
export interface SelfTestConfig {
  /** Require tests for generated code. @default false (warn only) */
  requireTests: boolean;
  /** Minimum test-to-source ratio. @default 0 (no minimum) */
  minTestRatio: number;
  /** Block on failure (throw) or just warn. @default false */
  enforceBlocking: boolean;
  /** File patterns considered test files */
  testPatterns: RegExp[];
  /** Callback when validation fails */
  onBlocked?: (ctx: AgentRunContext, reason: string) => Promise<void>;
}

/**
 * Default configuration
 */
export const DEFAULT_SELF_TEST_CONFIG: SelfTestConfig = {
  requireTests: false,
  minTestRatio: 0,
  enforceBlocking: false,
  testPatterns: [
    /\.test\.[jt]sx?$/,
    /\.spec\.[jt]sx?$/,
    /__tests__\//,
    /\.test\.py$/,
    // Anchored to the start of the base name: an unanchored /test_.*\.py$/
    // also matches source files such as "latest_utils.py".
    /(^|\/)test_[^/]*\.py$/,
    // pytest also collects "*_test.py"
    /_test\.py$/,
    /\.test\.go$/,
    /_test\.go$/,
  ],
};

// =============================================================================
// Test Framework Detection Patterns
// =============================================================================

/**
 * Content patterns paired with the framework label they indicate.
 * These are heuristics: for example `describe(` is reported as 'mocha'
 * although other runners use the same call.
 */
const FRAMEWORK_PATTERNS: Array<{ pattern: RegExp; name: string }> = [
  { pattern: /import\s+.*\bvitest\b|from\s+['"]vitest['"]/, name: 'vitest' },
  { pattern: /import\s+.*\bjest\b|from\s+['"]@jest\//, name: 'jest' },
  { pattern: /import\s+.*\bmocha\b|describe\s*\(/, name: 'mocha' },
  { pattern: /import\s+.*\bpytest\b|def\s+test_/, name: 'pytest' },
  { pattern: /import\s+.*testing\b|func\s+Test/, name: 'go-testing' },
  { pattern: /assert\s*\(|expect\s*\(|assertEquals/, name: 'assertions' },
];

// =============================================================================
// Error
// =============================================================================

/**
 * Error thrown when self-test validation fails in blocking mode
 */
export class SelfTestError extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param validation - The validation result that triggered the error
   */
  constructor(
    message: string,
    public readonly validation: SelfTestValidation,
  ) {
    super(message);
    this.name = 'SelfTestError';
  }
}
+ * Runs after CODER steps and attaches validation metadata + * for the Reviewer agent to consume. + */ +export class SelfTestHook implements AgentHook { + readonly name = 'self-test'; + private config: SelfTestConfig; + + /** Stored validation results per run for onBeforeStep blocking */ + private runValidations = new Map(); + + constructor(config?: Partial) { + this.config = { ...DEFAULT_SELF_TEST_CONFIG, ...config }; + } + + /** + * Block the next step if the previous CODER output failed validation + * (onBeforeStep errors propagate — this is the correct place to block) + */ + async onBeforeStep(ctx: AgentRunContext): Promise { + if (!this.config.enforceBlocking) return; + + const stored = this.runValidations.get(ctx.runId); + if (!stored || stored.passed) return; + + const reason = `Self-test validation failed: ${stored.failureReasons.join('; ')}`; + await this.config.onBlocked?.(ctx, reason); + throw new SelfTestError(reason, stored); + } + + /** + * Validate CODER output for test inclusion (observational — never throws) + */ + async onAfterStep(ctx: AgentRunContext): Promise { + // Only activate for CODER role + if (ctx.agentRole !== 'CODER') { + return; + } + + const generatedFiles = ctx.metadata?.generatedFiles as CodeGenerationResult['files'] | undefined; + + if (!generatedFiles || generatedFiles.length === 0) { + return; + } + + const validation = this.validate(generatedFiles); + + // Store for onBeforeStep blocking + this.runValidations.set(ctx.runId, validation); + + // Attach validation to metadata for Reviewer consumption + if (ctx.metadata) { + ctx.metadata.selfTestValidation = validation; + } + + if (validation.passed) { + logger.info('Self-test validation passed', { + runId: ctx.runId, + testFileCount: validation.testFileCount, + testRatio: validation.testRatio.toFixed(2), + }); + return; + } + + logger.warn('Self-test validation failed', { + runId: ctx.runId, + reasons: validation.failureReasons, + testFileCount: validation.testFileCount, + 
sourceFileCount: validation.sourceFileCount, + enforceBlocking: this.config.enforceBlocking, + }); + } + + /** + * Clean up stored validation state on run end + */ + async onRunEnd(ctx: AgentRunContext, _success: boolean): Promise { + this.runValidations.delete(ctx.runId); + } + + /** + * Check if this hook is enabled + */ + async isEnabled(): Promise { + return process.env.GWI_SELF_TEST_HOOK_ENABLED !== 'false'; + } + + /** + * Validate generated files for test inclusion + */ + validate(files: CodeGenerationResult['files']): SelfTestValidation { + const testFiles: string[] = []; + const sourceFiles: string[] = []; + const detectedFrameworks = new Set(); + let hasAssertions = false; + + for (const file of files) { + const isTest = this.config.testPatterns.some((p) => p.test(file.path)); + + if (isTest) { + testFiles.push(file.path); + } else { + sourceFiles.push(file.path); + } + + // Detect test frameworks in content + for (const { pattern, name } of FRAMEWORK_PATTERNS) { + if (pattern.test(file.content)) { + detectedFrameworks.add(name); + } + } + + // Check for assertion patterns + if (/\b(assert|expect|should)\s*[.(]/.test(file.content)) { + hasAssertions = true; + } + } + + const testRatio = sourceFiles.length > 0 + ? testFiles.length / sourceFiles.length + : testFiles.length > 0 ? 
1 : 0; + + const failureReasons: string[] = []; + + if (this.config.requireTests && testFiles.length === 0 && sourceFiles.length > 0) { + failureReasons.push( + `No test files generated for ${sourceFiles.length} source file(s)` + ); + } + + if (this.config.minTestRatio > 0 && testRatio < this.config.minTestRatio) { + failureReasons.push( + `Test ratio ${testRatio.toFixed(2)} below minimum ${this.config.minTestRatio}` + ); + } + + return { + hasTests: testFiles.length > 0, + testFileCount: testFiles.length, + sourceFileCount: sourceFiles.length, + testRatio, + detectedFrameworks: [...detectedFrameworks], + hasAssertions, + passed: failureReasons.length === 0, + failureReasons, + }; + } +} + +// ============================================================================= +// Factory +// ============================================================================= + +/** + * Create a self-test hook + */ +export function createSelfTestHook( + config?: Partial, +): SelfTestHook { + return new SelfTestHook(config); +} diff --git a/packages/engine/src/hooks/trace-analysis-hook.ts b/packages/engine/src/hooks/trace-analysis-hook.ts new file mode 100644 index 0000000..e8737c6 --- /dev/null +++ b/packages/engine/src/hooks/trace-analysis-hook.ts @@ -0,0 +1,299 @@ +/** + * Trace Analysis Hook + * + * Harness Engineering Pattern 5: Trace Analysis Feedback Loop + * + * Analyzes completed run traces to identify failure patterns, + * categorize errors by agent/step, and generate improvement signals. + * Reads from DecisionTraceStore to mine historical decision data. 
+ * + * @module @gwi/engine/hooks + */ + +import { getLogger } from '@gwi/core'; +import type { + DecisionTraceStore, + AgentDecisionTrace, + AgentType, +} from '@gwi/core'; +import { getDecisionTraceStore } from '@gwi/core'; +import type { AgentHook, AgentRunContext } from './types.js'; + +const logger = getLogger('trace-analysis-hook'); + +// ============================================================================= +// Types +// ============================================================================= + +/** + * A detected failure pattern from trace analysis + */ +export interface FailurePattern { + /** Agent type that failed */ + agentType: AgentType; + /** Category of failure (e.g., 'timeout', 'low_confidence', 'repeated_error') */ + category: string; + /** Number of occurrences */ + count: number; + /** Representative error messages */ + samples: string[]; + /** Suggested mitigation */ + suggestion: string; +} + +/** + * Result of analyzing a run's traces + */ +export interface TraceAnalysisResult { + runId: string; + totalTraces: number; + failedTraces: number; + patterns: FailurePattern[]; + agentFailureRates: Record; + avgDurationMs: number; + totalTokens: { input: number; output: number }; +} + +/** + * Configuration for trace analysis + */ +export interface TraceAnalysisConfig { + /** Minimum failure count to report a pattern. @default 2 */ + minPatternCount: number; + /** Whether to log analysis results. 
@default true */ + logResults: boolean; + /** Callback when analysis completes */ + onAnalysis?: (result: TraceAnalysisResult) => Promise; +} + +/** + * Default configuration + */ +export const DEFAULT_TRACE_ANALYSIS_CONFIG: TraceAnalysisConfig = { + minPatternCount: 2, + logResults: true, +}; + +// ============================================================================= +// Hook Implementation +// ============================================================================= + +/** + * Trace Analysis Hook + * + * Runs on onRunEnd to analyze all decision traces from the completed run. + * Identifies failure patterns, calculates agent failure rates, and generates + * improvement suggestions that can feed back into prompt tuning or hook config. + */ +export class TraceAnalysisHook implements AgentHook { + readonly name = 'trace-analysis'; + private config: TraceAnalysisConfig; + private store: DecisionTraceStore; + + constructor(config?: Partial, store?: DecisionTraceStore) { + this.config = { ...DEFAULT_TRACE_ANALYSIS_CONFIG, ...config }; + this.store = store ?? getDecisionTraceStore(); + } + + /** + * Required by interface — no-op for this hook + */ + async onAfterStep(_ctx: AgentRunContext): Promise { + // Trace analysis happens at run end, not per step + } + + /** + * Analyze all traces when a run completes + */ + async onRunEnd(ctx: AgentRunContext, success: boolean): Promise { + try { + // Use tenant-scoped query when tenantId is available (multi-tenant safety) + const traces = ctx.tenantId + ? 
await this.store.listTraces({ runId: ctx.runId, tenantId: ctx.tenantId }) + : await this.store.getTracesForRun(ctx.runId); + + if (traces.length === 0) { + logger.debug('No traces found for run, skipping analysis', { runId: ctx.runId }); + return; + } + + const result = this.analyzeTraces(ctx.runId, traces); + + // Attach to context metadata for downstream consumption + if (ctx.metadata) { + ctx.metadata.traceAnalysis = result; + } + + if (this.config.logResults) { + const level = success ? 'info' : 'warn'; + logger[level]('Trace analysis complete', { + runId: ctx.runId, + totalTraces: result.totalTraces, + failedTraces: result.failedTraces, + patternCount: result.patterns.length, + avgDurationMs: Math.round(result.avgDurationMs), + }); + + for (const pattern of result.patterns) { + logger.warn('Failure pattern detected', { + runId: ctx.runId, + agent: pattern.agentType, + category: pattern.category, + count: pattern.count, + suggestion: pattern.suggestion, + }); + } + } + + await this.config.onAnalysis?.(result); + } catch (error) { + logger.error('Trace analysis failed', { + runId: ctx.runId, + error: error instanceof Error ? error.message : String(error), + }); + } + } + + /** + * Check if this hook is enabled + */ + async isEnabled(): Promise { + return process.env.GWI_TRACE_ANALYSIS_ENABLED !== 'false'; + } + + /** + * Analyze traces for a run and detect patterns + */ + analyzeTraces(runId: string, traces: AgentDecisionTrace[]): TraceAnalysisResult { + const failedTraces = traces.filter( + (t) => t.outcome?.result === 'failure' + ); + + // Calculate agent failure rates + const agentStats = new Map(); + for (const trace of traces) { + const stats = agentStats.get(trace.agentType) ?? 
{ total: 0, failed: 0 }; + stats.total++; + if (trace.outcome?.result === 'failure') { + stats.failed++; + } + agentStats.set(trace.agentType, stats); + } + + const agentFailureRates: Record = {}; + for (const [agent, stats] of agentStats) { + agentFailureRates[agent] = { + ...stats, + rate: stats.total > 0 ? stats.failed / stats.total : 0, + }; + } + + // Detect failure patterns + const patterns = this.detectPatterns(failedTraces); + + // Aggregate timing + const durations = traces + .map((t) => t.metadata?.durationMs) + .filter((d): d is number => d !== undefined); + const avgDurationMs = durations.length > 0 + ? durations.reduce((a, b) => a + b, 0) / durations.length + : 0; + + // Aggregate tokens + const totalTokens = { input: 0, output: 0 }; + for (const trace of traces) { + if (trace.metadata?.tokensUsed) { + totalTokens.input += trace.metadata.tokensUsed.input; + totalTokens.output += trace.metadata.tokensUsed.output; + } + } + + return { + runId, + totalTraces: traces.length, + failedTraces: failedTraces.length, + patterns, + agentFailureRates, + avgDurationMs, + totalTokens, + }; + } + + /** + * Detect failure patterns from failed traces + */ + private detectPatterns(failedTraces: AgentDecisionTrace[]): FailurePattern[] { + const patterns: FailurePattern[] = []; + + // Group failures by agent type + const byAgent = new Map(); + for (const trace of failedTraces) { + const existing = byAgent.get(trace.agentType) ?? []; + existing.push(trace); + byAgent.set(trace.agentType, existing); + } + + for (const [agentType, agentTraces] of byAgent) { + // Pattern: Repeated failures from same agent + if (agentTraces.length >= this.config.minPatternCount) { + patterns.push({ + agentType, + category: 'repeated_failure', + count: agentTraces.length, + samples: agentTraces + .slice(0, 3) + .map((t) => t.decision.reasoning.slice(0, 200)), + suggestion: `${agentType} agent failed ${agentTraces.length} times. 
Review prompt, model selection, or input quality.`, + }); + } + + // Pattern: Low confidence decisions that failed + const lowConfidence = agentTraces.filter( + (t) => t.decision.confidence < 0.5 + ); + if (lowConfidence.length >= this.config.minPatternCount) { + patterns.push({ + agentType, + category: 'low_confidence_failure', + count: lowConfidence.length, + samples: lowConfidence + .slice(0, 3) + .map((t) => `confidence=${t.decision.confidence}: ${t.decision.reasoning.slice(0, 150)}`), + suggestion: `${agentType} made ${lowConfidence.length} low-confidence decisions that failed. Consider escalating to a more capable model earlier.`, + }); + } + + // Pattern: Slow failures (timeout-like) + const slowFailures = agentTraces.filter( + (t) => t.metadata?.durationMs && t.metadata.durationMs > 30000 + ); + if (slowFailures.length >= this.config.minPatternCount) { + patterns.push({ + agentType, + category: 'slow_failure', + count: slowFailures.length, + samples: slowFailures + .slice(0, 3) + .map((t) => `${t.metadata?.durationMs}ms: ${t.decision.action}`), + suggestion: `${agentType} had ${slowFailures.length} slow failures (>30s). May indicate timeout issues or overly complex inputs.`, + }); + } + } + + return patterns; + } +} + +// ============================================================================= +// Factory +// ============================================================================= + +/** + * Create a trace analysis hook + */ +export function createTraceAnalysisHook( + config?: Partial, + store?: DecisionTraceStore, +): TraceAnalysisHook { + return new TraceAnalysisHook(config, store); +}