Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions src/agents/dynamic-agent-prompt-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,70 @@ Briefly announce "Consulting Oracle for [reason]" before invocation.
</Oracle_Usage>`
}

/**
 * Builds the `<Sherlock_Usage>` prompt section that explains when and how to
 * delegate debugging work to the Sherlock agent (always after consulting Oracle).
 *
 * The section is produced only when an agent named "sherlock" is present in
 * `agents`. Its `metadata.useWhen` entries become rows of the delegation
 * trigger table and its `metadata.avoidWhen` entries become a bullet list.
 *
 * @param agents - Agents to search for the "sherlock" entry.
 * @returns The prompt section, or an empty string when no "sherlock" agent is found.
 */
export function buildSherlockSection(agents: AvailableAgent[]): string {
  const sherlockAgent = agents.find((a) => a.name === "sherlock")
  if (!sherlockAgent) return ""

  const useWhen = sherlockAgent.metadata.useWhen ?? []
  const avoidWhen = sherlockAgent.metadata.avoidWhen ?? []

  // Join the dynamic rows together with the fixed row so that an empty
  // `useWhen` cannot leave a blank line between the table header and its
  // rows (a blank line terminates a Markdown table).
  const triggerRows = [
    ...useWhen.map((w) => `| ${w} | Consult Oracle → Delegate to Sherlock |`),
    "| After 2+ failed fix attempts | Oracle context → Sherlock debugging |",
  ].join("\n")

  // Omit the whole "WHEN NOT" heading when there is nothing to list under it.
  const avoidSection =
    avoidWhen.length > 0
      ? `### WHEN NOT to Delegate:\n\n${avoidWhen.map((w) => `- ${w}`).join("\n")}\n\n`
      : ""

  return `<Sherlock_Usage>
## Sherlock — Hypothesis-Driven Debugger

Sherlock is a debugging specialist. Unlike Oracle (read-only consultant), Sherlock ACTIVELY debugs: instruments code, runs tests, analyzes logs, and implements fixes.

### CRITICAL: Oracle → Sherlock Flow

**After 2+ failed fix attempts, ALWAYS consult Oracle FIRST:**
1. **Oracle** provides system context (architecture, known gotchas, focus areas)
2. **Sherlock** receives Oracle's context and uses it to form better hypotheses
3. This prevents wasted iterations on wrong subsystems

**Example**: Bug with timezone display
- Without Oracle: Sherlock spends 4 iterations instrumenting UI code
- With Oracle: Oracle says "Prisma strips timezone by default" → Sherlock targets ORM immediately

### WHEN to Delegate:

| Trigger | Action |
|---------|--------|
${triggerRows}

${avoidSection}### Oracle vs Sherlock:

| Situation | Use |
|-----------|-----|
| **System context before debugging** | Oracle FIRST |
| Need debugging **advice only** | Oracle (read-only) |
| Need bug **investigation and fix** | Sherlock (with Oracle context) |
| Architecture questions | Oracle |
| Runtime behavior differs from expected | Oracle context → Sherlock |

### Usage Pattern (Oracle → Sherlock):
\`\`\`typescript
// Step 1: Get context from Oracle
sisyphus_task(agent="oracle", prompt="Debug context request: [bug description]")

// Step 2: Delegate to Sherlock WITH Oracle's context
sisyphus_task(agent="sherlock", prompt="
## Bug Report
[description]

## Oracle's System Context
[paste Oracle's analysis]

## Failed Attempts
[list attempts]
")
\`\`\`
</Sherlock_Usage>`
}

export function buildHardBlocksSection(): string {
const blocks = [
"| Type error suppression (`as any`, `@ts-ignore`) | Never |",
Expand Down
3 changes: 1 addition & 2 deletions src/agents/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ export { createSisyphusAgent } from "./sisyphus"
export { createOracleAgent, ORACLE_PROMPT_METADATA } from "./oracle"
export { createLibrarianAgent, LIBRARIAN_PROMPT_METADATA } from "./librarian"
export { createExploreAgent, EXPLORE_PROMPT_METADATA } from "./explore"


export { createSherlockAgent, SHERLOCK_PROMPT_METADATA } from "./sherlock"
export { createMultimodalLookerAgent, MULTIMODAL_LOOKER_PROMPT_METADATA } from "./multimodal-looker"
export { createMetisAgent, METIS_SYSTEM_PROMPT, metisPromptMetadata } from "./metis"
export { createMomusAgent, MOMUS_SYSTEM_PROMPT, momusPromptMetadata } from "./momus"
Expand Down
129 changes: 129 additions & 0 deletions src/agents/sherlock.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import { describe, test, expect } from "bun:test"
import { createSherlockAgent, SHERLOCK_PROMPT_METADATA } from "./sherlock"

describe("Sherlock Debug Agent", () => {
  // #given the default sherlock agent, reused by every test that needs no model override
  const agent = createSherlockAgent()

  // Helper: runs one `toContain` expectation per entry, failing on the first missing one.
  const expectToContainEach = (actual: unknown, expected: readonly string[]): void => {
    for (const item of expected) {
      expect(actual).toContain(item)
    }
  }

  test("should use GPT-5.2 by default", () => {
    // #when reading the model of the default agent
    const { model } = agent
    // #then it is GPT-5.2
    expect(model).toBe("openai/gpt-5.2")
  })

  test("should have low temperature for consistent reasoning", () => {
    // #when reading the temperature
    const { temperature } = agent
    // #then it is 0.1
    expect(temperature).toBe(0.1)
  })

  test("should be configured as subagent mode", () => {
    // #when reading the mode
    const { mode } = agent
    // #then it is subagent
    expect(mode).toBe("subagent")
  })

  test("should have GPT-specific settings for GPT models", () => {
    // #given an agent created for a GPT model
    const gptSettings = createSherlockAgent("openai/gpt-5.2") as Record<string, unknown>
    // #then reasoningEffort and textVerbosity are set
    expect(gptSettings.reasoningEffort).toBe("medium")
    expect(gptSettings.textVerbosity).toBe("high")
  })

  test("should have thinking enabled for non-GPT models", () => {
    // #given an agent created for a Claude model
    const claudeSettings = createSherlockAgent("anthropic/claude-sonnet-4-5") as Record<string, unknown>
    // #then thinking is enabled with the expected budget
    const expectedThinking = { type: "enabled", budgetTokens: 32000 }
    expect(claudeSettings.thinking).toEqual(expectedThinking)
  })

  test("should have specialist category metadata", () => {
    // #when reading category and cost from the metadata
    // #then they are "specialist" and "EXPENSIVE"
    expect(SHERLOCK_PROMPT_METADATA.category).toBe("specialist")
    expect(SHERLOCK_PROMPT_METADATA.cost).toBe("EXPENSIVE")
  })

  test("should have correct triggers", () => {
    // #when collecting the domain of every trigger
    const domains = SHERLOCK_PROMPT_METADATA.triggers.map(({ domain }) => domain)
    // #then the bug investigation domains are present
    expectToContainEach(domains, ["Bug investigation", "Hard debugging", "State issues"])
  })

  test("should have useWhen hints", () => {
    // #when reading useWhen
    // #then the debugging scenarios are listed
    expectToContainEach(SHERLOCK_PROMPT_METADATA.useWhen, [
      "Bug requires runtime evidence to diagnose",
      "Multiple possible root causes",
    ])
  })

  test("should have avoidWhen hints", () => {
    // #when reading avoidWhen
    // #then the simple cases are listed
    expectToContainEach(SHERLOCK_PROMPT_METADATA.avoidWhen, [
      "Simple typos or syntax errors (use linter)",
      "Type errors visible from static analysis (use LSP)",
    ])
  })

  test("should allow custom model override", () => {
    // #given an explicitly requested model
    const requestedModel = "anthropic/claude-opus-4-5"
    const customAgent = createSherlockAgent(requestedModel)
    // #then the agent reports that model
    expect(customAgent.model).toBe("anthropic/claude-opus-4-5")
  })

  test("should have a description", () => {
    // #when reading the description
    // #then it mentions the debugging specialization
    expectToContainEach(agent.description, ["Hypothesis-driven debugging", "runtime evidence"])
  })

  test("should have a comprehensive system prompt", () => {
    // #when reading the prompt
    // #then every key section marker is present
    expectToContainEach(agent.prompt, [
      "You are Sherlock",
      "Core Principles",
      "8 Phases",
      "Instrumentation Templates",
      "Log Analysis",
      "Security Rules",
    ])
  })

  test("should include hypothesis workflow in prompt", () => {
    // #when reading the prompt
    // #then the hypothesis workflow markers are present
    expectToContainEach(agent.prompt, ["Hypothesis A", "CONFIRMED", "REJECTED", "INCONCLUSIVE"])
  })

  test("should include instrumentation patterns in prompt", () => {
    // #when reading the prompt
    // #then the code instrumentation template markers are present
    expectToContainEach(agent.prompt, ["#region agent log", "hypothesisId", "127.0.0.1:7242"])
  })

  test("should have promptAlias in metadata", () => {
    // #when reading promptAlias
    const { promptAlias } = SHERLOCK_PROMPT_METADATA
    // #then it is Sherlock
    expect(promptAlias).toBe("Sherlock")
  })
})
Loading