From 00c01c23a3e69b387bbbbcf41943f2981e5d6977 Mon Sep 17 00:00:00 2001 From: Joshua Menke Date: Thu, 21 May 2026 14:25:05 +0200 Subject: [PATCH] fix: capture every turn's gen_ai.output.messages (0.2.11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs combined to drop most assistant outputs from Sentry AI Conversations: 1. closeTurnSpan only ran on the next UserPromptSubmit / SessionEnd, racing the transcript flush. If assistant text wasn't on disk at close time, tokens.response was null and gen_ai.output.messages was silently skipped — and the 200 ms retry only triggered on zero usage, not on missing text. Wire the Stop hook (was a no-op) to closeCurrentTurn; closeCurrentTurn is idempotent so the follow-up close is harmless. Extend the late-flush retry to also trigger when recordOutputs is on and response is null. 2. selectTurn's ordinal fallback misattributed an earlier transcript turn's response to the current turn when the collector spawned mid-session (synthesized=true, local turnIndex out of sync with transcript real-turn index). Refuse ordinal matches under synthesis and record usage_extraction.status = no_matching_turn_synthesized_ordinal so the gap is observable. Adds two regression tests covering Stop-based close + the synthesized-ordinal refusal. All 426 tests pass. Co-Authored-By: Claude Sonnet 4.6 --- .claude-plugin/marketplace.json | 2 +- .claude-plugin/plugin.json | 2 +- CHANGELOG.md | 8 ++++ package.json | 2 +- scripts/server.js | 54 +++++++++++++++++++---- src/server.ts | 54 +++++++++++++++++++---- src/types.ts | 1 + tests/server-lifecycle.test.ts | 76 +++++++++++++++++++++++++++++++++ 8 files changed, 180 insertions(+), 19 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 9b3adc8..2925bdb 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -8,7 +8,7 @@ { "name": "claude-code-ai-observability", "description": "Realtime Sentry traces with per-turn tokens, cost, subagent and error instrumentation, and rich auto-tagging (session / git / host). Forked from sergical/claude-code-sentry-monitor (MIT).", - "version": "0.2.10", + "version": "0.2.11", "author": { "name": "Joshkop" }, diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 3be6f2e..d09307a 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "claude-code-ai-observability", "description": "Comprehensive AI Agent Observability plugin for Claude Code — realtime Sentry traces with per-turn tokens, cost, subagent and error instrumentation, and rich auto-tagging (session / git / host). Forked from sergical/claude-code-sentry-monitor (MIT).", - "version": "0.2.10", + "version": "0.2.11", "author": { "name": "Joshkop" }, diff --git a/CHANGELOG.md b/CHANGELOG.md index 4817583..fe05226 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [0.2.11] - 2026-05-21 + +### Fixed + +- **Sentry AI Conversations now shows every assistant output, not just the first.** Two bugs combined to drop most assistant messages from the Conversations view: + - `closeTurnSpan` previously ran only on the *next* `UserPromptSubmit` (or `SessionEnd`), racing the transcript JSONL flush. If assistant text wasn't on disk at close time, `tokens.response` was `null` and the `gen_ai.output.messages` attribute was silently skipped — and the existing 200 ms retry only triggered when usage was zero, not when text was missing. The `Stop` hook now closes the current turn (it fires right after assistant text lands, so the transcript is freshest). `closeCurrentTurn` is idempotent, so the follow-up close from UserPromptSubmit / SessionEnd is a no-op. The late-flush retry now also triggers when `recordOutputs` is on and `response` is null. + - When the collector spawned (or self-healed) mid-session, its local `turnIndex` no longer aligned with the transcript's real-turn ordinal. `selectTurn`'s ordinal fallback would silently attribute an earlier turn's response to the current turn (the "how are you" turn would surface turn 0's "Hi! How can I help today?" output). Synthesized sessions now refuse ordinal matches and record `claude_code.usage_extraction.status = no_matching_turn_synthesized_ordinal` so the gap is observable in Sentry. + ## [0.2.10] - 2026-05-21 ### Fixed diff --git a/package.json b/package.json index 1a41e45..5a33a67 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "claude-code-ai-observability", - "version": "0.2.10", + "version": "0.2.11", "description": "Comprehensive AI Agent Observability plugin for Claude Code — realtime Sentry traces with per-turn tokens, cost, subagent and error instrumentation, and rich auto-tagging (session / git / host).", "type": "module", "scripts": { diff --git a/scripts/server.js b/scripts/server.js index 3538df9..4941e61 100644 --- a/scripts/server.js +++ b/scripts/server.js @@ -169,26 +169,47 @@ export function startServer(sentry, config, baseAutoTags) { // UserPromptSubmit corresponds to exactly one real (non-sidechain, // non-tool_result) user line. const selected = selectTurn(result, record.currentPromptId, record.turnIndex); - if (!selected.turn) { - tokenExtractionStatus = "no_matching_turn"; + // Synthesized sessions start record.turnIndex at -1 even when the + // transcript already holds prior real turns (collector spawned mid- + // session or self-healed after a restart). The local ordinal no longer + // aligns with the transcript's real-turn index, so an ordinal fallback + // can silently attach an earlier turn's response to a later turn. + // Better to drop one turn's data than to attribute the wrong assistant + // text to it — refuse ordinal matches under synthesis. + const unsafeOrdinal = record.synthesized && selected.matchedBy === "ordinal"; + if (!selected.turn || unsafeOrdinal) { + tokenExtractionStatus = unsafeOrdinal + ? "no_matching_turn_synthesized_ordinal" + : "no_matching_turn"; } else { tokens = selected.turn; - if (tokens.inputTokens + tokens.outputTokens === 0) { - // Late-flush hypothesis: assistant usage may not yet be on disk. - // Sleep briefly and try once more. + // Late-flush hypothesis: assistant usage OR assistant text may not yet + // be on disk when the prior turn closes. Retry once if either is + // missing — without the response retry, turns whose usage flushed + // before text silently drop gen_ai.output.messages, leaving Sentry AI + // Conversations with only the inputs. + const usageMissing = tokens.inputTokens + tokens.outputTokens === 0; + const responseMissing = config.recordOutputs && !tokens.response; + if (usageMissing || responseMissing) { await new Promise((r) => setTimeout(r, 200)); const retry = readTranscript(record.transcriptPath); parseDegraded = retry.degraded; sessionDims = retry.session; const retrySelected = selectTurn(retry, record.currentPromptId, record.turnIndex); - if (retrySelected.turn && (retrySelected.turn.inputTokens + retrySelected.turn.outputTokens) > 0) { - tokens = retrySelected.turn; + const retryTurn = retrySelected.turn; + const retryUsageOk = retryTurn && (retryTurn.inputTokens + retryTurn.outputTokens) > 0; + const retryResponseOk = retryTurn && (!config.recordOutputs || retryTurn.response); + if (retryTurn && (usageMissing ? retryUsageOk : true) && (responseMissing ? retryResponseOk : true)) { + tokens = retryTurn; tokenExtractionStatus = "ok|matched_after_retry"; } - else { + else if (usageMissing) { tokenExtractionStatus = "turn_had_no_usage"; } + else { + tokenExtractionStatus = "turn_had_no_response"; + } } else { tokenExtractionStatus = "ok"; @@ -424,6 +445,21 @@ export function startServer(sentry, config, baseAutoTags) { span.end(); record.pendingTools.delete(key); }; + // Stop fires right after the assistant text lands in the transcript — the + // freshest moment to harvest the response. Without this, closeCurrentTurn + // only runs on the next UserPromptSubmit (or SessionEnd) and races the + // transcript flush, silently dropping gen_ai.output.messages for any turn + // where text wasn't on disk yet. closeCurrentTurn is idempotent (clears + // currentTurnSpans), so a follow-up close from UserPromptSubmit is a no-op. + const handleStop = async (event) => { + const record = sessions.get(event.session_id); + if (!record) + return; + if (event.transcript_path && !record.transcriptPath) { + record.transcriptPath = event.transcript_path; + } + await closeCurrentTurn(record); + }; const handleSessionEnd = async (event) => { const record = sessions.get(event.session_id); if (!record) @@ -500,6 +536,8 @@ export function startServer(sentry, config, baseAutoTags) { await handleSessionEnd(event); return; case "Stop": + await handleStop(event); + return; case "PreCompact": return; } diff --git a/src/server.ts b/src/server.ts index ed84f17..ce4e915 100644 --- a/src/server.ts +++ b/src/server.ts @@ -10,6 +10,7 @@ import type { ResolvedPluginConfig, SessionEndEvent, SessionStartEvent, + StopEvent, UserPromptSubmitEvent, } from "./types.js"; import { reportPluginError } from "./sentry-errors.js"; @@ -225,23 +226,43 @@ export function startServer( // UserPromptSubmit corresponds to exactly one real (non-sidechain, // non-tool_result) user line. const selected = selectTurn(result, record.currentPromptId, record.turnIndex); - if (!selected.turn) { - tokenExtractionStatus = "no_matching_turn"; + // Synthesized sessions start record.turnIndex at -1 even when the + // transcript already holds prior real turns (collector spawned mid- + // session or self-healed after a restart). The local ordinal no longer + // aligns with the transcript's real-turn index, so an ordinal fallback + // can silently attach an earlier turn's response to a later turn. + // Better to drop one turn's data than to attribute the wrong assistant + // text to it — refuse ordinal matches under synthesis. + const unsafeOrdinal = record.synthesized && selected.matchedBy === "ordinal"; + if (!selected.turn || unsafeOrdinal) { + tokenExtractionStatus = unsafeOrdinal + ? "no_matching_turn_synthesized_ordinal" + : "no_matching_turn"; } else { tokens = selected.turn; - if (tokens.inputTokens + tokens.outputTokens === 0) { - // Late-flush hypothesis: assistant usage may not yet be on disk. - // Sleep briefly and try once more. + // Late-flush hypothesis: assistant usage OR assistant text may not yet + // be on disk when the prior turn closes. Retry once if either is + // missing — without the response retry, turns whose usage flushed + // before text silently drop gen_ai.output.messages, leaving Sentry AI + // Conversations with only the inputs. + const usageMissing = tokens.inputTokens + tokens.outputTokens === 0; + const responseMissing = config.recordOutputs && !tokens.response; + if (usageMissing || responseMissing) { await new Promise((r) => setTimeout(r, 200)); const retry = readTranscript(record.transcriptPath); parseDegraded = retry.degraded; sessionDims = retry.session; const retrySelected = selectTurn(retry, record.currentPromptId, record.turnIndex); - if (retrySelected.turn && (retrySelected.turn.inputTokens + retrySelected.turn.outputTokens) > 0) { - tokens = retrySelected.turn; + const retryTurn = retrySelected.turn; + const retryUsageOk = retryTurn && (retryTurn.inputTokens + retryTurn.outputTokens) > 0; + const retryResponseOk = retryTurn && (!config.recordOutputs || retryTurn.response); + if (retryTurn && (usageMissing ? retryUsageOk : true) && (responseMissing ? retryResponseOk : true)) { + tokens = retryTurn; tokenExtractionStatus = "ok|matched_after_retry"; - } else { + } else if (usageMissing) { tokenExtractionStatus = "turn_had_no_usage"; + } else { + tokenExtractionStatus = "turn_had_no_response"; } } else { tokenExtractionStatus = "ok"; @@ -499,6 +520,21 @@ export function startServer( record.pendingTools.delete(key); }; + // Stop fires right after the assistant text lands in the transcript — the + // freshest moment to harvest the response. Without this, closeCurrentTurn + // only runs on the next UserPromptSubmit (or SessionEnd) and races the + // transcript flush, silently dropping gen_ai.output.messages for any turn + // where text wasn't on disk yet. closeCurrentTurn is idempotent (clears + // currentTurnSpans), so a follow-up close from UserPromptSubmit is a no-op. + const handleStop = async (event: StopEvent): Promise => { + const record = sessions.get(event.session_id); + if (!record) return; + if (event.transcript_path && !record.transcriptPath) { + record.transcriptPath = event.transcript_path; + } + await closeCurrentTurn(record); + }; + const handleSessionEnd = async (event: SessionEndEvent): Promise => { const record = sessions.get(event.session_id); if (!record) return; @@ -566,6 +602,8 @@ export function startServer( await handleSessionEnd(event); return; case "Stop": + await handleStop(event); + return; case "PreCompact": return; } diff --git a/src/types.ts b/src/types.ts index 733ff62..b76db86 100644 --- a/src/types.ts +++ b/src/types.ts @@ -153,6 +153,7 @@ export interface PreCompactEvent extends AiobsEnvelope { export interface StopEvent extends AiobsEnvelope { hook_event_name: "Stop"; session_id: string; + transcript_path?: string; } export type HookEvent = diff --git a/tests/server-lifecycle.test.ts b/tests/server-lifecycle.test.ts index 7ded350..af0be11 100644 --- a/tests/server-lifecycle.test.ts +++ b/tests/server-lifecycle.test.ts @@ -269,6 +269,52 @@ describe("server: reader integration (C1/C6)", () => { rmSync(dir, { recursive: true, force: true }); } }); + +}); + +describe("server: Stop hook closes the turn with response", () => { + let port: number; + let sentry: ReturnType; + let close: () => Promise; + + beforeEach(async () => { + port = await findFreePort(); + process.env.SENTRY_COLLECTOR_PORT = String(port); + sentry = makeFakeSentry(); + // Stop's whole job is to make output capture reliable, so recordOutputs + // must be on for the assertion to mean anything. + const cfg: ResolvedPluginConfig = { ...baseConfig, recordOutputs: true, recordInputs: true }; + const server = startServer(sentry as never, cfg, baseTags); + close = server.close; + for (let i = 0; i < 25; i++) { + try { const r = await fetch(`http://127.0.0.1:${port}/health`); if (r.ok) break; } catch { /* ignore */ } + await new Promise((r) => setTimeout(r, 50)); + } + }); + afterEach(async () => { await close(); delete process.env.SENTRY_COLLECTOR_PORT; }); + + it("Stop captures gen_ai.output.messages and SessionEnd is idempotent", async () => { + const dir = mkdtempSync(join(tmpdir(), "srv-stop-")); + const tx = join(dir, "s.jsonl"); + writeFileSync(tx, [ + JSON.stringify({ type: "user", promptId: "P1", message: { content: "hello" } }), + JSON.stringify({ type: "assistant", message: { model: "claude-opus-4-7", usage: { input_tokens: 5, output_tokens: 7 }, content: [{ type: "text", text: "hi there" }] } }), + ].join("\n"), "utf8"); + try { + await postHook(port, { hook_event_name: "SessionStart", session_id: "stop1", transcript_path: tx }); + await postHook(port, { hook_event_name: "UserPromptSubmit", session_id: "stop1", prompt: "hello", prompt_id: "P1" }); + await postHook(port, { hook_event_name: "Stop", session_id: "stop1", transcript_path: tx }); + const chat = sentry.spans.find((s) => s.op === "gen_ai.chat"); + expect(chat).toBeTruthy(); + expect(chat!.attrs["gen_ai.output.messages"]).toBe( + JSON.stringify([{ role: "assistant", content: "hi there" }]), + ); + await postHook(port, { hook_event_name: "SessionEnd", session_id: "stop1", transcript_path: tx }); + expect(sentry.spans.filter((s) => s.op === "gen_ai.chat")).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); }); describe("server: per-session git cwd (C4)", () => { @@ -355,6 +401,36 @@ describe("server: lazy session synthesis (R2)", () => { rmSync(dir, { recursive: true, force: true }); } }); + + it("refuses ordinal fallback when the session is synthesized to avoid wrong-turn attribution", async () => { + const dir = mkdtempSync(join(tmpdir(), "srv-r2-ord-")); + const tx = join(dir, "s.jsonl"); + try { + // Transcript already holds two prior real turns from before the + // collector spawned. Synthesized session's local turnIndex starts at + // 0 on its first UserPromptSubmit, which would ordinal-match the + // FIRST transcript turn's response — exactly the bug this guards. + writeFileSync(tx, [ + JSON.stringify({ type: "user", promptId: "OLD1", message: { content: "old1" } }), + JSON.stringify({ type: "assistant", message: { model: "m", usage: { input_tokens: 5, output_tokens: 5 }, content: [{ type: "text", text: "OLD RESPONSE 1" }] } }), + JSON.stringify({ type: "user", promptId: "OLD2", message: { content: "old2" } }), + JSON.stringify({ type: "assistant", message: { model: "m", usage: { input_tokens: 5, output_tokens: 5 }, content: [{ type: "text", text: "OLD RESPONSE 2" }] } }), + ].join("\n"), "utf8"); + // Skip SessionStart; UserPromptSubmit with a prompt_id that does NOT + // exist in the transcript forces ordinal fallback. + await postHook(port, { hook_event_name: "UserPromptSubmit", session_id: "syn-ord", prompt: "new", prompt_id: "NEW_NOT_IN_TRANSCRIPT", _aiobs: { context: { cwd: dir } } }); + await postHook(port, { hook_event_name: "SessionEnd", session_id: "syn-ord", transcript_path: tx }); + const chat = sentry.spans.find((s) => s.op === "gen_ai.chat"); + expect(chat).toBeTruthy(); + // We must NOT have inherited OLD1's response. + expect(chat!.attrs["gen_ai.output.messages"]).toBeUndefined(); + expect(chat!.attrs["claude_code.usage_extraction.status"]).toBe( + "no_matching_turn_synthesized_ordinal", + ); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); }); describe("server: dropped attr + heartbeat (R3/R4)", () => {