diff --git a/.github/actions/spelling/allow/allow.txt b/.github/actions/spelling/allow/allow.txt index 9765825f1..c1de50f5d 100644 --- a/.github/actions/spelling/allow/allow.txt +++ b/.github/actions/spelling/allow/allow.txt @@ -224,6 +224,7 @@ sids SIGKILLs signtool slnt +spams splitn splitw ssh diff --git a/.github/actions/spelling/allow/apis.txt b/.github/actions/spelling/allow/apis.txt index 5c4f63f0d..da7af6e36 100644 --- a/.github/actions/spelling/allow/apis.txt +++ b/.github/actions/spelling/allow/apis.txt @@ -128,6 +128,7 @@ PAGESCROLL PALLOC PATINVERT Pels +PHANDLER PICKFOLDERS PINPUT pmr diff --git a/src/cascadia/TerminalApp/SharedWta.cpp b/src/cascadia/TerminalApp/SharedWta.cpp index c5ffbf5ea..8a708aaf1 100644 --- a/src/cascadia/TerminalApp/SharedWta.cpp +++ b/src/cascadia/TerminalApp/SharedWta.cpp @@ -9,6 +9,7 @@ #include "../WinRTUtils/inc/WtExeUtils.h" #include "../inc/WtaProcess.h" +#include "AgentPaneLog.h" namespace winrt::TerminalApp::implementation { @@ -419,6 +420,9 @@ namespace winrt::TerminalApp::implementation // Order matters: drop the job FIRST so KILL_ON_JOB_CLOSE // terminates wta + descendants while we still hold a process // handle that lets us observe the termination if needed. + // Deliberate teardown: the master is reaped silently (job close, no + // console event), so it can't log its own death — record it here. + _agentPaneLog("releasing wta-master pid=" + std::to_string(_pid) + " (deliberate teardown via KILL_ON_JOB_CLOSE)"); _job.reset(); _process.reset(); if (_waitHandle) @@ -470,6 +474,13 @@ namespace winrt::TerminalApp::implementation // ran. Nothing to do. return; } + // The master exited on its own — crash, OOM, or an external kill + // (taskkill /F, Task Manager). It can't log its own hard death from + // inside, but this wait callback (the parent observing it) can. This + // is the external observer that makes otherwise-silent master deaths + // diagnosable; deliberate teardowns never reach here (they reset + // _process first, so the validity check above bails). + _agentPaneLog("wta-master exited unexpectedly pid=" + std::to_string(observedPid) + " (crash/OOM/external kill — observed by wait callback)"); _job.reset(); _process.reset(); if (_waitHandle) diff --git a/tools/wta/Cargo.toml b/tools/wta/Cargo.toml index 302c082ee..ee6115b8e 100644 --- a/tools/wta/Cargo.toml +++ b/tools/wta/Cargo.toml @@ -29,6 +29,7 @@ windows-sys = { version = "0.61", features = [ "Win32_Globalization", "Win32_Security_Credentials", "Win32_Storage_Packaging_Appx", + "Win32_System_Console", "Win32_System_DataExchange", "Win32_System_Environment", "Win32_System_Memory", diff --git a/tools/wta/src/agent_sessions.rs b/tools/wta/src/agent_sessions.rs index 35a27f462..8e04f4b81 100644 --- a/tools/wta/src/agent_sessions.rs +++ b/tools/wta/src/agent_sessions.rs @@ -1142,7 +1142,9 @@ impl AgentSessionRegistry { self.known_alive_panes.insert(pane_lc); } self.dirty = true; - tracing::info!( + // Per-row, fires on every alive-snapshot upgrade — debug, + // not info (this was by far the highest-volume info line). + tracing::debug!( target: "agent_session_registry", key = %sid, pane = ?pane_opt, diff --git a/tools/wta/src/app.rs b/tools/wta/src/app.rs index cc5cb4b86..8a7e515da 100644 --- a/tools/wta/src/app.rs +++ b/tools/wta/src/app.rs @@ -658,7 +658,8 @@ where } } let key_for_refresh = key.clone(); - tracing::info!( + // Per-agent-event — debug, not info. + tracing::debug!( target: "agent_route", event = %event, asid = %asid, @@ -827,7 +828,8 @@ where } let dirty = reg.take_dirty(); - tracing::info!( + // Per-agent-event (partner of "routing") — debug, not info. + tracing::debug!( target: "agent_route", event = %event, dirty = dirty, @@ -5286,7 +5288,10 @@ impl App { tab_id, params, } => { - tracing::debug!(target: "autofix", method = %method, pane_id = %pane_id, tab_id = ?tab_id, self_pane_id = ?self.pane_id, "WtEvent"); + // Per-WT-event (every vt_sequence included) — trace-only; the + // single per-event breadcrumb stays at debug in main.rs + // (`wt_event_rx: received event`). + tracing::trace!(target: "autofix", method = %method, pane_id = %pane_id, tab_id = ?tab_id, self_pane_id = ?self.pane_id, "WtEvent"); // Hook bridge events: fire-and-forget into the agent registry // so the agent session view stays current. Unrelated to autofix / @@ -5906,7 +5911,8 @@ impl App { } let notification = classify_wt_event(&method, &pane_id, tab_id.as_deref(), ¶ms); - tracing::debug!(target: "autofix", severity = ?notification.severity, summary = %notification.summary, tab_id = ?notification.tab_id, "classified"); + // Per-WT-event classification — trace-only (vt_sequence volume). + tracing::trace!(target: "autofix", severity = ?notification.severity, summary = %notification.summary, tab_id = ?notification.tab_id, "classified"); // Per-tab filter. WT broadcasts pane-scoped events to every // helper in the window, but another tab's failures are not @@ -5921,7 +5927,9 @@ impl App { && !self_tab.is_empty() && event_tab != self_tab { - tracing::debug!( + // Per-cross-tab-event (very high volume in multi-tab + // windows) — trace-only. + tracing::trace!( target: "autofix", event_tab, self_tab, diff --git a/tools/wta/src/logging.rs b/tools/wta/src/logging.rs index 06c725f90..4688d04a7 100644 --- a/tools/wta/src/logging.rs +++ b/tools/wta/src/logging.rs @@ -175,6 +175,181 @@ pub fn shutdown_flush() { } } +/// Install a Windows console control handler that records the teardown +/// signal and drains the log appender before the OS terminates us. +/// +/// The wta-**helper** runs as a ConPTY child of Windows Terminal (it's the +/// process rendered in the agent pane). When its pane/tab/window closes — or +/// the user logs off / shuts down — the OS delivers a control event +/// (`CTRL_CLOSE`/`CTRL_LOGOFF`/`CTRL_SHUTDOWN`) and then terminates it at the +/// end of a short grace window. Without a handler those deaths are invisible: +/// the process vanishes mid-stream and the non-blocking appender's last +/// buffered records are lost, because [`shutdown_flush`] never runs (the +/// `WorkerGuard` lives in a `static` and `static`s don't `Drop` at teardown). +/// That is exactly the "helper just stopped responding" signature where the +/// success path is logged exhaustively but the teardown path is silent and +/// the incident is undiagnosable. +/// +/// This closes that gap for the helper: it logs WHICH control event tore the +/// process down and flushes so the final records (e.g. the transport-lost +/// WARN in `run_acp_client_over_pipe`) reach disk. The handler returns FALSE +/// so the default handler still runs and the process terminates as before — +/// we only ADD a log line + flush, never changing termination behavior. It's +/// installed process-wide (cheap and harmless), so any wta process that does +/// receive a console control event benefits. +/// +/// Coverage limits — what this does NOT catch: +/// * The wta-**master** is spawned `CREATE_NO_WINDOW` and contained in a +/// Job Object with `KILL_ON_JOB_CLOSE` (see C++ `SharedWta`). Its normal +/// teardown is the parent dropping that job, which reaps the master like +/// a `TerminateProcess` — NO control event — so *this handler* does not +/// trace routine master teardown. That teardown is not unlogged overall, +/// though: the C++ parent (`SharedWta`) records both the deliberate +/// job-close and an unexpected exit to `terminal-agent-pane.log`. This +/// handler fires for the master only on genuine console signals +/// (logoff/shutdown), if delivered at all. +/// * A hard `TerminateProcess` (Task Manager "End task", `taskkill /F`, an +/// OS resource kill, or the Job-Object reap above) delivers no control +/// event and stays untraceable from inside the process. +/// * While the Ratatui TUI holds the console in raw mode, Ctrl+C arrives as +/// a key event (not `CTRL_C_EVENT`), so this handler doesn't normally see +/// it and doesn't alter the TUI's Ctrl+C behavior. +pub fn install_ctrl_handler() { + use windows_sys::Win32::Foundation::{GetLastError, ERROR_INVALID_HANDLE}; + use windows_sys::Win32::System::Console::{ + SetConsoleCtrlHandler, CTRL_BREAK_EVENT, CTRL_CLOSE_EVENT, CTRL_C_EVENT, + CTRL_LOGOFF_EVENT, CTRL_SHUTDOWN_EVENT, + }; + + // Returns `windows_sys`' `BOOL` (an `i32` alias) to match the + // PHANDLER_ROUTINE signature: 0 == FALSE (fall through to default). + unsafe extern "system" fn handler(ctrl_type: u32) -> i32 { + let event = match ctrl_type { + CTRL_C_EVENT => "CTRL_C", + CTRL_BREAK_EVENT => "CTRL_BREAK", + CTRL_CLOSE_EVENT => "CTRL_CLOSE", + CTRL_LOGOFF_EVENT => "CTRL_LOGOFF", + CTRL_SHUTDOWN_EVENT => "CTRL_SHUTDOWN", + _ => "UNKNOWN", + }; + tracing::warn!( + target: "lifecycle", + ctrl_type, + event, + "console control event received — process being torn down; flushing logs" + ); + // Drain the appender so the line above (and any earlier buffered + // records) hit disk before the grace window ends and we're killed. + shutdown_flush(); + // FALSE → fall through to the default handler (terminate). We only + // add logging + flush; termination behavior is unchanged. + 0 + } + + // SAFETY: `handler` is a valid `extern "system"` routine matching the + // PHANDLER_ROUTINE signature; registering a control handler is a + // process-global, thread-safe Win32 operation. + unsafe { + if SetConsoleCtrlHandler(Some(handler), 1) == 0 { + // Capture the Win32 error immediately, before any other call (incl. + // the logging macro's own work) can reset thread-last-error. + let error_code = GetLastError(); + if error_code == ERROR_INVALID_HANDLE { + // Expected for a windowless wta process (the CREATE_NO_WINDOW + // master, a detached CLI invocation): there's no console to + // signal, and teardown for those is covered elsewhere (the C++ + // side observes the master via its wait callback). Benign — + // debug only, so it never spams release logs. + tracing::debug!( + target: "lifecycle", + error_code, + "SetConsoleCtrlHandler: no console attached (expected for windowless process)" + ); + } else { + // Any other failure is the diagnostic feature itself failing to + // arm where we DID expect a console (e.g. the helper) — warn so + // release (info) logs explain why later teardown signals are + // absent rather than leaving it a silent mystery. + tracing::warn!( + target: "lifecycle", + error_code, + "SetConsoleCtrlHandler failed — teardown signals will not be logged" + ); + } + } + } +} + +/// Install a panic hook that records the panic to disk, then chains to the +/// previous hook. +/// +/// A Rust panic otherwise writes only to stderr — invisible for a ConPTY- +/// hosted helper or a `CREATE_NO_WINDOW` master — and the non-blocking +/// appender's buffered tail is lost when a *fatal* panic kills the process +/// before the background worker drains it. So a panic is a "died for no +/// logged reason" blind spot. This closes it WITHOUT changing panic semantics +/// (it chains the previous hook, so unwind/abort and backtraces are +/// unchanged): +/// * a `tracing::error!` so the panic correlates in the normal log (this +/// drains fine for a *recovered* panic, e.g. behind a `catch_unwind`), and +/// * a synchronous append to `wta-panic.log`, independent of the async +/// appender, so the record reaches disk even when a fatal panic kills us. +/// +/// It deliberately does NOT call [`shutdown_flush`]: that drops the appender +/// guard and would permanently kill logging after a recoverable panic. The +/// synchronous file write is the durable path instead. +pub fn install_panic_hook() { + let prev = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + // Same payload extraction the rest of the codebase uses. + let msg = info + .payload() + .downcast_ref::<&'static str>() + .copied() + .or_else(|| info.payload().downcast_ref::().map(|s| s.as_str())) + .unwrap_or(""); + let location = info + .location() + .map(|l| format!("{}:{}:{}", l.file(), l.line(), l.column())) + .unwrap_or_else(|| "".to_string()); + let thread_name = std::thread::current() + .name() + .unwrap_or("") + .to_string(); + + tracing::error!( + target: "panic", + message = %msg, + location = %location, + thread = %thread_name, + "thread panicked" + ); + + // Guaranteed-on-disk backstop: a fatal main-thread panic unwinds past + // main() without reaching any `shutdown_flush`, so the appender's + // buffered tail (incl. the error above) can be lost. A synchronous + // append here does not depend on the appender being alive. + let millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0); + if let Ok(mut f) = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(log_dir().join("wta-panic.log")) + { + use std::io::Write; + let _ = writeln!( + f, + "[{millis}ms] pid={} thread={thread_name} panicked at {location}: {msg}", + std::process::id() + ); + } + + prev(info); + })); +} + /// Filesystem upkeep run once per process at logging init, before our own /// appender opens. /// diff --git a/tools/wta/src/main.rs b/tools/wta/src/main.rs index b728ba35e..0a19e6de6 100644 --- a/tools/wta/src/main.rs +++ b/tools/wta/src/main.rs @@ -646,6 +646,18 @@ async fn main() -> Result<()> { // is held in a global and flushed via `logging::shutdown_flush()` on every // exit path (see the calls below and before each `process::exit`). logging::init(&process_label(&cli)); + // Log + flush on console teardown signals (pane/tab/window close, logoff, + // shutdown) so a torn-down helper isn't a silent disappearance. Installed + // process-wide; see `install_ctrl_handler` for coverage limits — notably + // the master is job-killed (KILL_ON_JOB_CLOSE) and won't observe these, so + // *this handler* doesn't trace routine master teardown. That teardown is + // still logged, just by the C++ parent: `SharedWta` records both the + // deliberate job-close and an unexpected exit to terminal-agent-pane.log. + logging::install_ctrl_handler(); + // Record panics to disk (+ a synchronous wta-panic.log backstop) so a + // panic isn't a silent death — stderr is invisible for a ConPTY helper / + // CREATE_NO_WINDOW master. Chains the default hook; semantics unchanged. + logging::install_panic_hook(); tracing::info!(version = env!("CARGO_PKG_VERSION"), "=== wta starting ==="); let locale = cli diff --git a/tools/wta/src/master/mod.rs b/tools/wta/src/master/mod.rs index 6f73669b0..ae2790534 100644 --- a/tools/wta/src/master/mod.rs +++ b/tools/wta/src/master/mod.rs @@ -429,7 +429,9 @@ impl acp::Client for MasterClient { "helper notification channel drained — backpressure cleared" ); } - tracing::debug!( + // Per-streamed-chunk; trace-only so default debug logs + // stay readable. Turn-level flow is in `prompt_timing`. + tracing::trace!( target: "master", step = "agent→helper", op = "session_notification", @@ -1268,7 +1270,9 @@ impl acp::Agent for HelperHandler { return handle_sessions_list(&self.state, &args.params).await; } if method == crate::session_registry::INTELLTERM_METHOD_SESSION_HOOK { - tracing::info!( + // Per-session-hook (every tool start/stop/session event) — debug, + // not info; the reducer logs its own outcome where it matters. + tracing::debug!( target: "master", op = "ext_method", method = %method, @@ -1993,7 +1997,9 @@ async fn serve_helper( Some(notif) = notif_rx.recv() => { let sid = notif.session_id.clone(); let kind = notification_kind(¬if); - tracing::debug!( + // Per-streamed-chunk; trace-only to keep default debug logs + // readable (this line alone dominated the master log volume). + tracing::trace!( target: "master", step = "master→helper", op = "session_notification", @@ -2514,11 +2520,29 @@ async fn handle_session_hook( acp::Error::invalid_params().data(serde_json::json!({ "message": err.to_string() })) })?; - tracing::info!( - target: "session_hook", - event = ?event, - "received helper session hook" - ); + // Split by event kind so field diagnosis of session-state bugs survives at + // the default release level: terminal/lifecycle transitions (session + // start/stop, pane closed, connection failed) stay at info; the + // high-frequency routine events (tool start/stop, notifications, resume + // bookkeeping) go to debug. Keeps the load-bearing transitions visible + // without the per-tool flood that dominated the info logs. + { + use crate::agent_sessions::SessionEvent; + // Match on a reference so the level decision borrows rather than + // consumes `event` (it's used again below for the reducer). + let lifecycle = matches!( + &event, + SessionEvent::SessionStarted { .. } + | SessionEvent::SessionStopped { .. } + | SessionEvent::ConnectionFailed { .. } + | SessionEvent::PaneClosed { .. } + ); + if lifecycle { + tracing::info!(target: "session_hook", event = ?event, "received helper session hook"); + } else { + tracing::debug!(target: "session_hook", event = ?event, "received helper session hook"); + } + } // Capture the session key BEFORE moving `event` into the reducer so // we can dispatch the post-apply title refresh against the right diff --git a/tools/wta/src/protocol/acp/client.rs b/tools/wta/src/protocol/acp/client.rs index 9f3d49b09..98ee22368 100644 --- a/tools/wta/src/protocol/acp/client.rs +++ b/tools/wta/src/protocol/acp/client.rs @@ -1554,7 +1554,9 @@ impl acp::Client for WtaClient { async fn session_notification(&self, args: acp::SessionNotification) -> acp::Result<()> { let kind = session_update_kind(&args.update); - acp_log(&format!("session_notification: kind={}", kind)); + // Per-streamed-chunk; trace-only (not via acp_log's debug) so default + // debug logs aren't flooded with one line per token chunk. + tracing::trace!(target: "acp", "session_notification: kind={}", kind); // The full update carries agent message/thought text, tool-call // content, plan bodies, and replayed user-message chunks — trace only. acp_trace_content(&format!("session_notification update: {:?}", args.update));