Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/actions/spelling/allow/allow.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ sids
SIGKILLs
signtool
slnt
spams
splitn
splitw
ssh
Expand Down
1 change: 1 addition & 0 deletions .github/actions/spelling/allow/apis.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ PAGESCROLL
PALLOC
PATINVERT
Pels
PHANDLER
PICKFOLDERS
PINPUT
pmr
Expand Down
11 changes: 11 additions & 0 deletions src/cascadia/TerminalApp/SharedWta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "../WinRTUtils/inc/WtExeUtils.h"
#include "../inc/WtaProcess.h"
#include "AgentPaneLog.h"

namespace winrt::TerminalApp::implementation
{
Expand Down Expand Up @@ -419,6 +420,9 @@ namespace winrt::TerminalApp::implementation
// Order matters: drop the job FIRST so KILL_ON_JOB_CLOSE
// terminates wta + descendants while we still hold a process
// handle that lets us observe the termination if needed.
// Deliberate teardown: the master is reaped silently (job close, no
// console event), so it can't log its own death — record it here.
_agentPaneLog("releasing wta-master pid=" + std::to_string(_pid) + " (deliberate teardown via KILL_ON_JOB_CLOSE)");
_job.reset();
_process.reset();
if (_waitHandle)
Expand Down Expand Up @@ -470,6 +474,13 @@ namespace winrt::TerminalApp::implementation
// ran. Nothing to do.
return;
}
// The master exited on its own — crash, OOM, or an external kill
// (taskkill /F, Task Manager). It can't log its own hard death from
// inside, but this wait callback (the parent observing it) can. This
// is the external observer that makes otherwise-silent master deaths
// diagnosable; deliberate teardowns never reach here (they reset
// _process first, so the validity check above bails).
_agentPaneLog("wta-master exited unexpectedly pid=" + std::to_string(observedPid) + " (crash/OOM/external kill — observed by wait callback)");
_job.reset();
_process.reset();
if (_waitHandle)
Expand Down
1 change: 1 addition & 0 deletions tools/wta/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ windows-sys = { version = "0.61", features = [
"Win32_Globalization",
"Win32_Security_Credentials",
"Win32_Storage_Packaging_Appx",
"Win32_System_Console",
"Win32_System_DataExchange",
"Win32_System_Environment",
"Win32_System_Memory",
Expand Down
4 changes: 3 additions & 1 deletion tools/wta/src/agent_sessions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1142,7 +1142,9 @@ impl AgentSessionRegistry {
self.known_alive_panes.insert(pane_lc);
}
self.dirty = true;
tracing::info!(
// Per-row, fires on every alive-snapshot upgrade — debug,
// not info (this was by far the highest-volume info line).
tracing::debug!(
target: "agent_session_registry",
key = %sid,
pane = ?pane_opt,
Expand Down
18 changes: 13 additions & 5 deletions tools/wta/src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,8 @@ where
}
}
let key_for_refresh = key.clone();
tracing::info!(
// Per-agent-event — debug, not info.
tracing::debug!(
target: "agent_route",
event = %event,
asid = %asid,
Expand Down Expand Up @@ -827,7 +828,8 @@ where
}

let dirty = reg.take_dirty();
tracing::info!(
// Per-agent-event (partner of "routing") — debug, not info.
tracing::debug!(
target: "agent_route",
event = %event,
dirty = dirty,
Expand Down Expand Up @@ -5286,7 +5288,10 @@ impl App {
tab_id,
params,
} => {
tracing::debug!(target: "autofix", method = %method, pane_id = %pane_id, tab_id = ?tab_id, self_pane_id = ?self.pane_id, "WtEvent");
// Per-WT-event (every vt_sequence included) — trace-only; the
// single per-event breadcrumb stays at debug in main.rs
// (`wt_event_rx: received event`).
tracing::trace!(target: "autofix", method = %method, pane_id = %pane_id, tab_id = ?tab_id, self_pane_id = ?self.pane_id, "WtEvent");

// Hook bridge events: fire-and-forget into the agent registry
// so the agent session view stays current. Unrelated to autofix /
Expand Down Expand Up @@ -5906,7 +5911,8 @@ impl App {
}

let notification = classify_wt_event(&method, &pane_id, tab_id.as_deref(), &params);
tracing::debug!(target: "autofix", severity = ?notification.severity, summary = %notification.summary, tab_id = ?notification.tab_id, "classified");
// Per-WT-event classification — trace-only (vt_sequence volume).
tracing::trace!(target: "autofix", severity = ?notification.severity, summary = %notification.summary, tab_id = ?notification.tab_id, "classified");

// Per-tab filter. WT broadcasts pane-scoped events to every
// helper in the window, but another tab's failures are not
Expand All @@ -5921,7 +5927,9 @@ impl App {
&& !self_tab.is_empty()
&& event_tab != self_tab
{
tracing::debug!(
// Per-cross-tab-event (very high volume in multi-tab
// windows) — trace-only.
tracing::trace!(
target: "autofix",
event_tab,
self_tab,
Expand Down
175 changes: 175 additions & 0 deletions tools/wta/src/logging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,181 @@ pub fn shutdown_flush() {
}
}

/// Install a Windows console control handler that records the teardown
/// signal and drains the log appender before the OS terminates us.
///
/// The wta-**helper** runs as a ConPTY child of Windows Terminal (it's the
/// process rendered in the agent pane). When its pane/tab/window closes — or
/// the user logs off / shuts down — the OS delivers a control event
/// (`CTRL_CLOSE`/`CTRL_LOGOFF`/`CTRL_SHUTDOWN`) and then terminates it at the
/// end of a short grace window. Without a handler those deaths are invisible:
/// the process vanishes mid-stream and the non-blocking appender's last
/// buffered records are lost, because [`shutdown_flush`] never runs (the
/// `WorkerGuard` lives in a `static` and `static`s don't `Drop` at teardown).
/// That is exactly the "helper just stopped responding" signature where the
/// success path is logged exhaustively but the teardown path is silent and
/// the incident is undiagnosable.
///
/// This closes that gap for the helper: it logs WHICH control event tore the
/// process down and flushes so the final records (e.g. the transport-lost
/// WARN in `run_acp_client_over_pipe`) reach disk. The handler returns FALSE
/// so the default handler still runs and the process terminates as before —
/// we only ADD a log line + flush, never changing termination behavior. It's
/// installed process-wide (cheap and harmless), so any wta process that does
/// receive a console control event benefits.
///
/// Coverage limits — what this does NOT catch:
/// * The wta-**master** is spawned `CREATE_NO_WINDOW` and contained in a
/// Job Object with `KILL_ON_JOB_CLOSE` (see C++ `SharedWta`). Its normal
/// teardown is the parent dropping that job, which reaps the master like
/// a `TerminateProcess` — NO control event — so *this handler* does not
/// trace routine master teardown. That teardown is not unlogged overall,
/// though: the C++ parent (`SharedWta`) records both the deliberate
/// job-close and an unexpected exit to `terminal-agent-pane.log`. This
/// handler fires for the master only on genuine console signals
/// (logoff/shutdown), if delivered at all.
/// * A hard `TerminateProcess` (Task Manager "End task", `taskkill /F`, an
/// OS resource kill, or the Job-Object reap above) delivers no control
/// event and stays untraceable from inside the process.
/// * While the Ratatui TUI holds the console in raw mode, Ctrl+C arrives as
/// a key event (not `CTRL_C_EVENT`), so this handler doesn't normally see
/// it and doesn't alter the TUI's Ctrl+C behavior.
pub fn install_ctrl_handler() {
use windows_sys::Win32::Foundation::{GetLastError, ERROR_INVALID_HANDLE};
use windows_sys::Win32::System::Console::{
SetConsoleCtrlHandler, CTRL_BREAK_EVENT, CTRL_CLOSE_EVENT, CTRL_C_EVENT,
CTRL_LOGOFF_EVENT, CTRL_SHUTDOWN_EVENT,
};

// Returns `windows_sys`' `BOOL` (an `i32` alias) to match the
// PHANDLER_ROUTINE signature: 0 == FALSE (fall through to default).
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
unsafe extern "system" fn handler(ctrl_type: u32) -> i32 {
let event = match ctrl_type {
CTRL_C_EVENT => "CTRL_C",
CTRL_BREAK_EVENT => "CTRL_BREAK",
CTRL_CLOSE_EVENT => "CTRL_CLOSE",
CTRL_LOGOFF_EVENT => "CTRL_LOGOFF",
CTRL_SHUTDOWN_EVENT => "CTRL_SHUTDOWN",
_ => "UNKNOWN",
};
tracing::warn!(
target: "lifecycle",
ctrl_type,
event,
"console control event received — process being torn down; flushing logs"
);
// Drain the appender so the line above (and any earlier buffered
// records) hit disk before the grace window ends and we're killed.
shutdown_flush();
// FALSE → fall through to the default handler (terminate). We only
// add logging + flush; termination behavior is unchanged.
0
}

// SAFETY: `handler` is a valid `extern "system"` routine matching the
// PHANDLER_ROUTINE signature; registering a control handler is a
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
// process-global, thread-safe Win32 operation.
unsafe {
if SetConsoleCtrlHandler(Some(handler), 1) == 0 {
// Capture the Win32 error immediately, before any other call (incl.
// the logging macro's own work) can reset thread-last-error.
let error_code = GetLastError();
if error_code == ERROR_INVALID_HANDLE {
// Expected for a windowless wta process (the CREATE_NO_WINDOW
// master, a detached CLI invocation): there's no console to
// signal, and teardown for those is covered elsewhere (the C++
// side observes the master via its wait callback). Benign —
// debug only, so it never spams release logs.
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
tracing::debug!(
target: "lifecycle",
error_code,
"SetConsoleCtrlHandler: no console attached (expected for windowless process)"
);
} else {
// Any other failure is the diagnostic feature itself failing to
// arm where we DID expect a console (e.g. the helper) — warn so
// release (info) logs explain why later teardown signals are
// absent rather than leaving it a silent mystery.
tracing::warn!(
target: "lifecycle",
error_code,
"SetConsoleCtrlHandler failed — teardown signals will not be logged"
);
}
}
}
}

/// Install a panic hook that records the panic to disk, then chains to the
/// previous hook.
///
/// A Rust panic otherwise writes only to stderr — invisible for a ConPTY-
/// hosted helper or a `CREATE_NO_WINDOW` master — and the non-blocking
/// appender's buffered tail is lost when a *fatal* panic kills the process
/// before the background worker drains it. So a panic is a "died for no
/// logged reason" blind spot. This closes it WITHOUT changing panic semantics
/// (it chains the previous hook, so unwind/abort and backtraces are
/// unchanged):
/// * a `tracing::error!` so the panic correlates in the normal log (this
/// drains fine for a *recovered* panic, e.g. behind a `catch_unwind`), and
/// * a synchronous append to `wta-panic.log`, independent of the async
/// appender, so the record reaches disk even when a fatal panic kills us.
///
/// It deliberately does NOT call [`shutdown_flush`]: that drops the appender
/// guard and would permanently kill logging after a recoverable panic. The
/// synchronous file write is the durable path instead.
pub fn install_panic_hook() {
let prev = std::panic::take_hook();
std::panic::set_hook(Box::new(move |info| {
// Same payload extraction the rest of the codebase uses.
let msg = info
.payload()
.downcast_ref::<&'static str>()
.copied()
.or_else(|| info.payload().downcast_ref::<String>().map(|s| s.as_str()))
.unwrap_or("<non-string panic payload>");
let location = info
.location()
.map(|l| format!("{}:{}:{}", l.file(), l.line(), l.column()))
.unwrap_or_else(|| "<unknown>".to_string());
let thread_name = std::thread::current()
.name()
.unwrap_or("<unnamed>")
.to_string();

tracing::error!(
target: "panic",
message = %msg,
location = %location,
thread = %thread_name,
"thread panicked"
);

// Guaranteed-on-disk backstop: a fatal main-thread panic unwinds past
// main() without reaching any `shutdown_flush`, so the appender's
// buffered tail (incl. the error above) can be lost. A synchronous
// append here does not depend on the appender being alive.
let millis = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_millis())
.unwrap_or(0);
if let Ok(mut f) = std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(log_dir().join("wta-panic.log"))
{
use std::io::Write;
let _ = writeln!(
f,
"[{millis}ms] pid={} thread={thread_name} panicked at {location}: {msg}",
std::process::id()
);
}

prev(info);
}));
}

/// Filesystem upkeep run once per process at logging init, before our own
/// appender opens.
///
Expand Down
12 changes: 12 additions & 0 deletions tools/wta/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,18 @@ async fn main() -> Result<()> {
// is held in a global and flushed via `logging::shutdown_flush()` on every
// exit path (see the calls below and before each `process::exit`).
logging::init(&process_label(&cli));
// Log + flush on console teardown signals (pane/tab/window close, logoff,
// shutdown) so a torn-down helper isn't a silent disappearance. Installed
// process-wide; see `install_ctrl_handler` for coverage limits — notably
// the master is job-killed (KILL_ON_JOB_CLOSE) and won't observe these, so
// *this handler* doesn't trace routine master teardown. That teardown is
// still logged, just by the C++ parent: `SharedWta` records both the
// deliberate job-close and an unexpected exit to terminal-agent-pane.log.
logging::install_ctrl_handler();
// Record panics to disk (+ a synchronous wta-panic.log backstop) so a
// panic isn't a silent death — stderr is invisible for a ConPTY helper /
// CREATE_NO_WINDOW master. Chains the default hook; semantics unchanged.
logging::install_panic_hook();
tracing::info!(version = env!("CARGO_PKG_VERSION"), "=== wta starting ===");

let locale = cli
Expand Down
40 changes: 32 additions & 8 deletions tools/wta/src/master/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,9 @@ impl acp::Client for MasterClient {
"helper notification channel drained — backpressure cleared"
);
}
tracing::debug!(
// Per-streamed-chunk; trace-only so default debug logs
// stay readable. Turn-level flow is in `prompt_timing`.
tracing::trace!(
target: "master",
step = "agent→helper",
op = "session_notification",
Expand Down Expand Up @@ -1268,7 +1270,9 @@ impl acp::Agent for HelperHandler {
return handle_sessions_list(&self.state, &args.params).await;
}
if method == crate::session_registry::INTELLTERM_METHOD_SESSION_HOOK {
tracing::info!(
// Per-session-hook (every tool start/stop/session event) — debug,
// not info; the reducer logs its own outcome where it matters.
tracing::debug!(
target: "master",
op = "ext_method",
method = %method,
Expand Down Expand Up @@ -1993,7 +1997,9 @@ async fn serve_helper(
Some(notif) = notif_rx.recv() => {
let sid = notif.session_id.clone();
let kind = notification_kind(&notif);
tracing::debug!(
// Per-streamed-chunk; trace-only to keep default debug logs
// readable (this line alone dominated the master log volume).
tracing::trace!(
target: "master",
step = "master→helper",
op = "session_notification",
Expand Down Expand Up @@ -2514,11 +2520,29 @@ async fn handle_session_hook(
acp::Error::invalid_params().data(serde_json::json!({ "message": err.to_string() }))
})?;

tracing::info!(
target: "session_hook",
event = ?event,
"received helper session hook"
);
// Split by event kind so field diagnosis of session-state bugs survives at
// the default release level: terminal/lifecycle transitions (session
// start/stop, pane closed, connection failed) stay at info; the
// high-frequency routine events (tool start/stop, notifications, resume
// bookkeeping) go to debug. Keeps the load-bearing transitions visible
// without the per-tool flood that dominated the info logs.
{
use crate::agent_sessions::SessionEvent;
// Match on a reference so the level decision borrows rather than
// consumes `event` (it's used again below for the reducer).
let lifecycle = matches!(
&event,
SessionEvent::SessionStarted { .. }
| SessionEvent::SessionStopped { .. }
| SessionEvent::ConnectionFailed { .. }
| SessionEvent::PaneClosed { .. }
);
if lifecycle {
tracing::info!(target: "session_hook", event = ?event, "received helper session hook");
} else {
tracing::debug!(target: "session_hook", event = ?event, "received helper session hook");
}
}
Comment thread
Copilot marked this conversation as resolved.

// Capture the session key BEFORE moving `event` into the reducer so
// we can dispatch the post-apply title refresh against the right
Expand Down
4 changes: 3 additions & 1 deletion tools/wta/src/protocol/acp/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1554,7 +1554,9 @@ impl acp::Client for WtaClient {

async fn session_notification(&self, args: acp::SessionNotification) -> acp::Result<()> {
let kind = session_update_kind(&args.update);
acp_log(&format!("session_notification: kind={}", kind));
// Per-streamed-chunk; trace-only (not via acp_log's debug) so default
// debug logs aren't flooded with one line per token chunk.
tracing::trace!(target: "acp", "session_notification: kind={}", kind);
// The full update carries agent message/thought text, tool-call
// content, plan bodies, and replayed user-message chunks — trace only.
acp_trace_content(&format!("session_notification update: {:?}", args.update));
Expand Down
Loading