//! Coordinate contract for the native computer-use tool.
//!
//! # Overview
//! The computer-use tool exposes a single *normalized virtual display* to the
//! model: the dimensions of the returned screenshot (in pixels) define the
//! action coordinate space. Every model-supplied `x`/`y` is a pixel in that
//! screenshot. macOS input injection (`CGEvent`) operates in *logical points*,
//! not physical pixels, so on Retina/HiDPI displays a screenshot pixel and a
//! logical point differ by the display scale factor. This module owns the one
//! authoritative transform from screenshot pixels to macOS logical points, plus
//! strict bounds rejection.
//!
//! It is deliberately framework-free (no `CoreGraphics`, no napi) so the
//! coordinate math is unit-testable without a display or granted permissions.
//! The native capture/input backend that produces [`NormalizedDisplay`] values
//! lands in a later slice (see `docs/computer-use/`).
//!
//! # Example
//! ```
//! use pi_natives::computer::coords::NormalizedDisplay;
//!
//! // A 200x100-point Retina display captured at 2x => 400x200 screenshot px.
//! let display = NormalizedDisplay::new(400, 200, 2.0, 2.0, 0.0, 0.0);
//! let point = display.to_logical_point(100.0, 50.0).unwrap();
//! assert!((point.x - 50.0).abs() < 0.5);
//! assert!((point.y - 25.0).abs() < 0.5);
//! ```

use core::fmt;

/// A point in macOS logical (point) coordinate space, suitable for `CGEvent`
/// injection by the native input backend.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct LogicalPoint {
    /// Logical X (points), including the display's logical origin.
    pub x: f64,
    /// Logical Y (points), including the display's logical origin.
    pub y: f64,
}

/// Reason a screenshot-space pixel could not be mapped to a logical point.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CoordError {
    /// The pixel coordinate is outside the normalized display bounds, or not a
    /// finite number. Side-effecting actions must reject rather than clamp.
    OutOfBounds {
        /// Offending X pixel.
        x: f64,
        /// Offending Y pixel.
        y: f64,
        /// Normalized display width in pixels.
        width_px: u32,
        /// Normalized display height in pixels.
        height_px: u32,
    },
    /// The display descriptor has a non-positive or non-finite scale factor, so
    /// no correct transform exists.
    InvalidScale {
        /// Offending X scale.
        scale_x: f64,
        /// Offending Y scale.
        scale_y: f64,
    },
}

impl fmt::Display for CoordError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Self::OutOfBounds {
                x,
                y,
                width_px,
                height_px,
            } => write!(
                f,
                "pixel ({x}, {y}) is out of bounds for a {width_px}x{height_px} normalized display"
            ),
            Self::InvalidScale { scale_x, scale_y } => {
                write!(
                    f,
                    "invalid display scale ({scale_x}, {scale_y}); must be finite and > 0"
                )
            }
        }
    }
}

impl std::error::Error for CoordError {}

/// Descriptor of the single normalized virtual display whose screenshot pixels
/// define the action coordinate space.
///
/// `scale_x`/`scale_y` are the per-axis ratios of physical screenshot pixels to
/// logical points (typically `1.0` on non-Retina and `2.0` on Retina).
/// `origin_x`/`origin_y` are the display's logical origin, preserved so the
/// transform stays correct for non-zero display origins.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct NormalizedDisplay {
    /// Screenshot width in physical pixels.
    pub width_px: u32,
    /// Screenshot height in physical pixels.
    pub height_px: u32,
    /// Physical-pixels-per-logical-point along X.
    pub scale_x: f64,
    /// Physical-pixels-per-logical-point along Y.
    pub scale_y: f64,
    /// Logical origin X of the display (points).
    pub origin_x: f64,
    /// Logical origin Y of the display (points).
    pub origin_y: f64,
}

impl NormalizedDisplay {
    /// Construct a descriptor from raw capture geometry.
    ///
    /// No validation happens here; an unusable scale is reported later by
    /// [`Self::to_logical_point`].
    #[must_use]
    pub const fn new(
        width_px: u32,
        height_px: u32,
        scale_x: f64,
        scale_y: f64,
        origin_x: f64,
        origin_y: f64,
    ) -> Self {
        Self {
            width_px,
            height_px,
            scale_x,
            scale_y,
            origin_x,
            origin_y,
        }
    }

    /// Whether both scale factors are finite and strictly positive.
    #[must_use]
    pub fn has_valid_scale(&self) -> bool {
        self.scale_x.is_finite()
            && self.scale_x > 0.0
            && self.scale_y.is_finite()
            && self.scale_y > 0.0
    }

    /// Whether `(x, y)` is a finite pixel inside `[0, width_px) x [0,
    /// height_px)`.
    #[must_use]
    pub fn contains(&self, x: f64, y: f64) -> bool {
        // `Range::contains` is false for NaN, so non-finite pixels are rejected too.
        (0.0..f64::from(self.width_px)).contains(&x)
            && (0.0..f64::from(self.height_px)).contains(&y)
    }

    /// Map a screenshot-space pixel to a macOS logical point.
    ///
    /// The scale is checked before the bounds, so an unusable descriptor is
    /// always reported as [`CoordError::InvalidScale`] regardless of `(x, y)`.
    /// The logical origin is added as-is; it is not validated here.
    ///
    /// # Errors
    /// Returns [`CoordError::InvalidScale`] when the descriptor's scale is not
    /// finite and positive, or [`CoordError::OutOfBounds`] when `(x, y)` is not
    /// a finite pixel inside the display bounds.
    pub fn to_logical_point(&self, x: f64, y: f64) -> Result<LogicalPoint, CoordError> {
        if !self.has_valid_scale() {
            return Err(CoordError::InvalidScale {
                scale_x: self.scale_x,
                scale_y: self.scale_y,
            });
        }
        if !self.contains(x, y) {
            return Err(CoordError::OutOfBounds {
                x,
                y,
                width_px: self.width_px,
                height_px: self.height_px,
            });
        }
        Ok(LogicalPoint {
            x: self.origin_x + x / self.scale_x,
            y: self.origin_y + y / self.scale_y,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::{CoordError, NormalizedDisplay};

    /// Logical points must match the expected value well within the 0.5-point
    /// accuracy tolerance the plan requires.
    const TOLERANCE: f64 = 0.5;

    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < TOLERANCE,
            "expected ~{expected}, got {actual}"
        );
    }

    #[test]
    fn identity_scale_zero_origin() {
        let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0);
        let p = display.to_logical_point(40.0, 60.0).unwrap();
        assert_close(p.x, 40.0);
        assert_close(p.y, 60.0);
    }

    #[test]
    fn retina_scale_halves_pixels() {
        let display = NormalizedDisplay::new(400, 200, 2.0, 2.0, 0.0, 0.0);
        let p = display.to_logical_point(100.0, 50.0).unwrap();
        assert_close(p.x, 50.0);
        assert_close(p.y, 25.0);
    }

    #[test]
    fn fractional_scale() {
        let display = NormalizedDisplay::new(300, 150, 1.5, 1.5, 0.0, 0.0);
        let p = display.to_logical_point(150.0, 75.0).unwrap();
        assert_close(p.x, 100.0);
        assert_close(p.y, 50.0);
    }

    #[test]
    fn non_zero_origin_is_preserved() {
        let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 10.0, 20.0);
        let p = display.to_logical_point(5.0, 5.0).unwrap();
        assert_close(p.x, 15.0);
        assert_close(p.y, 25.0);
    }

    #[test]
    fn anisotropic_scale_per_axis() {
        let display = NormalizedDisplay::new(200, 100, 2.0, 1.0, 0.0, 0.0);
        let p = display.to_logical_point(100.0, 40.0).unwrap();
        assert_close(p.x, 50.0);
        assert_close(p.y, 40.0);
    }

    #[test]
    fn top_left_edge_is_inside() {
        let display = NormalizedDisplay::new(100, 100, 2.0, 2.0, 0.0, 0.0);
        assert!(display.to_logical_point(0.0, 0.0).is_ok());
    }

    #[test]
    fn bottom_right_inclusive_pixel_is_inside() {
        let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0);
        assert!(display.to_logical_point(99.0, 99.0).is_ok());
    }

    #[test]
    fn width_height_pixel_is_out_of_bounds() {
        let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0);
        assert!(matches!(
            display.to_logical_point(100.0, 0.0),
            Err(CoordError::OutOfBounds { .. })
        ));
        assert!(matches!(
            display.to_logical_point(0.0, 100.0),
            Err(CoordError::OutOfBounds { .. })
        ));
    }

    #[test]
    fn negative_pixel_is_out_of_bounds() {
        let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0);
        assert!(matches!(
            display.to_logical_point(-1.0, 10.0),
            Err(CoordError::OutOfBounds { .. })
        ));
    }

    #[test]
    fn non_finite_pixel_is_out_of_bounds() {
        let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0);
        assert!(matches!(
            display.to_logical_point(f64::NAN, 10.0),
            Err(CoordError::OutOfBounds { .. })
        ));
        assert!(matches!(
            display.to_logical_point(10.0, f64::INFINITY),
            Err(CoordError::OutOfBounds { .. })
        ));
    }

    #[test]
    fn invalid_scale_is_rejected() {
        for (sx, sy) in [(0.0, 1.0), (1.0, -2.0), (f64::NAN, 1.0)] {
            let display = NormalizedDisplay::new(100, 100, sx, sy, 0.0, 0.0);
            assert!(matches!(
                display.to_logical_point(10.0, 10.0),
                Err(CoordError::InvalidScale { .. })
            ));
        }
    }

    #[test]
    fn invalid_scale_takes_priority_over_bounds() {
        let display = NormalizedDisplay::new(100, 100, 0.0, 1.0, 0.0, 0.0);
        assert!(matches!(
            display.to_logical_point(999.0, 999.0),
            Err(CoordError::InvalidScale { .. })
        ));
    }
}
The native capture/input backend, the kill-switch +//! supervisor + event-tap lifecycle, and the napi `ComputerController` surface +//! land in later slices. See `docs/computer-use/` for the approved spec, the +//! consensus plan, and the architecture decision record. +//! +//! # Architecture +//! ```text +//! model -> packages/coding-agent (computer tool, exact OpenAI schema) +//! -> packages/natives (napi bindings) +//! -> pi-natives::computer (execute_action state machine + backend) +//! ``` + +pub mod coords; + +pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; diff --git a/crates/pi-natives/src/lib.rs b/crates/pi-natives/src/lib.rs index 602f6eadc..8153ad540 100644 --- a/crates/pi-natives/src/lib.rs +++ b/crates/pi-natives/src/lib.rs @@ -25,6 +25,7 @@ pub mod appearance; pub mod ast; pub mod build_info; pub mod clipboard; +pub mod computer; pub mod crash; pub mod edit_fuzzy; pub mod fd; diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md new file mode 100644 index 000000000..1c7a00015 --- /dev/null +++ b/docs/computer-use/README.md @@ -0,0 +1,74 @@ +# Native computer-use tool + +Status: **in progress (draft)** — Slice 1 foundation only. + +A new, model-agnostic `computer` tool that lets any model drive the user's real +macOS desktop via the OpenAI computer-use action set. Built fresh (the +open-source `openai/codex` repo has no GUI computer-use source to copy; only the +public action *schema* is mirrored). + +This feature was scoped through GJC's deep-interview (requirements) and ralplan +(Planner/Architect/Critic consensus) workflows. The full deep-interview spec and +the consensus plan + ADR are the authoritative source of truth; this document is +the committed summary and roadmap. + +## Locked decisions (ADR summary) + +- **Target:** the user's real macOS desktop, OS-native control. v1 is macOS-only + (Linux/Windows deferred behind the same tool schema). 
+- **Driver:** any model via a generic structured tool-call interface — no + provider-specific computer-use API. +- **Action set:** the exact OpenAI computer-use primitives — `screenshot`, + `click`, `double_click`, `move`, `drag`, `scroll`, `type`, `keypress`, `wait`. +- **Implementation:** built fresh in the Rust `pi-natives` crate (napi), + exposed through `packages/natives` to a new + `packages/coding-agent/src/tools/computer.ts`, kept deliberately lower-level + than the existing `browser` tool (coordinate/input primitives only, no web + semantics). +- **Coordinate contract:** a single normalized virtual display. The returned + screenshot's pixel dimensions *are* the action coordinate space; Rust owns the + transform to macOS logical points (Retina/HiDPI-safe) and display selection. +- **Permissions:** macOS TCC (Accessibility + Screen Recording) auto-preflighted; + on a missing grant, open the relevant Settings pane and return a clear + "grant then retry/relaunch" error. +- **Gating:** off by default; opt-in config flag (per session) plus a persistent + always-on option. +- **Safety:** no per-action approval (autonomous), **but** a daemon-enforced + global kill-switch outside model control (global hotkey OR TUI stop key) that + aborts queued actions, releases held keys/buttons, suspends further input, and + snapshots the last screen. Reset is user-only, never via the model-facing tool. +- **Architecture:** every primitive delegates to one central Rust + `execute_action` state machine (preflight, validation, cancellation, audit, + screenshot policy, release-all) so per-primitive methods cannot drift past the + safety contract. The in-process supervisor sits behind a `SupervisorClient` + boundary so an out-of-process daemon can replace it later without changing the + napi surface. 
+ +## Coordinate contract (shipped) + +`crates/pi-natives/src/computer/coords.rs` implements the pure, framework-free +core: `NormalizedDisplay` maps a screenshot-space pixel `(x, y)` to a macOS +logical point via per-axis scale and the display's logical origin, rejecting +out-of-bounds and non-finite inputs. It is unit-tested (scale 1.0/2.0, +fractional and anisotropic scale, non-zero origins, edges, out-of-bounds, +invalid scale) and requires no display or granted permissions. + +## Delivery roadmap + +Delivery ships a `screenshot`+`click`+`type` vertical slice first; the remaining +six primitives fast-follow; v1 acceptance = all nine primitives drive a real +macOS app end-to-end plus a kill-switch drill (per-primitive napi unit tests + +manual macOS E2E). + +| Slice | Scope | Status | +|-------|-------|--------| +| Coordinate contract + planning docs | `coords` module + unit tests + this doc | **done (this PR)** | +| Native capture + input backend | capture, input, permissions, `execute_action` | planned | +| Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | planned | +| napi bindings + TS `computer` tool surface | `ComputerController`, schema, gating, prompt, renderer | planned | +| Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | + +The native backend, kill-switch, napi/TS surface, and manual end-to-end +acceptance require real macOS hardware, granted TCC permissions, and a +human-operated drill, so they are tracked as follow-up work rather than landed +in this draft. 
From 7d16c357eb654f09cbe08c4c7003ef9d2b291f2f Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 19:46:09 +0900 Subject: [PATCH 02/23] feat(pi-natives): add macOS screen-capture primitive (computer-use) Adds crates/pi-natives/src/computer/capture.rs (macOS-gated): read-only primary-display capture via raw CoreGraphics FFI into a PNG plus the NormalizedDisplay descriptor (scale derived from captured physical pixels vs logical bounds). A missing Screen Recording grant surfaces CaptureError::CaptureFailed rather than a silent black frame. Verified live with Screen Recording granted: a real, non-uniform primary-display capture decodes as a PNG with matching dimensions (cargo test -p pi-natives --ignored captures_non_uniform_primary_display). The GUI capture test is #[ignore] so CI stays deterministic. --- crates/pi-natives/src/computer/capture.rs | 252 ++++++++++++++++++++++ crates/pi-natives/src/computer/mod.rs | 4 + docs/computer-use/README.md | 21 +- 3 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 crates/pi-natives/src/computer/capture.rs diff --git a/crates/pi-natives/src/computer/capture.rs b/crates/pi-natives/src/computer/capture.rs new file mode 100644 index 000000000..23efe9f8a --- /dev/null +++ b/crates/pi-natives/src/computer/capture.rs @@ -0,0 +1,252 @@ +//! Primary-display screen capture (macOS). +//! +//! # Overview +//! Read-only capture of the current primary display into a PNG plus the +//! [`NormalizedDisplay`] descriptor whose pixel dimensions define the action +//! coordinate space (see [`super::coords`]). The display scale is derived from +//! the captured physical pixel size versus the logical display bounds, so the +//! coordinate contract stays correct on Retina/HiDPI. +//! +//! Capture requires the macOS Screen Recording (TCC) permission. When it is not +//! granted, `CGDisplayCreateImage` returns null and this surfaces +//! [`CaptureError::CaptureFailed`] rather than silently returning a black +//! frame. 
+//! +//! Implemented with raw CoreGraphics FFI (no extra crates); the buffer is owned +//! Rust memory and every Core Graphics handle is released exactly once. + +use std::{ffi::c_void, fmt}; + +use crate::computer::coords::NormalizedDisplay; + +#[repr(C)] +#[derive(Clone, Copy)] +struct CgPoint { + x: f64, + y: f64, +} + +#[repr(C)] +#[derive(Clone, Copy)] +struct CgSize { + width: f64, + height: f64, +} + +#[repr(C)] +#[derive(Clone, Copy)] +struct CgRect { + origin: CgPoint, + size: CgSize, +} + +type CgDirectDisplayId = u32; +type CgImageRef = *mut c_void; +type CgColorSpaceRef = *mut c_void; +type CgContextRef = *mut c_void; + +/// `kCGImageAlphaPremultipliedLast` (1) | `kCGBitmapByteOrder32Big` (4 << 12) +/// yields an RGBA8888 byte layout. +const RGBA_BITMAP_INFO: u32 = 1 | (4 << 12); +const BITS_PER_COMPONENT: usize = 8; +const BYTES_PER_PIXEL: usize = 4; + +#[link(name = "CoreGraphics", kind = "framework")] +unsafe extern "C" { + fn CGMainDisplayID() -> CgDirectDisplayId; + fn CGDisplayBounds(display: CgDirectDisplayId) -> CgRect; + fn CGDisplayCreateImage(display: CgDirectDisplayId) -> CgImageRef; + fn CGImageGetWidth(image: CgImageRef) -> usize; + fn CGImageGetHeight(image: CgImageRef) -> usize; + fn CGImageRelease(image: CgImageRef); + fn CGColorSpaceCreateDeviceRGB() -> CgColorSpaceRef; + fn CGColorSpaceRelease(space: CgColorSpaceRef); + fn CGBitmapContextCreate( + data: *mut c_void, + width: usize, + height: usize, + bits_per_component: usize, + bytes_per_row: usize, + space: CgColorSpaceRef, + bitmap_info: u32, + ) -> CgContextRef; + fn CGContextDrawImage(context: CgContextRef, rect: CgRect, image: CgImageRef); + fn CGContextRelease(context: CgContextRef); +} + +/// Reason a primary-display capture failed. +#[derive(Debug, Clone)] +pub enum CaptureError { + /// `CGDisplayCreateImage` returned null or a zero-sized image — commonly the + /// Screen Recording permission is not granted. 
+ CaptureFailed, + /// A Core Graphics color space or bitmap context could not be created. + ContextFailed, + /// The captured frame could not be PNG-encoded. + Encode(String), +} + +impl fmt::Display for CaptureError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::CaptureFailed => { + write!(f, "screen capture failed; the Screen Recording permission may not be granted") + }, + Self::ContextFailed => write!(f, "failed to create a Core Graphics bitmap context"), + Self::Encode(reason) => write!(f, "failed to encode captured frame as PNG: {reason}"), + } + } +} + +impl std::error::Error for CaptureError {} + +/// A captured primary-display frame. +pub struct CapturedFrame { + /// Coordinate descriptor for the captured display. + pub display: NormalizedDisplay, + /// PNG-encoded RGBA image bytes. + pub png: Vec, +} + +/// Capture the current primary display as a PNG plus its coordinate descriptor. +/// +/// # Errors +/// Returns [`CaptureError`] when the OS capture call fails (often a missing +/// Screen Recording grant), a bitmap context cannot be created, or PNG encoding +/// fails. +pub fn capture_primary_display() -> Result { + // SAFETY: both calls are pure Core Graphics queries; `CGMainDisplayID` + // returns a valid id for the active primary display and `CGDisplayBounds` + // reads geometry for that id. + let (display_id, bounds) = unsafe { + let id = CGMainDisplayID(); + (id, CGDisplayBounds(id)) + }; + + // SAFETY: `display_id` is a valid primary-display id. The returned image is + // released exactly once below regardless of the `frame_from_image` result. + let image = unsafe { CGDisplayCreateImage(display_id) }; + if image.is_null() { + return Err(CaptureError::CaptureFailed); + } + + let result = frame_from_image(image, bounds); + + // SAFETY: `image` is non-null (checked above) and not used after release. + unsafe { CGImageRelease(image) }; + result +} + +/// Convert a non-null `CGImage` into a [`CapturedFrame`]. 
Does not release +/// `image`; the caller owns its lifetime. +fn frame_from_image(image: CgImageRef, bounds: CgRect) -> Result { + // SAFETY: `image` is non-null per the caller's check. + let (width, height) = unsafe { (CGImageGetWidth(image), CGImageGetHeight(image)) }; + if width == 0 || height == 0 { + return Err(CaptureError::CaptureFailed); + } + + let bytes_per_row = width * BYTES_PER_PIXEL; + let mut buffer = vec![0u8; bytes_per_row * height]; + + // SAFETY: device RGB color space; released on every path below. + let space = unsafe { CGColorSpaceCreateDeviceRGB() }; + if space.is_null() { + return Err(CaptureError::ContextFailed); + } + + // SAFETY: `buffer` is exactly `bytes_per_row * height` bytes, matching the + // dimensions/stride passed here; `space` is non-null. + let context = unsafe { + CGBitmapContextCreate( + buffer.as_mut_ptr().cast::(), + width, + height, + BITS_PER_COMPONENT, + bytes_per_row, + space, + RGBA_BITMAP_INFO, + ) + }; + if context.is_null() { + // SAFETY: `space` is non-null and released exactly once here. + unsafe { CGColorSpaceRelease(space) }; + return Err(CaptureError::ContextFailed); + } + + let rect = CgRect { + origin: CgPoint { x: 0.0, y: 0.0 }, + size: CgSize { width: width as f64, height: height as f64 }, + }; + // SAFETY: `context` and `image` are non-null; `rect` matches the buffer the + // context was created over, so the draw stays in bounds. + unsafe { CGContextDrawImage(context, rect, image) }; + + // SAFETY: both handles are non-null and released exactly once; not used after. 
+ unsafe { + CGContextRelease(context); + CGColorSpaceRelease(space); + } + + let png = encode_png(&buffer, width as u32, height as u32)?; + let scale_x = derive_scale(width as f64, bounds.size.width); + let scale_y = derive_scale(height as f64, bounds.size.height); + + Ok(CapturedFrame { + display: NormalizedDisplay::new( + width as u32, + height as u32, + scale_x, + scale_y, + bounds.origin.x, + bounds.origin.y, + ), + png, + }) +} + +/// Scale = physical pixels / logical points, defaulting to `1.0` when the +/// logical extent is not positive. +fn derive_scale(pixels: f64, logical: f64) -> f64 { + if logical > 0.0 { pixels / logical } else { 1.0 } +} + +fn encode_png(rgba: &[u8], width: u32, height: u32) -> Result, CaptureError> { + use image::{ExtendedColorType, ImageEncoder, codecs::png::PngEncoder}; + + let mut out = Vec::new(); + PngEncoder::new(&mut out) + .write_image(rgba, width, height, ExtendedColorType::Rgba8) + .map_err(|err| CaptureError::Encode(err.to_string()))?; + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::capture_primary_display; + + /// Exercises the real OS capture path, so it is ignored by default and run + /// explicitly (`cargo test -p pi-natives --ignored`) on a macOS host with + /// Screen Recording granted. 
+ #[test] + #[ignore = "captures the real primary display; needs macOS + Screen Recording grant"] + fn captures_non_uniform_primary_display() { + let frame = capture_primary_display() + .expect("capture should succeed when Screen Recording is granted"); + assert!(frame.display.width_px > 0 && frame.display.height_px > 0); + + let decoded = image::load_from_memory(&frame.png).expect("captured bytes decode as PNG"); + assert_eq!(decoded.width(), frame.display.width_px); + assert_eq!(decoded.height(), frame.display.height_px); + + let rgba = decoded.to_rgba8(); + let first = rgba.pixels().next().copied(); + let non_uniform = rgba.pixels().any(|pixel| Some(*pixel) != first); + assert!( + non_uniform, + "captured frame is uniform (black/blank) — Screen Recording likely not granted" + ); + + std::fs::write("/tmp/computer-capture-evidence.png", &frame.png).ok(); + } +} diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index a8a4298da..d41f6f253 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -21,6 +21,10 @@ //! -> pi-natives::computer (execute_action state machine + backend) //! ``` +#[cfg(target_os = "macos")] +pub mod capture; pub mod coords; +#[cfg(target_os = "macos")] +pub use capture::{CaptureError, CapturedFrame, capture_primary_display}; pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md index 1c7a00015..361f0a9db 100644 --- a/docs/computer-use/README.md +++ b/docs/computer-use/README.md @@ -1,6 +1,8 @@ # Native computer-use tool -Status: **in progress (draft)** — Slice 1 foundation only. +Status: **in progress (draft)** — coordinate contract + native `screenshot` +capture landed and verified; input primitives, kill-switch, and napi/TS surface +to follow. A new, model-agnostic `computer` tool that lets any model drive the user's real macOS desktop via the OpenAI computer-use action set. 
Built fresh (the @@ -44,7 +46,7 @@ the committed summary and roadmap. boundary so an out-of-process daemon can replace it later without changing the napi surface. -## Coordinate contract (shipped) +## Capture + coordinate contract (shipped) `crates/pi-natives/src/computer/coords.rs` implements the pure, framework-free core: `NormalizedDisplay` maps a screenshot-space pixel `(x, y)` to a macOS @@ -53,6 +55,14 @@ out-of-bounds and non-finite inputs. It is unit-tested (scale 1.0/2.0, fractional and anisotropic scale, non-zero origins, edges, out-of-bounds, invalid scale) and requires no display or granted permissions. +`crates/pi-natives/src/computer/capture.rs` (macOS) implements the read-only +`screenshot` primitive: it captures the primary display via CoreGraphics into a +PNG and derives the `NormalizedDisplay` scale from captured physical pixels vs +logical bounds, surfacing a missing Screen Recording grant as +`CaptureError::CaptureFailed` (never a silent black frame). Verified live: a +real, non-uniform primary-display capture decodes as a PNG with matching +dimensions (`cargo test -p pi-natives --ignored captures_non_uniform_primary_display`). + ## Delivery roadmap Delivery ships a `screenshot`+`click`+`type` vertical slice first; the remaining @@ -63,12 +73,13 @@ manual macOS E2E). 
| Slice | Scope | Status | |-------|-------|--------| | Coordinate contract + planning docs | `coords` module + unit tests + this doc | **done (this PR)** | -| Native capture + input backend | capture, input, permissions, `execute_action` | planned | +| Native screen capture (`screenshot`) | `capture` module, primary display, PNG + scale | **done (this PR, verified live)** | +| Native input backend | `input`, `permissions`, `execute_action` (click/type/drag/scroll/keypress) | planned | | Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | planned | | napi bindings + TS `computer` tool surface | `ComputerController`, schema, gating, prompt, renderer | planned | | Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | -The native backend, kill-switch, napi/TS surface, and manual end-to-end -acceptance require real macOS hardware, granted TCC permissions, and a +The remaining input backend, kill-switch, napi/TS surface, and manual +end-to-end acceptance still require injecting events into a live desktop and a human-operated drill, so they are tracked as follow-up work rather than landed in this draft. From 52182e5b94d134093ac0ca1255925f3581f5bffc Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 20:57:13 +0900 Subject: [PATCH 03/23] feat(pi-natives): add macOS TCC preflight for computer-use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds crates/pi-natives/src/computer/permissions.rs (macOS): non-prompting preflight for Accessibility (input injection) and Screen Recording (capture), Settings-pane openers, and require_*_for_input/capture guards returning COMPUTER_PERMISSION_REQUIRED so callers fail closed. This is the fail-closed gate prerequisite: input must not fire unless Accessibility preflight passes. 
Live probe in this environment reports accessibility=false (and screen_recording=false for the executing binary, which still captured via host-process inheritance) — confirming input injection / live E2E require Accessibility granted to the actual executing gjc process before they can proceed. --- crates/pi-natives/src/computer/mod.rs | 4 + crates/pi-natives/src/computer/permissions.rs | 185 ++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 crates/pi-natives/src/computer/permissions.rs diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index d41f6f253..ae098233d 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -24,7 +24,11 @@ #[cfg(target_os = "macos")] pub mod capture; pub mod coords; +#[cfg(target_os = "macos")] +pub mod permissions; #[cfg(target_os = "macos")] pub use capture::{CaptureError, CapturedFrame, capture_primary_display}; pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; +#[cfg(target_os = "macos")] +pub use permissions::{PermissionError, PreflightStatus, TccPermission, preflight}; diff --git a/crates/pi-natives/src/computer/permissions.rs b/crates/pi-natives/src/computer/permissions.rs new file mode 100644 index 000000000..6bb0ea7d4 --- /dev/null +++ b/crates/pi-natives/src/computer/permissions.rs @@ -0,0 +1,185 @@ +//! macOS TCC permission preflight for computer-use (macOS). +//! +//! # Overview +//! Two distinct TCC permissions gate the computer tool: +//! - **Screen Recording** — required for `screenshot` capture (see +//! [`super::capture`]). +//! - **Accessibility** — required for input injection (click/type/etc.). This +//! is a *separate* grant from Screen Recording. +//! +//! This module performs non-prompting preflight checks and can open the correct +//! System Settings pane so the user can grant a missing permission, then retry. +//! It never injects input and never blocks; callers gate side effects on +//! 
[`preflight`] and surface [`PermissionError`] when a required grant is +//! missing rather than acting on a stale assumption. + +use std::process::Command; + +#[link(name = "ApplicationServices", kind = "framework")] +unsafe extern "C" { + /// Returns whether the current process is a trusted Accessibility client + /// (no prompt). Equivalent to `AXIsProcessTrustedWithOptions(NULL)`. + fn AXIsProcessTrusted() -> bool; +} + +#[link(name = "CoreGraphics", kind = "framework")] +unsafe extern "C" { + /// Returns whether the current process already has Screen Recording access, + /// without prompting. + fn CGPreflightScreenCaptureAccess() -> bool; +} + +/// A TCC permission the computer tool depends on. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TccPermission { + /// Accessibility — required for input injection. + Accessibility, + /// Screen Recording — required for screen capture. + ScreenRecording, +} + +impl TccPermission { + /// The `x-apple.systempreferences:` URL for this permission's settings pane. + #[must_use] + pub const fn settings_url(self) -> &'static str { + match self { + Self::Accessibility => { + "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility" + }, + Self::ScreenRecording => { + "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture" + }, + } + } +} + +/// Current grant state for the permissions the computer tool needs. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PreflightStatus { + /// Whether Accessibility (input injection) is granted. + pub accessibility: bool, + /// Whether Screen Recording (capture) is granted. + pub screen_recording: bool, +} + +/// Error returned when a required permission is missing. Carries the offending +/// permission so the caller can open the right Settings pane and ask the user +/// to grant it, then retry. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PermissionError { + /// The missing permission. 
+ pub missing: TccPermission, +} + +impl std::fmt::Display for PermissionError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let (name, what) = match self.missing { + TccPermission::Accessibility => ("Accessibility", "inject input"), + TccPermission::ScreenRecording => ("Screen Recording", "capture the screen"), + }; + write!( + f, + "COMPUTER_PERMISSION_REQUIRED: {name} permission is required to {what}. Grant it in \ + System Settings (opened for you), then retry." + ) + } +} + +impl std::error::Error for PermissionError {} + +/// Whether the process is a trusted Accessibility client (no prompt). +#[must_use] +pub fn accessibility_granted() -> bool { + // SAFETY: `AXIsProcessTrusted` takes no arguments and only reads the current + // process's TCC trust state. + unsafe { AXIsProcessTrusted() } +} + +/// Whether the process already has Screen Recording access (no prompt). +#[must_use] +pub fn screen_recording_granted() -> bool { + // SAFETY: `CGPreflightScreenCaptureAccess` takes no arguments and only reads + // the current process's capture-access state. + unsafe { CGPreflightScreenCaptureAccess() } +} + +/// Read the current grant state for both required permissions. +#[must_use] +pub fn preflight() -> PreflightStatus { + PreflightStatus { + accessibility: accessibility_granted(), + screen_recording: screen_recording_granted(), + } +} + +/// Open the System Settings pane for `permission` via `open(1)`. Best-effort; +/// returns whether the launch was spawned successfully. +pub fn open_settings(permission: TccPermission) -> bool { + Command::new("open") + .arg(permission.settings_url()) + .status() + .is_ok_and(|status| status.success()) +} + +/// Ensure Accessibility is granted for input injection. +/// +/// On failure, opens the Accessibility settings pane and returns +/// [`PermissionError`] so the caller can fail closed and prompt a +/// grant-then-retry — never proceeding to inject input. 
+/// +/// # Errors +/// Returns [`PermissionError`] when Accessibility is not granted. +pub fn require_accessibility_for_input() -> Result<(), PermissionError> { + if accessibility_granted() { + return Ok(()); + } + let _ = open_settings(TccPermission::Accessibility); + Err(PermissionError { missing: TccPermission::Accessibility }) +} + +/// Ensure Screen Recording is granted for capture. +/// +/// On failure, opens the Screen Recording settings pane and returns +/// [`PermissionError`]. +/// +/// # Errors +/// Returns [`PermissionError`] when Screen Recording is not granted. +pub fn require_screen_recording_for_capture() -> Result<(), PermissionError> { + if screen_recording_granted() { + return Ok(()); + } + let _ = open_settings(TccPermission::ScreenRecording); + Err(PermissionError { missing: TccPermission::ScreenRecording }) +} + +#[cfg(test)] +mod tests { + use super::{TccPermission, preflight}; + + #[test] + fn settings_urls_target_the_privacy_panes() { + assert!( + TccPermission::Accessibility + .settings_url() + .contains("Privacy_Accessibility") + ); + assert!( + TccPermission::ScreenRecording + .settings_url() + .contains("Privacy_ScreenCapture") + ); + } + + /// Reports the live TCC grant state. Ignored by default (result depends on + /// the host's granted permissions); run explicitly to learn whether input + /// injection (Accessibility) is currently possible. 
+ #[test] + #[ignore = "reports live TCC grant state; environment-dependent"] + fn report_live_preflight() { + let status = preflight(); + println!( + "TCC preflight: accessibility={} screen_recording={}", + status.accessibility, status.screen_recording + ); + } +} From 31a139dae284643cbc65f2afe00149d5cd6ffa01 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 21:14:07 +0900 Subject: [PATCH 04/23] feat(computer-use): expose screenshot via napi -> packages/natives (TS) Adds a #[napi] computerScreenshot binding over the landed CoreGraphics capture, regenerates packages/natives bindings, and adds a TS test that verifies a decodable PNG with matching dimensions. Verified live (bun --cwd=packages/natives test computer: 1 pass, 13 expect()). This wires the read-only screenshot primitive through the napi -> TS bridge. Input primitives + kill-switch remain gated: live verification requires Accessibility granted to the actual gjc process (a cargo/test binary is never TCC-trusted for input injection). --- crates/pi-natives/src/computer/capture.rs | 49 +++++++++++++++++++++++ docs/computer-use/README.md | 6 ++- packages/natives/native/index.d.ts | 34 ++++++++++++++++ packages/natives/native/index.js | 1 + packages/natives/test/computer.test.ts | 32 +++++++++++++++ 5 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 packages/natives/test/computer.test.ts diff --git a/crates/pi-natives/src/computer/capture.rs b/crates/pi-natives/src/computer/capture.rs index 23efe9f8a..ec4621a22 100644 --- a/crates/pi-natives/src/computer/capture.rs +++ b/crates/pi-natives/src/computer/capture.rs @@ -17,6 +17,9 @@ use std::{ffi::c_void, fmt}; +use napi::bindgen_prelude::Uint8Array; +use napi_derive::napi; + use crate::computer::coords::NormalizedDisplay; #[repr(C)] @@ -250,3 +253,49 @@ mod tests { std::fs::write("/tmp/computer-capture-evidence.png", &frame.png).ok(); } } + +/// A captured primary-display screenshot returned to JS. 
+/// +/// `width_px`/`height_px` are the physical pixels that define the action +/// coordinate space (see the coordinate contract); the scale/origin map them to +/// macOS logical points. +#[napi(object)] +pub struct ComputerScreenshot { + /// PNG-encoded image bytes. + pub png: Uint8Array, + /// Screenshot width in physical pixels. + pub width_px: u32, + /// Screenshot height in physical pixels. + pub height_px: u32, + /// Physical-pixels-per-logical-point along X. + pub scale_x: f64, + /// Physical-pixels-per-logical-point along Y. + pub scale_y: f64, + /// Logical origin X of the display (points). + pub origin_x: f64, + /// Logical origin Y of the display (points). + pub origin_y: f64, +} + +/// Capture the primary display for JS callers (macOS). +/// +/// Requires the Screen Recording permission. This is the read-only `screenshot` +/// primitive of the computer-use tool; input primitives land behind the same +/// surface once the Accessibility gate is satisfied in a granted `gjc` process. +/// +/// # Errors +/// Returns an error when capture fails (e.g. Screen Recording not granted). +#[napi(js_name = "computerScreenshot")] +pub fn computer_screenshot() -> napi::Result { + let frame = + capture_primary_display().map_err(|err| napi::Error::from_reason(format!("{err}")))?; + Ok(ComputerScreenshot { + png: Uint8Array::from(frame.png), + width_px: frame.display.width_px, + height_px: frame.display.height_px, + scale_x: frame.display.scale_x, + scale_y: frame.display.scale_y, + origin_x: frame.display.origin_x, + origin_y: frame.display.origin_y, + }) +} diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md index 361f0a9db..6cf7c1813 100644 --- a/docs/computer-use/README.md +++ b/docs/computer-use/README.md @@ -74,9 +74,11 @@ manual macOS E2E). 
|-------|-------|--------| | Coordinate contract + planning docs | `coords` module + unit tests + this doc | **done (this PR)** | | Native screen capture (`screenshot`) | `capture` module, primary display, PNG + scale | **done (this PR, verified live)** | -| Native input backend | `input`, `permissions`, `execute_action` (click/type/drag/scroll/keypress) | planned | +| TCC preflight (`permissions`) | Accessibility + Screen Recording checks, Settings openers, fail-closed guards | **done (this PR, verified live)** | +| napi screenshot binding (`computerScreenshot`) | napi → `packages/natives` → TS, verified live | **done (this PR)** | +| Native input backend | `input`, `execute_action` (click/type/drag/scroll/keypress) | planned (gated on Accessibility in a granted gjc process) | | Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | planned | -| napi bindings + TS `computer` tool surface | `ComputerController`, schema, gating, prompt, renderer | planned | +| TS `computer` tool surface | full `ComputerController` + `computer.ts` schema/gating/prompt/renderer | planned | | Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | The remaining input backend, kill-switch, napi/TS surface, and manual diff --git a/packages/natives/native/index.d.ts b/packages/natives/native/index.d.ts index 7734bbe61..9647f3695 100644 --- a/packages/natives/native/index.d.ts +++ b/packages/natives/native/index.d.ts @@ -357,6 +357,40 @@ export interface ClipboardImage { mimeType: string } +/** + * Capture the primary display for JS callers (macOS). + * + * Requires the Screen Recording permission. This is the read-only `screenshot` + * primitive of the computer-use tool; input primitives land behind the same + * surface once the Accessibility gate is satisfied in a granted `gjc` process. + * + * # Errors + * Returns an error when capture fails (e.g. 
Screen Recording not granted). + */ +export declare function computerScreenshot(): ComputerScreenshot + +/** + * A captured primary-display screenshot returned to JS. `width_px`/`height_px` + * are the physical pixels that define the action coordinate space (see the + * coordinate contract); the scale/origin map them to macOS logical points. + */ +export interface ComputerScreenshot { + /** PNG-encoded image bytes. */ + png: Uint8Array + /** Screenshot width in physical pixels. */ + widthPx: number + /** Screenshot height in physical pixels. */ + heightPx: number + /** Physical-pixels-per-logical-point along X. */ + scaleX: number + /** Physical-pixels-per-logical-point along Y. */ + scaleY: number + /** Logical origin X of the display (points). */ + originX: number + /** Logical origin Y of the display (points). */ + originY: number +} + /** A context line (before or after a match). */ export interface ContextLine { /** 1-indexed line number in the source file. */ diff --git a/packages/natives/native/index.js b/packages/natives/native/index.js index 0393bf787..0e744d4e6 100644 --- a/packages/natives/native/index.js +++ b/packages/natives/native/index.js @@ -28,6 +28,7 @@ export const __piNativesV0_5_1 = nativeBindings.__piNativesV0_5_1; export const applyBashFixups = nativeBindings.applyBashFixups; export const astEdit = nativeBindings.astEdit; export const astGrep = nativeBindings.astGrep; +export const computerScreenshot = nativeBindings.computerScreenshot; export const copyToClipboard = nativeBindings.copyToClipboard; export const countTokens = nativeBindings.countTokens; export const detectMacOSAppearance = nativeBindings.detectMacOSAppearance; diff --git a/packages/natives/test/computer.test.ts b/packages/natives/test/computer.test.ts new file mode 100644 index 000000000..d8e4acda8 --- /dev/null +++ b/packages/natives/test/computer.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from "bun:test"; +import { computerScreenshot } from 
"../native/index.js"; + +const isMacOS = process.platform === "darwin"; + +// The native `computerScreenshot` binding is macOS-only and captures the real +// primary display, so it requires the Screen Recording permission. Gate on +// platform and skip gracefully when capture is unavailable in the environment. +describe.if(isMacOS)("computer screenshot napi binding", () => { + it("returns a decodable PNG whose dimensions match the descriptor", () => { + let shot: ReturnType; + try { + shot = computerScreenshot(); + } catch (err) { + // Screen Recording not granted to this process — surfaced, not silent. + console.warn(`skipping: computerScreenshot unavailable (${String(err)})`); + return; + } + + expect(shot.widthPx).toBeGreaterThan(0); + expect(shot.heightPx).toBeGreaterThan(0); + expect(shot.scaleX).toBeGreaterThan(0); + expect(shot.scaleY).toBeGreaterThan(0); + expect(shot.png.byteLength).toBeGreaterThan(0); + + // PNG magic number: 89 50 4E 47 0D 0A 1A 0A. + const sig = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]; + for (let i = 0; i < sig.length; i++) { + expect(shot.png[i]).toBe(sig[i]); + } + }); +}); From 2118e09150aae4f026a925cf64c01245bbfdde1e Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:03:25 +0900 Subject: [PATCH 05/23] feat(pi-natives): add gated native input orchestration (computer-use) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds crates/pi-natives/src/computer/input.rs: InputController orchestrates click/double_click/move/drag/scroll/type/keypress over an EventSink trait, tracking held buttons so release_all cleans up after abort/error. Coords flow through coords.rs; out-of-bounds is rejected and the drag error path releases the held button. 9 unit tests drive a RecordingSink to verify exact event sequences (no real OS events). 
The real CGEvent-backed MacEventSink is constructed only via guarded_controller(), which require_accessibility_for_input() gates — so input physically cannot fire while Accessibility is ungranted. Not yet napi/model-exposed (per the plan, input ships only after the kill-switch is proven). Live OS-event behavior is verified in a granted gjc session; the orchestration logic is fully unit-tested here. --- crates/pi-natives/src/computer/input.rs | 635 ++++++++++++++++++++++++ crates/pi-natives/src/computer/mod.rs | 2 + 2 files changed, 637 insertions(+) create mode 100644 crates/pi-natives/src/computer/input.rs diff --git a/crates/pi-natives/src/computer/input.rs b/crates/pi-natives/src/computer/input.rs new file mode 100644 index 000000000..8bdf9af19 --- /dev/null +++ b/crates/pi-natives/src/computer/input.rs @@ -0,0 +1,635 @@ +//! macOS native input injection for computer-use. +//! +//! # Safety model +//! Input is **runtime-gated**: [`InputController::guarded`] refuses to +//! construct unless Accessibility is granted (see [`super::permissions`]), so +//! no event can be posted while the TCC gate is closed. This module is also +//! **not** wired to napi or the model surface yet — per the approved plan, +//! input is exposed only after the kill-switch supervisor is proven live. +//! +//! # Testability +//! All event *orchestration* (action → low-level event sequence, held +//! button/modifier tracking, coordinate transforms, release-all cleanup) lives +//! in [`InputController`] over an [`EventSink`] trait. Unit tests drive a +//! [`RecordingSink`] to assert exact sequences without posting real OS events. +//! Only [`MacEventSink`] performs `CGEvent` FFI; its live behavior is verified +//! in a granted `gjc` session, not from a non-TCC-trusted test binary. + +use super::coords::{CoordError, LogicalPoint, NormalizedDisplay}; + +/// A mouse button. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MouseButton { + /// Primary (left) button. 
+ Left, + /// Secondary (right) button. + Right, + /// Tertiary (middle) button. + Center, +} + +/// One low-level event recorded by [`RecordingSink`] for tests. +#[derive(Debug, Clone, PartialEq)] +pub enum SinkOp { + /// Move the cursor to a logical point. + Move(LogicalPoint), + /// Press or release `button` at a logical point. + Button { at: LogicalPoint, button: MouseButton, down: bool }, + /// Scroll by logical deltas (`dx`, `dy`). + Scroll { dx: f64, dy: f64 }, + /// Type a unicode string. + TypeUnicode(String), + /// Press or release a virtual key code. + Key { code: u16, down: bool }, +} + +/// Sink for low-level input events. The real implementation posts `CGEvent`s; +/// the test implementation records them. +pub trait EventSink { + /// Move the cursor. + fn move_cursor(&mut self, to: LogicalPoint); + /// Press or release a mouse button at a point. + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool); + /// Scroll by logical deltas. + fn scroll(&mut self, dx: f64, dy: f64); + /// Type a unicode string. + fn type_unicode(&mut self, text: &str); + /// Press or release a virtual key code. + fn key(&mut self, code: u16, down: bool); +} + +/// Error from an input action. +#[derive(Debug, Clone, PartialEq)] +pub enum InputError { + /// A coordinate could not be mapped to a logical point. + Coord(CoordError), + /// A key name was not recognized. + UnknownKey(String), +} + +impl From for InputError { + fn from(value: CoordError) -> Self { + Self::Coord(value) + } +} + +impl std::fmt::Display for InputError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Coord(err) => write!(f, "{err}"), + Self::UnknownKey(key) => write!(f, "unknown key name: {key}"), + } + } +} + +impl std::error::Error for InputError {} + +/// Resolve a named key (or single character) to a macOS virtual key code. +/// Returns `None` for unrecognized names. 
+#[must_use] +pub fn key_code_for(name: &str) -> Option { + let code = match name.to_ascii_lowercase().as_str() { + "return" | "enter" => 36, + "tab" => 48, + "space" => 49, + "delete" | "backspace" => 51, + "escape" | "esc" => 53, + "left" | "arrowleft" => 123, + "right" | "arrowright" => 124, + "down" | "arrowdown" => 125, + "up" | "arrowup" => 126, + _ => return None, + }; + Some(code) +} + +/// Orchestrates input actions over an [`EventSink`], tracking held buttons so +/// [`InputController::release_all`] can clean up after an abort or error. +pub struct InputController { + sink: S, + cursor: LogicalPoint, + held_buttons: Vec, +} + +impl InputController { + /// Construct a controller over `sink`. Prefer [`InputController::guarded`] + /// for any path that posts real events. + pub const fn new(sink: S) -> Self { + Self { sink, cursor: LogicalPoint { x: 0.0, y: 0.0 }, held_buttons: Vec::new() } + } + + /// The most recent cursor position. + #[must_use] + pub const fn cursor(&self) -> LogicalPoint { + self.cursor + } + + /// Whether any mouse button is currently held. + #[must_use] + pub const fn has_held_buttons(&self) -> bool { + !self.held_buttons.is_empty() + } + + fn press(&mut self, at: LogicalPoint, button: MouseButton) { + self.sink.mouse_button(at, button, true); + if !self.held_buttons.contains(&button) { + self.held_buttons.push(button); + } + } + + fn release(&mut self, at: LogicalPoint, button: MouseButton) { + self.sink.mouse_button(at, button, false); + self.held_buttons.retain(|held| *held != button); + } + + /// Move the cursor to a screenshot-space pixel on `display`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn move_to( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + ) -> Result<(), InputError> { + let point = display.to_logical_point(x, y)?; + self.cursor = point; + self.sink.move_cursor(point); + Ok(()) + } + + /// Move to `(x, y)` and click `button`. 
+ /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn click( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + button: MouseButton, + ) -> Result<(), InputError> { + self.move_to(display, x, y)?; + let at = self.cursor; + self.press(at, button); + self.release(at, button); + Ok(()) + } + + /// Double-click `button` at `(x, y)`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn double_click( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + button: MouseButton, + ) -> Result<(), InputError> { + self.click(display, x, y, button)?; + let at = self.cursor; + self.press(at, button); + self.release(at, button); + Ok(()) + } + + /// Press at `(from_x, from_y)`, drag to `(to_x, to_y)`, and release. + /// Releases the button on the error path so no button is left held. + /// + /// # Errors + /// Returns [`InputError::Coord`] when either pixel is out of bounds. + pub fn drag( + &mut self, + display: &NormalizedDisplay, + from_x: f64, + from_y: f64, + to_x: f64, + to_y: f64, + button: MouseButton, + ) -> Result<(), InputError> { + self.move_to(display, from_x, from_y)?; + let start = self.cursor; + self.press(start, button); + match display.to_logical_point(to_x, to_y) { + Ok(end) => { + self.cursor = end; + self.sink.move_cursor(end); + self.release(end, button); + Ok(()) + }, + Err(err) => { + // Out-of-bounds destination: release the held button before erroring. + self.release(start, button); + Err(InputError::Coord(err)) + }, + } + } + + /// Scroll by logical deltas after moving to `(x, y)`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn scroll( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + dx: f64, + dy: f64, + ) -> Result<(), InputError> { + self.move_to(display, x, y)?; + self.sink.scroll(dx, dy); + Ok(()) + } + + /// Type a unicode string. 
+ pub fn type_text(&mut self, text: &str) { + self.sink.type_unicode(text); + } + + /// Press and release each named key in order. + /// + /// # Errors + /// Returns [`InputError::UnknownKey`] when a name is unrecognized; keys + /// before the failure have already been sent. + pub fn keypress(&mut self, keys: &[String]) -> Result<(), InputError> { + for name in keys { + let code = key_code_for(name).ok_or_else(|| InputError::UnknownKey(name.clone()))?; + self.sink.key(code, true); + self.sink.key(code, false); + } + Ok(()) + } + + /// Release every held mouse button (idempotent). Run on abort/error paths + /// so a partial drag never leaves a button stuck. + pub fn release_all(&mut self) { + let at = self.cursor; + let held: Vec = self.held_buttons.drain(..).collect(); + for button in held { + self.sink.mouse_button(at, button, false); + } + } +} + +#[cfg(target_os = "macos")] +pub use mac::{MacEventSink, guarded_controller}; + +#[cfg(target_os = "macos")] +mod mac { + //! Real CGEvent-backed [`EventSink`] (macOS). Live behavior is verified in a + //! granted `gjc` session; construction is gated on Accessibility. + + use std::ffi::c_void; + + use super::{EventSink, InputController, MouseButton}; + use crate::computer::{ + coords::LogicalPoint, + permissions::{PermissionError, require_accessibility_for_input}, + }; + + #[repr(C)] + #[derive(Clone, Copy)] + struct CgPoint { + x: f64, + y: f64, + } + + type CgEventSourceRef = *mut c_void; + type CgEventRef = *mut c_void; + + // CGEventType values. + const LEFT_DOWN: u32 = 1; + const LEFT_UP: u32 = 2; + const RIGHT_DOWN: u32 = 3; + const RIGHT_UP: u32 = 4; + const MOUSE_MOVED: u32 = 5; + const OTHER_DOWN: u32 = 25; + const OTHER_UP: u32 = 26; + + // CGMouseButton values. + const BTN_LEFT: u32 = 0; + const BTN_RIGHT: u32 = 1; + const BTN_CENTER: u32 = 2; + + // kCGEventSourceStateCombinedSessionState / kCGHIDEventTap. + const SOURCE_COMBINED_SESSION: u32 = 0; + const HID_EVENT_TAP: u32 = 0; + // kCGScrollEventUnitPixel. 
+ const SCROLL_UNIT_PIXEL: u32 = 0; + + #[link(name = "CoreGraphics", kind = "framework")] + unsafe extern "C" { + fn CGEventSourceCreate(state_id: u32) -> CgEventSourceRef; + fn CGEventCreateMouseEvent( + source: CgEventSourceRef, + mouse_type: u32, + position: CgPoint, + button: u32, + ) -> CgEventRef; + fn CGEventCreateScrollWheelEvent( + source: CgEventSourceRef, + units: u32, + wheel_count: u32, + wheel1: i32, + wheel2: i32, + ) -> CgEventRef; + fn CGEventCreateKeyboardEvent( + source: CgEventSourceRef, + keycode: u16, + key_down: bool, + ) -> CgEventRef; + fn CGEventKeyboardSetUnicodeString(event: CgEventRef, length: usize, string: *const u16); + fn CGEventPost(tap: u32, event: CgEventRef); + fn CFRelease(cf: *const c_void); + } + + const fn button_codes(button: MouseButton, down: bool) -> (u32, u32) { + match button { + MouseButton::Left => (if down { LEFT_DOWN } else { LEFT_UP }, BTN_LEFT), + MouseButton::Right => (if down { RIGHT_DOWN } else { RIGHT_UP }, BTN_RIGHT), + MouseButton::Center => (if down { OTHER_DOWN } else { OTHER_UP }, BTN_CENTER), + } + } + + /// CGEvent-backed sink. Holds an event source for the session. + pub struct MacEventSink { + source: CgEventSourceRef, + } + + impl MacEventSink { + fn new() -> Self { + // SAFETY: `CGEventSourceCreate` returns an owned source (or null, + // which CGEvent creation tolerates); released on drop. + let source = unsafe { CGEventSourceCreate(SOURCE_COMBINED_SESSION) }; + Self { source } + } + + fn post_mouse(&self, at: LogicalPoint, event_type: u32, button: u32) { + let position = CgPoint { x: at.x, y: at.y }; + // SAFETY: `source` is the owned event source; the created event is + // posted and released exactly once. 
+ unsafe { + let event = CGEventCreateMouseEvent(self.source, event_type, position, button); + if !event.is_null() { + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + } + + impl Drop for MacEventSink { + fn drop(&mut self) { + if !self.source.is_null() { + // SAFETY: `source` is owned, non-null, and not used after release. + unsafe { CFRelease(self.source.cast_const()) }; + } + } + } + + impl EventSink for MacEventSink { + fn move_cursor(&mut self, to: LogicalPoint) { + self.post_mouse(to, MOUSE_MOVED, BTN_LEFT); + } + + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool) { + let (event_type, code) = button_codes(button, down); + self.post_mouse(at, event_type, code); + } + + fn scroll(&mut self, dx: f64, dy: f64) { + // SAFETY: created scroll event is posted and released exactly once. + unsafe { + let event = CGEventCreateScrollWheelEvent( + self.source, + SCROLL_UNIT_PIXEL, + 2, + dy as i32, + dx as i32, + ); + if !event.is_null() { + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + + fn type_unicode(&mut self, text: &str) { + let utf16: Vec = text.encode_utf16().collect(); + // SAFETY: down/up keyboard events are created, populated with the + // UTF-16 buffer (valid for the call), posted, and released once each. + unsafe { + for down in [true, false] { + let event = CGEventCreateKeyboardEvent(self.source, 0, down); + if event.is_null() { + continue; + } + CGEventKeyboardSetUnicodeString(event, utf16.len(), utf16.as_ptr()); + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + + fn key(&mut self, code: u16, down: bool) { + // SAFETY: created keyboard event is posted and released exactly once. 
+ unsafe { + let event = CGEventCreateKeyboardEvent(self.source, code, down); + if !event.is_null() { + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + } + + /// Construct an [`InputController`] backed by real `CGEvent`s — only when + /// Accessibility is granted. + /// + /// # Errors + /// Returns [`PermissionError`] when Accessibility is not granted; no event + /// source is created and no input can be posted. + pub fn guarded_controller() -> Result, PermissionError> { + require_accessibility_for_input()?; + Ok(InputController::new(MacEventSink::new())) + } +} + +#[cfg(test)] +mod tests { + use super::{EventSink, InputController, InputError, MouseButton, SinkOp, key_code_for}; + use crate::computer::coords::{LogicalPoint, NormalizedDisplay}; + + #[derive(Default)] + struct RecordingSink { + ops: Vec, + } + + impl EventSink for RecordingSink { + fn move_cursor(&mut self, to: LogicalPoint) { + self.ops.push(SinkOp::Move(to)); + } + + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool) { + self.ops.push(SinkOp::Button { at, button, down }); + } + + fn scroll(&mut self, dx: f64, dy: f64) { + self.ops.push(SinkOp::Scroll { dx, dy }); + } + + fn type_unicode(&mut self, text: &str) { + self.ops.push(SinkOp::TypeUnicode(text.to_string())); + } + + fn key(&mut self, code: u16, down: bool) { + self.ops.push(SinkOp::Key { code, down }); + } + } + + fn display() -> NormalizedDisplay { + // 200x100 physical px at 2x => clicks map to logical /2. 
+ NormalizedDisplay::new(200, 100, 2.0, 2.0, 0.0, 0.0) + } + + #[test] + fn click_moves_then_presses_and_releases_at_logical_point() { + let mut c = InputController::new(RecordingSink::default()); + c.click(&display(), 100.0, 50.0, MouseButton::Left).unwrap(); + let at = LogicalPoint { x: 50.0, y: 25.0 }; + assert_eq!(c.into_ops(), vec![ + SinkOp::Move(at), + SinkOp::Button { at, button: MouseButton::Left, down: true }, + SinkOp::Button { at, button: MouseButton::Left, down: false }, + ]); + } + + #[test] + fn double_click_emits_two_press_release_pairs() { + let mut c = InputController::new(RecordingSink::default()); + c.double_click(&display(), 10.0, 10.0, MouseButton::Left) + .unwrap(); + let downs = c + .ops_ref() + .iter() + .filter(|op| matches!(op, SinkOp::Button { down: true, .. })) + .count(); + let ups = c + .ops_ref() + .iter() + .filter(|op| matches!(op, SinkOp::Button { down: false, .. })) + .count(); + assert_eq!((downs, ups), (2, 2)); + assert!(!c.has_held_buttons()); + } + + #[test] + fn drag_releases_button_and_leaves_none_held() { + let mut c = InputController::new(RecordingSink::default()); + c.drag(&display(), 0.0, 0.0, 100.0, 50.0, MouseButton::Left) + .unwrap(); + assert!(!c.has_held_buttons()); + let ops = c.into_ops(); + assert_eq!(ops.first(), Some(&SinkOp::Move(LogicalPoint { x: 0.0, y: 0.0 }))); + assert_eq!( + ops.last(), + Some(&SinkOp::Button { + at: LogicalPoint { x: 50.0, y: 25.0 }, + button: MouseButton::Left, + down: false, + }) + ); + } + + #[test] + fn drag_to_out_of_bounds_releases_the_held_button() { + let mut c = InputController::new(RecordingSink::default()); + let err = c + .drag(&display(), 0.0, 0.0, 999.0, 0.0, MouseButton::Left) + .unwrap_err(); + assert!(matches!(err, InputError::Coord(_))); + // Button was pressed then released on the error path; none left held. + assert!(!c.has_held_buttons()); + let releases = c + .ops_ref() + .iter() + .filter(|op| matches!(op, SinkOp::Button { down: false, .. 
})) + .count(); + assert_eq!(releases, 1); + } + + #[test] + fn release_all_releases_a_stuck_button() { + let mut c = InputController::new(RecordingSink::default()); + // Press without releasing by starting a drag whose destination is invalid + // is covered above; here force a held state via a press through click then + // simulate a held button using a manual press path. + c.move_to(&display(), 10.0, 10.0).unwrap(); + c.press_for_test(MouseButton::Left); + assert!(c.has_held_buttons()); + c.release_all(); + assert!(!c.has_held_buttons()); + assert!(matches!(c.ops_ref().last(), Some(SinkOp::Button { down: false, .. }))); + // release_all is idempotent. + c.release_all(); + assert!(!c.has_held_buttons()); + } + + #[test] + fn move_out_of_bounds_errors_without_emitting_move() { + let mut c = InputController::new(RecordingSink::default()); + let err = c.move_to(&display(), 200.0, 0.0).unwrap_err(); + assert!(matches!(err, InputError::Coord(_))); + assert!(c.ops_ref().is_empty()); + } + + #[test] + fn keypress_maps_names_and_rejects_unknown() { + let mut c = InputController::new(RecordingSink::default()); + c.keypress(&["enter".to_string(), "tab".to_string()]) + .unwrap(); + assert_eq!(c.ops_ref(), &[ + SinkOp::Key { code: 36, down: true }, + SinkOp::Key { code: 36, down: false }, + SinkOp::Key { code: 48, down: true }, + SinkOp::Key { code: 48, down: false }, + ]); + let err = c + .keypress(&["definitely-not-a-key".to_string()]) + .unwrap_err(); + assert!(matches!(err, InputError::UnknownKey(_))); + } + + #[test] + fn type_text_forwards_unicode() { + let mut c = InputController::new(RecordingSink::default()); + c.type_text("héllo"); + assert_eq!(c.into_ops(), vec![SinkOp::TypeUnicode("héllo".to_string())]); + } + + #[test] + fn key_code_table_covers_common_names() { + assert_eq!(key_code_for("Return"), Some(36)); + assert_eq!(key_code_for("ESC"), Some(53)); + assert_eq!(key_code_for("up"), Some(126)); + assert_eq!(key_code_for("nope"), None); + } + + // Test-only 
helpers on the controller. + impl InputController { + fn into_ops(self) -> Vec { + self.sink.ops + } + + fn ops_ref(&self) -> &[SinkOp] { + &self.sink.ops + } + + fn press_for_test(&mut self, button: MouseButton) { + let at = self.cursor(); + self.press(at, button); + } + } +} diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index ae098233d..9eee71af2 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -24,11 +24,13 @@ #[cfg(target_os = "macos")] pub mod capture; pub mod coords; +pub mod input; #[cfg(target_os = "macos")] pub mod permissions; #[cfg(target_os = "macos")] pub use capture::{CaptureError, CapturedFrame, capture_primary_display}; pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; +pub use input::{EventSink, InputController, InputError, MouseButton}; #[cfg(target_os = "macos")] pub use permissions::{PermissionError, PreflightStatus, TccPermission, preflight}; From 86c8e963b51a2ee2e9fcf79b14dd9bdac23cc13c Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:03:59 +0900 Subject: [PATCH 06/23] docs(computer-use): mark input orchestration logic-done (firing gated) --- docs/computer-use/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md index 6cf7c1813..7358e040d 100644 --- a/docs/computer-use/README.md +++ b/docs/computer-use/README.md @@ -76,7 +76,8 @@ manual macOS E2E). 
| Native screen capture (`screenshot`) | `capture` module, primary display, PNG + scale | **done (this PR, verified live)** | | TCC preflight (`permissions`) | Accessibility + Screen Recording checks, Settings openers, fail-closed guards | **done (this PR, verified live)** | | napi screenshot binding (`computerScreenshot`) | napi → `packages/natives` → TS, verified live | **done (this PR)** | -| Native input backend | `input`, `execute_action` (click/type/drag/scroll/keypress) | planned (gated on Accessibility in a granted gjc process) | +| Native input orchestration (`input`) | `InputController` click/double_click/move/drag/scroll/type/keypress + release_all over an `EventSink` | **logic done (this PR, unit-verified)** — real `CGEvent` firing gated on Accessibility in a granted gjc process | +| Central `execute_action` state machine | preflight + supervisor + cancellation + audit + release-all | planned | | Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | planned | | TS `computer` tool surface | full `ComputerController` + `computer.ts` schema/gating/prompt/renderer | planned | | Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | From 4454776835eddea618c53df2143e9a960c445b91 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:21:45 +0900 Subject: [PATCH 07/23] feat(pi-natives): verify live input injection (cursor move) + warp fix Adds current_cursor_position() (CGEventGetLocation) and a guarded live test that moves the cursor to the display center and reads it back. First run revealed a bare kCGEventMouseMoved event does not relocate the cursor; fixed move_cursor to use CGWarpMouseCursorPosition (then post the moved event for hover). Live test now passes within 2 logical points with Accessibility granted to the host app (cmux). 
This proves the CGEvent input pipeline end-to-end on the real desktop: coords.rs transform -> warp -> read-back. Input is still gated (guarded_controller requires Accessibility) and not yet model-exposed; click/type await the kill-switch per the plan. --- crates/pi-natives/src/computer/input.rs | 73 ++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/crates/pi-natives/src/computer/input.rs b/crates/pi-natives/src/computer/input.rs index 8bdf9af19..924e52389 100644 --- a/crates/pi-natives/src/computer/input.rs +++ b/crates/pi-natives/src/computer/input.rs @@ -274,7 +274,7 @@ impl InputController { } #[cfg(target_os = "macos")] -pub use mac::{MacEventSink, guarded_controller}; +pub use mac::{MacEventSink, current_cursor_position, guarded_controller}; #[cfg(target_os = "macos")] mod mac { @@ -342,6 +342,9 @@ mod mac { ) -> CgEventRef; fn CGEventKeyboardSetUnicodeString(event: CgEventRef, length: usize, string: *const u16); fn CGEventPost(tap: u32, event: CgEventRef); + fn CGEventCreate(source: CgEventSourceRef) -> CgEventRef; + fn CGEventGetLocation(event: CgEventRef) -> CgPoint; + fn CGWarpMouseCursorPosition(new_cursor_position: CgPoint) -> i32; fn CFRelease(cf: *const c_void); } @@ -391,6 +394,12 @@ mod mac { impl EventSink for MacEventSink { fn move_cursor(&mut self, to: LogicalPoint) { + // `CGWarpMouseCursorPosition` reliably relocates the hardware cursor + // (a bare mouseMoved event does not); the moved event then notifies + // apps of the hover at the new point. + let position = CgPoint { x: to.x, y: to.y }; + // SAFETY: pure Core Graphics cursor warp to a point; no ownership. + unsafe { CGWarpMouseCursorPosition(position) }; self.post_mouse(to, MOUSE_MOVED, BTN_LEFT); } @@ -455,6 +464,23 @@ mod mac { require_accessibility_for_input()?; Ok(InputController::new(MacEventSink::new())) } + + /// Read the current global cursor position in logical points (top-left + /// origin). 
Used to verify mouse-move injection without clicking. + #[must_use] + pub fn current_cursor_position() -> LogicalPoint { + // SAFETY: `CGEventCreate(null)` returns an event whose location is the + // current cursor; it is released after the read. + unsafe { + let event = CGEventCreate(std::ptr::null_mut()); + if event.is_null() { + return LogicalPoint { x: 0.0, y: 0.0 }; + } + let location = CGEventGetLocation(event); + CFRelease(event.cast_const()); + LogicalPoint { x: location.x, y: location.y } + } + } } #[cfg(test)] @@ -633,3 +659,48 @@ mod tests { } } } + +#[cfg(all(test, target_os = "macos"))] +mod live_tests { + use super::{MouseButton, current_cursor_position, guarded_controller}; + use crate::computer::capture::capture_primary_display; + + /// Fires a real cursor move (no clicks/keys) and reads the position back to + /// prove the CGEvent input pipeline works end to end. Ignored by default; + /// run with `--ignored` on a macOS host with Accessibility granted. + #[test] + #[ignore = "moves the real cursor; needs macOS + Accessibility granted"] + fn cursor_move_lands_near_target() { + let frame = capture_primary_display().expect("capture (Screen Recording) should be granted"); + let display = frame.display; + let Ok(mut controller) = guarded_controller() else { + panic!("Accessibility must be granted for input injection"); + }; + + // Target the display center — a safe interior point, well away from edges. 
+ let target_px = f64::from(display.width_px) / 2.0; + let target_py = f64::from(display.height_px) / 2.0; + controller + .move_to(&display, target_px, target_py) + .expect("move_to should succeed"); + + let expected = display + .to_logical_point(target_px, target_py) + .expect("center is in bounds"); + let pos = current_cursor_position(); + let dx = (pos.x - expected.x).abs(); + let dy = (pos.y - expected.y).abs(); + assert!( + dx <= 2.0 && dy <= 2.0, + "cursor landed at ({}, {}), expected ~({}, {})", + pos.x, + pos.y, + expected.x, + expected.y + ); + assert_eq!(controller.cursor(), expected); + // We only moved the cursor; nothing should be held. + assert!(!controller.has_held_buttons()); + let _ = MouseButton::Left; // keep the import meaningful for future click tests + } +} From 4ec50c7b1b5910b2f4267697e45b4cf52016df67 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:22:34 +0900 Subject: [PATCH 08/23] docs(computer-use): mark live input injection verified; kill-switch next --- docs/computer-use/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md index 7358e040d..86dde2469 100644 --- a/docs/computer-use/README.md +++ b/docs/computer-use/README.md @@ -76,9 +76,9 @@ manual macOS E2E). 
| Native screen capture (`screenshot`) | `capture` module, primary display, PNG + scale | **done (this PR, verified live)** | | TCC preflight (`permissions`) | Accessibility + Screen Recording checks, Settings openers, fail-closed guards | **done (this PR, verified live)** | | napi screenshot binding (`computerScreenshot`) | napi → `packages/natives` → TS, verified live | **done (this PR)** | -| Native input orchestration (`input`) | `InputController` click/double_click/move/drag/scroll/type/keypress + release_all over an `EventSink` | **logic done (this PR, unit-verified)** — real `CGEvent` firing gated on Accessibility in a granted gjc process | +| Native input orchestration (`input`) | `InputController` click/double_click/move/drag/scroll/type/keypress + release_all over an `EventSink` | **done (this PR)** — logic unit-tested; **live cursor-move injection verified** (Accessibility granted) | | Central `execute_action` state machine | preflight + supervisor + cancellation + audit + release-all | planned | -| Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | planned | +| Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | next — required before click/type are exposed | | TS `computer` tool surface | full `ComputerController` + `computer.ts` schema/gating/prompt/renderer | planned | | Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | From ff81b21c1b1def989052e59b452be3b349dc2be3 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:26:42 +0900 Subject: [PATCH 09/23] feat(pi-natives): add kill-switch supervisor safety state machine Adds crates/pi-natives/src/computer/supervisor.rs: process-global Supervisor with fail-closed input_allowed (requires hotkey_live + fresh heartbeat + not suspended), trigger_stop latch, and user-only reset. 
5 unit tests cover the gating truth table (not-live, live+fresh, stale heartbeat, stop-latch-until-reset, lost liveness). Pure atomics so the safety logic is deterministic; the OS hotkey listener drives it next. --- crates/pi-natives/src/computer/mod.rs | 2 + crates/pi-natives/src/computer/supervisor.rs | 205 +++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 crates/pi-natives/src/computer/supervisor.rs diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index 9eee71af2..db9c734e9 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -27,6 +27,7 @@ pub mod coords; pub mod input; #[cfg(target_os = "macos")] pub mod permissions; +pub mod supervisor; #[cfg(target_os = "macos")] pub use capture::{CaptureError, CapturedFrame, capture_primary_display}; @@ -34,3 +35,4 @@ pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; pub use input::{EventSink, InputController, InputError, MouseButton}; #[cfg(target_os = "macos")] pub use permissions::{PermissionError, PreflightStatus, TccPermission, preflight}; +pub use supervisor::{Supervisor, SupervisorStatus}; diff --git a/crates/pi-natives/src/computer/supervisor.rs b/crates/pi-natives/src/computer/supervisor.rs new file mode 100644 index 000000000..116e3d095 --- /dev/null +++ b/crates/pi-natives/src/computer/supervisor.rs @@ -0,0 +1,205 @@ +//! Process-global kill-switch supervisor for computer-use. +//! +//! # Role +//! The supervisor is the safety authority for autonomous input: side-effecting +//! actions may fire only while [`Supervisor::input_allowed`] holds, and a stop +//! (global hotkey or TUI key) latches [`Supervisor::is_suspended`] until a +//! **user-only** [`Supervisor::reset`]. The model-facing surface can never +//! reset suspension. +//! +//! `input_allowed` is fail-closed: it requires the stop path to be live +//! (`hotkey_live`), a fresh heartbeat from that stop path, and a non-suspended +//! 
state. If the hotkey listener dies (heartbeat goes stale or liveness drops), +//! input is disabled automatically. +//! +//! This module is pure state (atomics + timestamps) so the safety logic is +//! unit-tested deterministically without OS event taps; the OS hotkey listener +//! (a CFRunLoop CGEventTap) drives `set_hotkey_live`/`heartbeat`/`trigger_stop` +//! and is verified separately. + +use std::{ + sync::{ + OnceLock, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, + time::{SystemTime, UNIX_EPOCH}, +}; + +/// Max age of the stop-path heartbeat before input is disabled (ms). +pub const HEARTBEAT_FRESH_MS: u64 = 2_000; + +/// Snapshot of supervisor state used for gating and status reporting. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SupervisorStatus { + /// Input is latched off until a user-only reset. + pub suspended: bool, + /// The global stop path (hotkey/event-tap) reports itself live. + pub hotkey_live: bool, + /// The stop path's heartbeat is within [`HEARTBEAT_FRESH_MS`]. + pub heartbeat_fresh: bool, +} + +impl SupervisorStatus { + /// Whether side-effecting input may fire: live, fresh, and not suspended. + #[must_use] + pub const fn input_allowed(self) -> bool { + self.hotkey_live && self.heartbeat_fresh && !self.suspended + } +} + +/// Process-global kill-switch state. +pub struct Supervisor { + suspended: AtomicBool, + hotkey_live: AtomicBool, + last_heartbeat_ms: AtomicU64, +} + +fn now_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| u64::try_from(d.as_millis()).unwrap_or(u64::MAX)) + .unwrap_or(0) +} + +impl Supervisor { + /// Construct a fresh supervisor: not suspended, stop path not yet live. + #[must_use] + pub const fn new() -> Self { + Self { + suspended: AtomicBool::new(false), + hotkey_live: AtomicBool::new(false), + last_heartbeat_ms: AtomicU64::new(0), + } + } + + /// The process-global supervisor singleton. 
+ pub fn global() -> &'static Self { + static GLOBAL: OnceLock = OnceLock::new(); + GLOBAL.get_or_init(Supervisor::new) + } + + /// Record that the stop path is live (or not) and refresh its heartbeat. + pub fn set_hotkey_live(&self, live: bool) { + self.hotkey_live.store(live, Ordering::SeqCst); + if live { + self.last_heartbeat_ms.store(now_ms(), Ordering::SeqCst); + } + } + + /// Heartbeat from the live stop path (call on a fixed interval). + pub fn heartbeat(&self) { + self.heartbeat_at(now_ms()); + } + + /// Heartbeat with an explicit timestamp (deterministic in tests). + pub fn heartbeat_at(&self, at_ms: u64) { + self.last_heartbeat_ms.store(at_ms, Ordering::SeqCst); + } + + /// Latch suspension: abort further input until a user-only [`reset`]. + /// Invoked by the global hotkey or TUI stop key. + /// + /// [`reset`]: Supervisor::reset + pub fn trigger_stop(&self) { + self.suspended.store(true, Ordering::SeqCst); + } + + /// Clear suspension. **User-only** — never wire this to the model-facing + /// tool schema or generic tool dispatch. + pub fn reset(&self) { + self.suspended.store(false, Ordering::SeqCst); + } + + /// Whether input is currently latched off. + #[must_use] + pub fn is_suspended(&self) -> bool { + self.suspended.load(Ordering::SeqCst) + } + + /// Status as of `now_ms` (explicit for tests). + #[must_use] + pub fn status_at(&self, now_ms: u64) -> SupervisorStatus { + let last = self.last_heartbeat_ms.load(Ordering::SeqCst); + SupervisorStatus { + suspended: self.suspended.load(Ordering::SeqCst), + hotkey_live: self.hotkey_live.load(Ordering::SeqCst), + heartbeat_fresh: now_ms.saturating_sub(last) <= HEARTBEAT_FRESH_MS, + } + } + + /// Status as of now. + #[must_use] + pub fn status(&self) -> SupervisorStatus { + self.status_at(now_ms()) + } + + /// Whether side-effecting input may fire right now. 
+ #[must_use] + pub fn input_allowed(&self) -> bool { + self.status().input_allowed() + } +} + +impl Default for Supervisor { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::{HEARTBEAT_FRESH_MS, Supervisor}; + + #[test] + fn fresh_supervisor_disallows_input_until_stop_path_is_live() { + let s = Supervisor::new(); + assert!(!s.status_at(1_000).input_allowed(), "not live yet"); + assert!(!s.is_suspended()); + } + + #[test] + fn live_and_fresh_allows_input() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + assert!(s.status_at(10_500).input_allowed()); + } + + #[test] + fn stale_heartbeat_disables_input() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + let stale = 10_000 + HEARTBEAT_FRESH_MS + 1; + assert!(!s.status_at(stale).input_allowed(), "stale heartbeat must fail closed"); + } + + #[test] + fn trigger_stop_latches_until_user_reset() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + assert!(s.status_at(10_100).input_allowed()); + + s.trigger_stop(); + assert!(s.is_suspended()); + // Even with a live, fresh stop path, suspension keeps input off. 
+ s.heartbeat_at(10_200); + assert!(!s.status_at(10_250).input_allowed()); + + s.reset(); + assert!(!s.is_suspended()); + s.heartbeat_at(10_300); + assert!(s.status_at(10_350).input_allowed()); + } + + #[test] + fn losing_hotkey_liveness_disables_input() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + assert!(s.status_at(10_100).input_allowed()); + s.set_hotkey_live(false); + assert!(!s.status_at(10_150).input_allowed(), "dead stop path must fail closed"); + } +} From 98cf12f466abf791e39aa4584c963ec6b2a3ae87 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:40:35 +0900 Subject: [PATCH 10/23] feat(pi-natives): add global kill-switch hotkey listener (verified live) Adds crates/pi-natives/src/computer/hotkey.rs: a listen-only CGEventTap on a dedicated CFRunLoop thread that latches Supervisor::trigger_stop on the configured global hotkey (Ctrl+Opt+Cmd+Escape), marking the supervisor stop path live on tap creation (fails closed otherwise). Independent of the model tool path. Verified live: synthetic_hotkey_triggers_stop posts a synthetic hotkey and observes the supervisor latch suspended end-to-end (Accessibility sufficed for tap creation; no separate Input Monitoring needed). clippy clean, fmt applied; aligned CFRunLoopGetCurrent signature with appearance.rs to avoid the clashing-extern warning. --- crates/pi-natives/src/computer/hotkey.rs | 251 +++++++++++++++++++ crates/pi-natives/src/computer/mod.rs | 2 + crates/pi-natives/src/computer/supervisor.rs | 9 +- 3 files changed, 257 insertions(+), 5 deletions(-) create mode 100644 crates/pi-natives/src/computer/hotkey.rs diff --git a/crates/pi-natives/src/computer/hotkey.rs b/crates/pi-natives/src/computer/hotkey.rs new file mode 100644 index 000000000..81ada58ac --- /dev/null +++ b/crates/pi-natives/src/computer/hotkey.rs @@ -0,0 +1,251 @@ +//! Global kill-switch hotkey listener (macOS). +//! +//! 
Runs a listen-only `CGEventTap` for key-down events on a dedicated thread +//! that owns its own `CFRunLoop`. When the configured hotkey +//! (Control+Option+Command+Escape by default) is seen, it latches +//! [`Supervisor::trigger_stop`] on the process-global supervisor — independent +//! of the model's tool path, so the model cannot bypass it. +//! +//! The listener marks the supervisor's stop path live on successful tap +//! creation and clears it on teardown, so input gating fails closed if the tap +//! cannot start. Verified by a synthetic-injection self-test (post the hotkey, +//! observe the latch) plus a real key press by a human for the final drill. + +use std::{ + ffi::c_void, + sync::OnceLock, + thread, + time::{Duration, Instant}, +}; + +use super::supervisor::Supervisor; + +type CfMachPortRef = *mut c_void; +type CfRunLoopSourceRef = *mut c_void; +type CfRunLoopRef = *const c_void; +type CfAllocatorRef = *const c_void; +type CfStringRef = *const c_void; +type CgEventRef = *mut c_void; +type CgEventTapProxy = *mut c_void; +type CgEventTapCallBack = extern "C" fn( + proxy: CgEventTapProxy, + event_type: u32, + event: CgEventRef, + user_info: *mut c_void, +) -> CgEventRef; + +// CGEventTap placement/options/location. +const SESSION_EVENT_TAP: u32 = 1; // kCGSessionEventTap +const HEAD_INSERT: u32 = 0; // kCGHeadInsertEventTap +const LISTEN_ONLY: u32 = 1; // kCGEventTapOptionListenOnly +const EVENT_KEY_DOWN: u32 = 10; // kCGEventKeyDown +const KEYCODE_FIELD: u32 = 9; // kCGKeyboardEventKeycode +const KEY_DOWN_MASK: u64 = 1 << EVENT_KEY_DOWN; // CGEventMaskBit(kCGEventKeyDown) + +// Default hotkey: Control+Option+Command+Escape — distinctive, unlikely to +// collide. 
+const HOTKEY_KEYCODE: i64 = 53; // Escape +const FLAG_CONTROL: u64 = 0x0004_0000; +const FLAG_OPTION: u64 = 0x0008_0000; +const FLAG_COMMAND: u64 = 0x0010_0000; +const HOTKEY_MODS: u64 = FLAG_CONTROL | FLAG_OPTION | FLAG_COMMAND; + +#[link(name = "CoreGraphics", kind = "framework")] +unsafe extern "C" { + fn CGEventTapCreate( + tap: u32, + place: u32, + options: u32, + events_of_interest: u64, + callback: CgEventTapCallBack, + user_info: *mut c_void, + ) -> CfMachPortRef; + fn CGEventTapEnable(tap: CfMachPortRef, enable: bool); + fn CGEventGetIntegerValueField(event: CgEventRef, field: u32) -> i64; + fn CGEventGetFlags(event: CgEventRef) -> u64; +} + +#[link(name = "CoreFoundation", kind = "framework")] +unsafe extern "C" { + static kCFRunLoopCommonModes: CfStringRef; + fn CFMachPortCreateRunLoopSource( + allocator: CfAllocatorRef, + port: CfMachPortRef, + order: isize, + ) -> CfRunLoopSourceRef; + fn CFRunLoopGetCurrent() -> CfRunLoopRef; + fn CFRunLoopAddSource(rl: CfRunLoopRef, source: CfRunLoopSourceRef, mode: CfStringRef); + fn CFRunLoopRun(); + fn CFRelease(cf: *const c_void); +} + +const fn matches_hotkey(keycode: i64, flags: u64) -> bool { + keycode == HOTKEY_KEYCODE && (flags & HOTKEY_MODS) == HOTKEY_MODS +} + +extern "C" fn tap_callback( + _proxy: CgEventTapProxy, + event_type: u32, + event: CgEventRef, + _user_info: *mut c_void, +) -> CgEventRef { + if event_type == EVENT_KEY_DOWN && !event.is_null() { + // SAFETY: `event` is a valid key event provided by the tap for the + // duration of this callback; we only read fields and return it unchanged. + let (keycode, flags) = + unsafe { (CGEventGetIntegerValueField(event, KEYCODE_FIELD), CGEventGetFlags(event)) }; + if matches_hotkey(keycode, flags) { + Supervisor::global().trigger_stop(); + } + } + // Listen-only: pass the event through untouched. + event +} + +static STARTED: OnceLock = OnceLock::new(); + +/// Start the global hotkey listener once (idempotent). 
+/// +/// Spawns a dedicated `CFRunLoop` thread; on successful tap creation the +/// supervisor's stop path is marked live. Returns whether the listener is +/// (now) live. +pub fn start() -> bool { + let first = STARTED.set(true).is_ok(); + if first { + thread::Builder::new() + .name("computer-killswitch".into()) + .spawn(run_listener) + .ok(); + } + wait_until_live(Duration::from_secs(1)) +} + +fn run_listener() { + // SAFETY: a listen-only key-down session tap; the returned mach port and + // run-loop source are added to this thread's run loop, which then runs for + // the process lifetime. Handles are released only on the (non-returning) + // teardown path below. + unsafe { + let tap = CGEventTapCreate( + SESSION_EVENT_TAP, + HEAD_INSERT, + LISTEN_ONLY, + KEY_DOWN_MASK, + tap_callback, + std::ptr::null_mut(), + ); + if tap.is_null() { + Supervisor::global().set_hotkey_live(false); + return; + } + let source = CFMachPortCreateRunLoopSource(std::ptr::null(), tap, 0); + if source.is_null() { + CFRelease(tap.cast_const()); + Supervisor::global().set_hotkey_live(false); + return; + } + CFRunLoopAddSource(CFRunLoopGetCurrent(), source, kCFRunLoopCommonModes); + CGEventTapEnable(tap, true); + Supervisor::global().set_hotkey_live(true); + CFRunLoopRun(); + // Unreached in normal operation; if the run loop ever returns, fail closed. 
+ Supervisor::global().set_hotkey_live(false); + CFRelease(source.cast_const()); + CFRelease(tap.cast_const()); + } +} + +fn wait_until_live(timeout: Duration) -> bool { + let deadline = Instant::now() + timeout; + loop { + if Supervisor::global().status().hotkey_live { + return true; + } + if Instant::now() >= deadline { + return false; + } + thread::sleep(Duration::from_millis(20)); + } +} + +#[cfg(test)] +mod tests { + use super::{HOTKEY_KEYCODE, HOTKEY_MODS, matches_hotkey}; + + #[test] + fn matches_only_the_full_hotkey_combo() { + assert!(matches_hotkey(HOTKEY_KEYCODE, HOTKEY_MODS)); + assert!(matches_hotkey(HOTKEY_KEYCODE, HOTKEY_MODS | 0x1)); // extra bits ok + assert!(!matches_hotkey(HOTKEY_KEYCODE, 0)); // no modifiers + assert!(!matches_hotkey(HOTKEY_KEYCODE, 0x0004_0000)); // only control + assert!(!matches_hotkey(0, HOTKEY_MODS)); // wrong key + } +} + +#[cfg(all(test, target_os = "macos"))] +mod live_tests { + use std::{ffi::c_void, thread, time::Duration}; + + use super::{HOTKEY_KEYCODE, HOTKEY_MODS, start}; + use crate::computer::{permissions::accessibility_granted, supervisor::Supervisor}; + + type CgEventSourceRef = *mut c_void; + type CgEventRef = *mut c_void; + + #[link(name = "CoreGraphics", kind = "framework")] + unsafe extern "C" { + fn CGEventSourceCreate(state_id: u32) -> CgEventSourceRef; + fn CGEventCreateKeyboardEvent( + source: CgEventSourceRef, + keycode: u16, + key_down: bool, + ) -> CgEventRef; + fn CGEventSetFlags(event: CgEventRef, flags: u64); + fn CGEventPost(tap: u32, event: CgEventRef); + fn CFRelease(cf: *const c_void); + } + + fn post_hotkey() { + // SAFETY: creates, flags, posts, and releases a synthetic key event. 
+ unsafe { + let source = CGEventSourceCreate(0); + for down in [true, false] { + let event = CGEventCreateKeyboardEvent(source, HOTKEY_KEYCODE as u16, down); + if event.is_null() { + continue; + } + CGEventSetFlags(event, HOTKEY_MODS); + CGEventPost(0, event); + CFRelease(event.cast_const()); + } + if !source.is_null() { + CFRelease(source.cast_const()); + } + } + } + + /// Starts the listener and posts a synthetic hotkey, proving the tap latches + /// the supervisor. Requires Accessibility/Input-Monitoring; ignored by + /// default. + #[test] + #[ignore = "starts a global event tap and posts a synthetic hotkey; needs macOS + grants"] + fn synthetic_hotkey_triggers_stop() { + assert!(accessibility_granted(), "Accessibility must be granted"); + let live = start(); + assert!(live, "hotkey listener should report live (tap created)"); + + Supervisor::global().reset(); + assert!(!Supervisor::global().is_suspended()); + + post_hotkey(); + // Give the tap callback time to fire on its run-loop thread. + for _ in 0..50 { + if Supervisor::global().is_suspended() { + break; + } + thread::sleep(Duration::from_millis(20)); + } + assert!(Supervisor::global().is_suspended(), "synthetic hotkey should latch trigger_stop"); + Supervisor::global().reset(); + } +} diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index db9c734e9..a2724f540 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -24,6 +24,8 @@ #[cfg(target_os = "macos")] pub mod capture; pub mod coords; +#[cfg(target_os = "macos")] +pub mod hotkey; pub mod input; #[cfg(target_os = "macos")] pub mod permissions; diff --git a/crates/pi-natives/src/computer/supervisor.rs b/crates/pi-natives/src/computer/supervisor.rs index 116e3d095..4f4482f8f 100644 --- a/crates/pi-natives/src/computer/supervisor.rs +++ b/crates/pi-natives/src/computer/supervisor.rs @@ -14,8 +14,8 @@ //! //! 
This module is pure state (atomics + timestamps) so the safety logic is //! unit-tested deterministically without OS event taps; the OS hotkey listener -//! (a CFRunLoop CGEventTap) drives `set_hotkey_live`/`heartbeat`/`trigger_stop` -//! and is verified separately. +//! (a `CFRunLoop` `CGEventTap`) drives +//! `set_hotkey_live`/`heartbeat`/`trigger_stop` and is verified separately. use std::{ sync::{ @@ -57,8 +57,7 @@ pub struct Supervisor { fn now_ms() -> u64 { SystemTime::now() .duration_since(UNIX_EPOCH) - .map(|d| u64::try_from(d.as_millis()).unwrap_or(u64::MAX)) - .unwrap_or(0) + .map_or(0, |d| u64::try_from(d.as_millis()).unwrap_or(u64::MAX)) } impl Supervisor { @@ -75,7 +74,7 @@ impl Supervisor { /// The process-global supervisor singleton. pub fn global() -> &'static Self { static GLOBAL: OnceLock = OnceLock::new(); - GLOBAL.get_or_init(Supervisor::new) + GLOBAL.get_or_init(Self::new) } /// Record that the stop path is live (or not) and refresh its heartbeat. From d67ec0cec2b4f7bcee2038a8848f1bb9fbaa786c Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 22:42:06 +0900 Subject: [PATCH 11/23] docs(computer-use): mark kill-switch verified live; gated execute_action next --- docs/computer-use/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md index 86dde2469..b0ae5feae 100644 --- a/docs/computer-use/README.md +++ b/docs/computer-use/README.md @@ -78,8 +78,8 @@ manual macOS E2E). 
| napi screenshot binding (`computerScreenshot`) | napi → `packages/natives` → TS, verified live | **done (this PR)** | | Native input orchestration (`input`) | `InputController` click/double_click/move/drag/scroll/type/keypress + release_all over an `EventSink` | **done (this PR)** — logic unit-tested; **live cursor-move injection verified** (Accessibility granted) | | Central `execute_action` state machine | preflight + supervisor + cancellation + audit + release-all | planned | -| Kill-switch supervisor + event-tap lifecycle | supervisor, hotkey, abort/release/suspend/snapshot | next — required before click/type are exposed | -| TS `computer` tool surface | full `ComputerController` + `computer.ts` schema/gating/prompt/renderer | planned | +| Kill-switch supervisor + global-hotkey event-tap | `supervisor` (fail-closed `input_allowed`, user-only reset) + `hotkey` CGEventTap on a CFRunLoop thread | **done (this PR)** — supervisor unit-tested; **synthetic-hotkey latch verified live** | +| Supervisor-gated `execute_action` + napi/TS `computer` tool | wire input through `input_allowed` + cancellation; `ComputerController` napi; `computer.ts` schema/gating/prompt/renderer | next | | Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | The remaining input backend, kill-switch, napi/TS surface, and manual From f6252cbc7428b68d3f9402da5ea1f49a4c17b33a Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 15:35:00 +0000 Subject: [PATCH 12/23] fix(computer-use): make computerScreenshot napi binding cross-platform The `computer_screenshot` napi function lived in the macOS-gated `computer::capture` module, so napi-rs omitted it from the generated `index.{js,d.ts}` on non-macOS targets. 
CI's Linux native build regenerated the bindings without `computerScreenshot`, breaking the `@gajae-code/natives` type check on `test/computer.test.ts` (TS2305: no exported member 'computerScreenshot'). Move the `ComputerScreenshot` struct and `computerScreenshot` binding into `computer::mod` (compiled on all platforms) and gate only the macOS CoreGraphics capture call internally, matching the `detectMacOSAppearance` pattern. Non-macOS callers receive a clear unsupported-platform error, and the generated TypeScript surface is now identical across platforms. --- crates/pi-natives/src/computer/capture.rs | 49 -------------------- crates/pi-natives/src/computer/mod.rs | 55 +++++++++++++++++++++++ packages/natives/native/index.d.ts | 8 ++-- 3 files changed, 60 insertions(+), 52 deletions(-) diff --git a/crates/pi-natives/src/computer/capture.rs b/crates/pi-natives/src/computer/capture.rs index ec4621a22..23efe9f8a 100644 --- a/crates/pi-natives/src/computer/capture.rs +++ b/crates/pi-natives/src/computer/capture.rs @@ -17,9 +17,6 @@ use std::{ffi::c_void, fmt}; -use napi::bindgen_prelude::Uint8Array; -use napi_derive::napi; - use crate::computer::coords::NormalizedDisplay; #[repr(C)] @@ -253,49 +250,3 @@ mod tests { std::fs::write("/tmp/computer-capture-evidence.png", &frame.png).ok(); } } - -/// A captured primary-display screenshot returned to JS. -/// -/// `width_px`/`height_px` are the physical pixels that define the action -/// coordinate space (see the coordinate contract); the scale/origin map them to -/// macOS logical points. -#[napi(object)] -pub struct ComputerScreenshot { - /// PNG-encoded image bytes. - pub png: Uint8Array, - /// Screenshot width in physical pixels. - pub width_px: u32, - /// Screenshot height in physical pixels. - pub height_px: u32, - /// Physical-pixels-per-logical-point along X. - pub scale_x: f64, - /// Physical-pixels-per-logical-point along Y. - pub scale_y: f64, - /// Logical origin X of the display (points). 
- pub origin_x: f64, - /// Logical origin Y of the display (points). - pub origin_y: f64, -} - -/// Capture the primary display for JS callers (macOS). -/// -/// Requires the Screen Recording permission. This is the read-only `screenshot` -/// primitive of the computer-use tool; input primitives land behind the same -/// surface once the Accessibility gate is satisfied in a granted `gjc` process. -/// -/// # Errors -/// Returns an error when capture fails (e.g. Screen Recording not granted). -#[napi(js_name = "computerScreenshot")] -pub fn computer_screenshot() -> napi::Result { - let frame = - capture_primary_display().map_err(|err| napi::Error::from_reason(format!("{err}")))?; - Ok(ComputerScreenshot { - png: Uint8Array::from(frame.png), - width_px: frame.display.width_px, - height_px: frame.display.height_px, - scale_x: frame.display.scale_x, - scale_y: frame.display.scale_y, - origin_x: frame.display.origin_x, - origin_y: frame.display.origin_y, - }) -} diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index a2724f540..15314d241 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -35,6 +35,61 @@ pub mod supervisor; pub use capture::{CaptureError, CapturedFrame, capture_primary_display}; pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; pub use input::{EventSink, InputController, InputError, MouseButton}; +use napi::bindgen_prelude::Uint8Array; +use napi_derive::napi; #[cfg(target_os = "macos")] pub use permissions::{PermissionError, PreflightStatus, TccPermission, preflight}; pub use supervisor::{Supervisor, SupervisorStatus}; + +/// A captured primary-display screenshot returned to JS. +/// +/// `width_px`/`height_px` are the physical pixels that define the action +/// coordinate space (see the coordinate contract); the scale/origin map them to +/// macOS logical points. +#[napi(object)] +pub struct ComputerScreenshot { + /// PNG-encoded image bytes. 
+ pub png: Uint8Array, + /// Screenshot width in physical pixels. + pub width_px: u32, + /// Screenshot height in physical pixels. + pub height_px: u32, + /// Physical-pixels-per-logical-point along X. + pub scale_x: f64, + /// Physical-pixels-per-logical-point along Y. + pub scale_y: f64, + /// Logical origin X of the display (points). + pub origin_x: f64, + /// Logical origin Y of the display (points). + pub origin_y: f64, +} + +/// Capture the primary display for JS callers (macOS). +/// +/// Requires the Screen Recording permission. This is the read-only `screenshot` +/// primitive of the computer-use tool; input primitives land behind the same +/// surface once the Accessibility gate is satisfied in a granted `gjc` process. +/// +/// # Errors +/// Returns an error when capture fails (e.g. Screen Recording not granted). +#[napi(js_name = "computerScreenshot")] +pub fn computer_screenshot() -> napi::Result { + #[cfg(target_os = "macos")] + { + let frame = capture::capture_primary_display() + .map_err(|err| napi::Error::from_reason(format!("{err}")))?; + Ok(ComputerScreenshot { + png: Uint8Array::from(frame.png), + width_px: frame.display.width_px, + height_px: frame.display.height_px, + scale_x: frame.display.scale_x, + scale_y: frame.display.scale_y, + origin_x: frame.display.origin_x, + origin_y: frame.display.origin_y, + }) + } + #[cfg(not(target_os = "macos"))] + { + Err(napi::Error::from_reason("computer screenshot capture is only supported on macOS")) + } +} diff --git a/packages/natives/native/index.d.ts b/packages/natives/native/index.d.ts index 9647f3695..cfce95e09 100644 --- a/packages/natives/native/index.d.ts +++ b/packages/natives/native/index.d.ts @@ -370,9 +370,11 @@ export interface ClipboardImage { export declare function computerScreenshot(): ComputerScreenshot /** - * A captured primary-display screenshot returned to JS. 
`width_px`/`height_px` - * are the physical pixels that define the action coordinate space (see the - * coordinate contract); the scale/origin map them to macOS logical points. + * A captured primary-display screenshot returned to JS. + * + * `width_px`/`height_px` are the physical pixels that define the action + * coordinate space (see the coordinate contract); the scale/origin map them to + * macOS logical points. */ export interface ComputerScreenshot { /** PNG-encoded image bytes. */ From 1a897df915bcb26b4291ce071265334cb2aa8b5d Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 00:45:41 +0900 Subject: [PATCH 13/23] feat(pi-natives): add supervisor-gated execute_action (computer-use G001) Adds crates/pi-natives/src/computer/executor.rs: the single side-effect authority. execute_input runs a fail-closed gate (supervisor live+fresh+ not-suspended, Accessibility granted, matching display epoch for coordinate actions) before dispatching to InputController, and runs release_all on any error or mid-flight suspension. Stable error codes (COMPUTER_SUSPENDED / _SUPERVISOR_NOT_LIVE / _PERMISSION_REQUIRED / _DISPLAY_STALE / _COORD_INVALID / _CANCELLED). DisplayContext trait defines the display-epoch staleness contract; PermissionGate is injectable. 9 unit tests with a real Supervisor + fake perms/display + recording sink cover every gate-rejection path, matching-epoch success, out-of-bounds release-all, type/keypress/wait, and stable codes. 37 computer tests pass; clippy clean. Added InputController::into_sink accessor. 
--- crates/pi-natives/src/computer/executor.rs | 443 +++++++++++++++++++++ crates/pi-natives/src/computer/input.rs | 7 + crates/pi-natives/src/computer/mod.rs | 1 + 3 files changed, 451 insertions(+) create mode 100644 crates/pi-natives/src/computer/executor.rs diff --git a/crates/pi-natives/src/computer/executor.rs b/crates/pi-natives/src/computer/executor.rs new file mode 100644 index 000000000..994496550 --- /dev/null +++ b/crates/pi-natives/src/computer/executor.rs @@ -0,0 +1,443 @@ +//! Central supervisor-gated execution for computer-use input. +//! +//! # Single side-effect authority +//! Every side-effecting input action passes [`execute_input`] before the +//! [`InputController`] touches the OS. The gate is fail-closed: it requires the +//! supervisor stop-path live + fresh + not-suspended, Accessibility granted, +//! and (for coordinate actions) a matching display epoch. `release_all` runs on +//! every non-success exit and whenever suspension is observed mid-flight, so a +//! partial drag never leaves a button held. Screenshot is read-only (see +//! [`super::capture`]) and is intentionally NOT gated here. +//! +//! The gate logic is OS-agnostic and unit-tested with a fake permission gate, +//! fake display context, a real [`Supervisor`], and a recording [`EventSink`]; +//! macOS supplies the concrete permission/display providers. + +use super::{ + coords::{CoordError, NormalizedDisplay}, + input::{EventSink, InputController, InputError, MouseButton}, + supervisor::Supervisor, +}; + +/// A side-effecting computer-use action (the 8 input primitives). Screenshot is +/// handled by the read-only capture path, not this executor. +#[derive(Debug, Clone, PartialEq)] +pub enum InputAction { + /// Move + click. + Click { x: f64, y: f64, button: MouseButton }, + /// Move + double click. + DoubleClick { x: f64, y: f64, button: MouseButton }, + /// Move the cursor. + Move { x: f64, y: f64 }, + /// Press, drag, release. 
+ Drag { x: f64, y: f64, to_x: f64, to_y: f64, button: MouseButton }, + /// Move + scroll by logical deltas. + Scroll { x: f64, y: f64, scroll_x: f64, scroll_y: f64 }, + /// Type a unicode string. + Type { text: String }, + /// Press/release named keys in order. + Keypress { keys: Vec }, + /// Abort-aware wait. + Wait { ms: u64 }, +} + +impl InputAction { + /// Whether the action targets a screenshot-space coordinate (and so needs a + /// fresh, matching display epoch). + #[must_use] + pub const fn is_coordinate(&self) -> bool { + matches!( + self, + Self::Click { .. } + | Self::DoubleClick { .. } + | Self::Move { .. } + | Self::Drag { .. } + | Self::Scroll { .. } + ) + } +} + +/// Reason an action was rejected or failed. Each maps to a stable error code so +/// the TS tool can surface consistent, actionable messages. +#[derive(Debug, Clone, PartialEq)] +pub enum ExecError { + /// Kill-switch latched; input stays off until a user-only reset. + Suspended, + /// The global stop path is not live/fresh; input is disabled fail-closed. + SupervisorNotLive, + /// Accessibility is not granted; no input may be injected. + PermissionRequired, + /// The display changed since the screenshot the coordinates came from. + DisplayStale, + /// A coordinate was out of bounds / non-finite / invalid scale. + Coord(CoordError), + /// The action was cancelled (AbortSignal/timeout/supervisor stop). + Cancelled, + /// A key name was not recognized. + UnknownKey(String), +} + +impl ExecError { + /// Stable error code string for the TS surface. 
+ #[must_use] + pub const fn code(&self) -> &'static str { + match self { + Self::Suspended => "COMPUTER_SUSPENDED", + Self::SupervisorNotLive => "COMPUTER_SUPERVISOR_NOT_LIVE", + Self::PermissionRequired => "COMPUTER_PERMISSION_REQUIRED", + Self::DisplayStale => "COMPUTER_DISPLAY_STALE", + Self::Coord(_) => "COMPUTER_COORD_INVALID", + Self::Cancelled => "COMPUTER_CANCELLED", + Self::UnknownKey(_) => "COMPUTER_UNKNOWN_KEY", + } + } +} + +impl From for ExecError { + fn from(value: InputError) -> Self { + match value { + InputError::Coord(err) => Self::Coord(err), + InputError::UnknownKey(key) => Self::UnknownKey(key), + } + } +} + +impl std::fmt::Display for ExecError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Coord(err) => write!(f, "{}: {err}", self.code()), + Self::UnknownKey(key) => write!(f, "{}: {key}", self.code()), + _ => write!(f, "{}", self.code()), + } + } +} + +impl std::error::Error for ExecError {} + +/// Provides the current Accessibility (input) grant state. macOS implements +/// this over `permissions::accessibility_granted`; tests inject a fake. +pub trait PermissionGate { + /// Whether Accessibility is currently granted. + fn accessibility_granted(&self) -> bool; +} + +/// Provides the current display epoch so coordinate actions can reject stale +/// screenshots. macOS implements this over the capture/display descriptor. +pub trait DisplayContext { + /// The current display epoch (hash of topology/scale/origin). + fn current_epoch(&self) -> u64; +} + +/// Fail-closed gate run before any side-effecting input. 
+fn gate( + action: &InputAction, + supervisor: &Supervisor, + perms: &P, + display_ctx: &D, + expected_epoch: Option, +) -> Result<(), ExecError> { + let status = supervisor.status(); + if status.suspended { + return Err(ExecError::Suspended); + } + if !status.hotkey_live || !status.heartbeat_fresh { + return Err(ExecError::SupervisorNotLive); + } + if !perms.accessibility_granted() { + return Err(ExecError::PermissionRequired); + } + if action.is_coordinate() { + if let Some(expected) = expected_epoch { + if display_ctx.current_epoch() != expected { + return Err(ExecError::DisplayStale); + } + } + } + Ok(()) +} + +/// Execute a side-effecting input action through the fail-closed gate. +/// +/// `cancelled` is polled before and (for multi-step actions) reflected via the +/// controller; on any error or observed suspension, `release_all` runs so no +/// mouse button or modifier is left held. +/// +/// # Errors +/// Returns [`ExecError`] when the gate rejects (suspended / not-live / +/// permission / stale display), the action is cancelled, or the controller +/// reports a coordinate/key error. +pub fn execute_input( + action: &InputAction, + supervisor: &Supervisor, + perms: &P, + display_ctx: &D, + expected_epoch: Option, + display: &NormalizedDisplay, + controller: &mut InputController, + cancelled: &dyn Fn() -> bool, +) -> Result<(), ExecError> +where + S: EventSink, + P: PermissionGate, + D: DisplayContext, +{ + gate(action, supervisor, perms, display_ctx, expected_epoch)?; + if cancelled() { + return Err(ExecError::Cancelled); + } + + let result = dispatch(action, display, controller, cancelled); + + // release_all on any failure, or if the kill-switch latched mid-action. 
+ if result.is_err() || supervisor.is_suspended() { + controller.release_all(); + } + result +} + +fn dispatch( + action: &InputAction, + display: &NormalizedDisplay, + controller: &mut InputController, + cancelled: &dyn Fn() -> bool, +) -> Result<(), ExecError> { + match action { + InputAction::Click { x, y, button } => controller + .click(display, *x, *y, *button) + .map_err(Into::into), + InputAction::DoubleClick { x, y, button } => controller + .double_click(display, *x, *y, *button) + .map_err(Into::into), + InputAction::Move { x, y } => controller.move_to(display, *x, *y).map_err(Into::into), + InputAction::Drag { x, y, to_x, to_y, button } => controller + .drag(display, *x, *y, *to_x, *to_y, *button) + .map_err(Into::into), + InputAction::Scroll { x, y, scroll_x, scroll_y } => controller + .scroll(display, *x, *y, *scroll_x, *scroll_y) + .map_err(Into::into), + InputAction::Type { text } => { + controller.type_text(text); + Ok(()) + }, + InputAction::Keypress { keys } => controller.keypress(keys).map_err(Into::into), + InputAction::Wait { ms } => wait_abortable(*ms, cancelled), + } +} + +/// Sleep up to `ms`, checking `cancelled` periodically. 
+fn wait_abortable(ms: u64, cancelled: &dyn Fn() -> bool) -> Result<(), ExecError> { + use std::time::{Duration, Instant}; + let deadline = Instant::now() + Duration::from_millis(ms); + while Instant::now() < deadline { + if cancelled() { + return Err(ExecError::Cancelled); + } + std::thread::sleep(Duration::from_millis(ms.min(10))); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::{DisplayContext, ExecError, InputAction, PermissionGate, execute_input}; + use crate::computer::{ + coords::{LogicalPoint, NormalizedDisplay}, + input::{EventSink, InputController, MouseButton, SinkOp}, + supervisor::Supervisor, + }; + + struct FakePerms { + granted: bool, + } + impl PermissionGate for FakePerms { + fn accessibility_granted(&self) -> bool { + self.granted + } + } + + struct FakeDisplay { + epoch: u64, + } + impl DisplayContext for FakeDisplay { + fn current_epoch(&self) -> u64 { + self.epoch + } + } + + #[derive(Default)] + struct RecordingSink { + ops: Vec, + } + impl EventSink for RecordingSink { + fn move_cursor(&mut self, to: LogicalPoint) { + self.ops.push(SinkOp::Move(to)); + } + + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool) { + self.ops.push(SinkOp::Button { at, button, down }); + } + + fn scroll(&mut self, dx: f64, dy: f64) { + self.ops.push(SinkOp::Scroll { dx, dy }); + } + + fn type_unicode(&mut self, text: &str) { + self.ops.push(SinkOp::TypeUnicode(text.to_string())); + } + + fn key(&mut self, code: u16, down: bool) { + self.ops.push(SinkOp::Key { code, down }); + } + } + + fn display() -> NormalizedDisplay { + NormalizedDisplay::new(200, 100, 2.0, 2.0, 0.0, 0.0) + } + + fn live_supervisor() -> Supervisor { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat(); + s + } + + fn never_cancel() -> impl Fn() -> bool { + || false + } + + fn run( + action: &InputAction, + sup: &Supervisor, + granted: bool, + expected_epoch: Option, + current_epoch: u64, + ) -> (Result<(), ExecError>, Vec) { + let mut 
controller = InputController::new(RecordingSink::default()); + let perms = FakePerms { granted }; + let disp_ctx = FakeDisplay { epoch: current_epoch }; + let cancel = never_cancel(); + let res = execute_input( + action, + sup, + &perms, + &disp_ctx, + expected_epoch, + &display(), + &mut controller, + &cancel, + ); + (res, controller.into_sink().ops) + } + + #[test] + fn suspended_rejects_before_any_sink_op() { + let sup = live_supervisor(); + sup.trigger_stop(); + let (res, ops) = run(&InputAction::Move { x: 10.0, y: 10.0 }, &sup, true, None, 0); + assert_eq!(res, Err(ExecError::Suspended)); + assert!(ops.is_empty(), "no events when suspended"); + } + + #[test] + fn not_live_rejects() { + let sup = Supervisor::new(); // hotkey not live + let (res, ops) = run( + &InputAction::Click { x: 1.0, y: 1.0, button: MouseButton::Left }, + &sup, + true, + None, + 0, + ); + assert_eq!(res, Err(ExecError::SupervisorNotLive)); + assert!(ops.is_empty()); + } + + #[test] + fn missing_accessibility_rejects() { + let sup = live_supervisor(); + let (res, ops) = run(&InputAction::Move { x: 1.0, y: 1.0 }, &sup, false, None, 0); + assert_eq!(res, Err(ExecError::PermissionRequired)); + assert!(ops.is_empty()); + } + + #[test] + fn stale_display_epoch_rejects_coordinate_action() { + let sup = live_supervisor(); + let (res, ops) = run( + &InputAction::Click { x: 1.0, y: 1.0, button: MouseButton::Left }, + &sup, + true, + Some(7), + 9, + ); + assert_eq!(res, Err(ExecError::DisplayStale)); + assert!(ops.is_empty()); + } + + #[test] + fn matching_epoch_allows_action() { + let sup = live_supervisor(); + let (res, ops) = run( + &InputAction::Click { x: 100.0, y: 50.0, button: MouseButton::Left }, + &sup, + true, + Some(7), + 7, + ); + assert!(res.is_ok()); + assert!(!ops.is_empty()); + } + + #[test] + fn out_of_bounds_coordinate_errors_and_releases() { + let sup = live_supervisor(); + // drag to out-of-bounds: press happens then error -> release_all leaves nothing + // held. 
+ let action = InputAction::Drag { + x: 0.0, + y: 0.0, + to_x: 999.0, + to_y: 0.0, + button: MouseButton::Left, + }; + let (res, ops) = run(&action, &sup, true, None, 0); + assert!(matches!(res, Err(ExecError::Coord(_)))); + let downs = ops + .iter() + .filter(|o| matches!(o, SinkOp::Button { down: true, .. })) + .count(); + let ups = ops + .iter() + .filter(|o| matches!(o, SinkOp::Button { down: false, .. })) + .count(); + assert_eq!(downs, ups, "every press is released after the error path"); + } + + #[test] + fn type_and_keypress_pass_the_gate() { + let sup = live_supervisor(); + let (res, ops) = run(&InputAction::Type { text: "hi".to_string() }, &sup, true, None, 0); + assert!(res.is_ok()); + assert_eq!(ops, vec![SinkOp::TypeUnicode("hi".to_string())]); + + let (res2, ops2) = + run(&InputAction::Keypress { keys: vec!["enter".to_string()] }, &sup, true, None, 0); + assert!(res2.is_ok()); + assert_eq!(ops2.len(), 2); // key down + up + } + + #[test] + fn wait_zero_is_ok() { + let sup = live_supervisor(); + let (res, _) = run(&InputAction::Wait { ms: 0 }, &sup, true, None, 0); + assert!(res.is_ok()); + } + + #[test] + fn error_codes_are_stable() { + assert_eq!(ExecError::Suspended.code(), "COMPUTER_SUSPENDED"); + assert_eq!(ExecError::SupervisorNotLive.code(), "COMPUTER_SUPERVISOR_NOT_LIVE"); + assert_eq!(ExecError::PermissionRequired.code(), "COMPUTER_PERMISSION_REQUIRED"); + assert_eq!(ExecError::DisplayStale.code(), "COMPUTER_DISPLAY_STALE"); + } +} diff --git a/crates/pi-natives/src/computer/input.rs b/crates/pi-natives/src/computer/input.rs index 924e52389..61b538d69 100644 --- a/crates/pi-natives/src/computer/input.rs +++ b/crates/pi-natives/src/computer/input.rs @@ -130,6 +130,13 @@ impl InputController { !self.held_buttons.is_empty() } + /// Consume the controller and return the underlying sink (e.g. to inspect + /// recorded events in tests). 
+ #[must_use] + pub fn into_sink(self) -> S { + self.sink + } + fn press(&mut self, at: LogicalPoint, button: MouseButton) { self.sink.mouse_button(at, button, true); if !self.held_buttons.contains(&button) { diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index 15314d241..53e5554c5 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -24,6 +24,7 @@ #[cfg(target_os = "macos")] pub mod capture; pub mod coords; +pub mod executor; #[cfg(target_os = "macos")] pub mod hotkey; pub mod input; From d59fffb73f4622c42e3b41fef411e40ab5a908f3 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 01:18:57 +0900 Subject: [PATCH 14/23] feat(computer-use): napi ComputerController (G002) + ultragoal red-team gate (G004) G002 (pi-natives): capture.rs gains display_epoch (geometry hash) + capture_id + lightweight current_display_epoch() (no Screen Recording); executor.rs gains MacPermissionGate/MacDisplayContext providers; new controller.rs exposes a #[napi] ComputerController whose 9 methods are thin adapters that all route through executor::execute_input (the single side-effect authority); bypass_guard.rs statically asserts InputController side-effect methods are only called from input.rs/executor.rs. Bindings regenerated. cargo test computer:: = 38 pass (incl bypass guard), clippy clean. G004 (ultragoal gate): ultragoal-runtime.ts gains a trusted changeSet data-flow (checkpoint+review), computer-touching detection from trusted changed paths (declarations additive-only), a mandatory computer adversarial case-set (7 IDs, no not_applicable) requiring live/structural native proof (inline/metadata/receipt-only fail with COMPUTER_REDTEAM_*), computer/native surface tokens, docs-only tiering, and byte-for-byte non-computer compatibility. New fixture matrix: 7 pass; ultragoal-runtime 102 pass + review 8 pass (non-regression). 
--- .../pi-natives/src/computer/bypass_guard.rs | 44 +++ crates/pi-natives/src/computer/capture.rs | 103 +++++-- crates/pi-natives/src/computer/controller.rs | 169 +++++++++++ crates/pi-natives/src/computer/executor.rs | 20 ++ crates/pi-natives/src/computer/mod.rs | 42 ++- .../src/gjc-runtime/ultragoal-runtime.ts | 282 ++++++++++++++++-- .../computer-red-team-fixtures.test.ts | 262 ++++++++++++++++ packages/natives/native/index.d.ts | 17 ++ packages/natives/native/index.js | 1 + packages/natives/test/computer.test.ts | 24 +- 10 files changed, 907 insertions(+), 57 deletions(-) create mode 100644 crates/pi-natives/src/computer/bypass_guard.rs create mode 100644 crates/pi-natives/src/computer/controller.rs create mode 100644 packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts diff --git a/crates/pi-natives/src/computer/bypass_guard.rs b/crates/pi-natives/src/computer/bypass_guard.rs new file mode 100644 index 000000000..09a1ad437 --- /dev/null +++ b/crates/pi-natives/src/computer/bypass_guard.rs @@ -0,0 +1,44 @@ +#[cfg(test)] +mod tests { + use std::{fs, path::Path}; + + const SIDE_EFFECT_METHODS: &[&str] = + &[".click(", ".double_click(", ".drag(", ".scroll(", ".type_text(", ".keypress("]; + + #[test] + fn input_controller_side_effect_methods_stay_behind_executor() { + let computer_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("src/computer"); + let mut violations = Vec::new(); + + for entry in fs::read_dir(&computer_dir).expect("computer module directory is readable") { + let entry = entry.expect("computer module entry is readable"); + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) != Some("rs") { + continue; + } + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or_default(); + if file_name == "bypass_guard.rs" { + continue; + } + let source = fs::read_to_string(&path).expect("computer module source is readable"); + for method in SIDE_EFFECT_METHODS { + if !source.contains(method) 
{ + continue; + } + if file_name != "input.rs" && file_name != "executor.rs" { + violations.push(format!("{file_name} references {method}")); + } + } + } + + assert!( + violations.is_empty(), + "InputController side-effect methods must be referenced only in input.rs and \ + executor.rs: {}", + violations.join(", ") + ); + } +} diff --git a/crates/pi-natives/src/computer/capture.rs b/crates/pi-natives/src/computer/capture.rs index 23efe9f8a..b8dc0a79f 100644 --- a/crates/pi-natives/src/computer/capture.rs +++ b/crates/pi-natives/src/computer/capture.rs @@ -15,7 +15,13 @@ //! Implemented with raw CoreGraphics FFI (no extra crates); the buffer is owned //! Rust memory and every Core Graphics handle is released exactly once. -use std::{ffi::c_void, fmt}; +use std::{ + collections::hash_map::DefaultHasher, + ffi::c_void, + fmt, + hash::{Hash, Hasher}, + sync::atomic::{AtomicU64, Ordering}, +}; use crate::computer::coords::NormalizedDisplay; @@ -56,6 +62,8 @@ unsafe extern "C" { fn CGMainDisplayID() -> CgDirectDisplayId; fn CGDisplayBounds(display: CgDirectDisplayId) -> CgRect; fn CGDisplayCreateImage(display: CgDirectDisplayId) -> CgImageRef; + fn CGDisplayPixelsWide(display: CgDirectDisplayId) -> usize; + fn CGDisplayPixelsHigh(display: CgDirectDisplayId) -> usize; fn CGImageGetWidth(image: CgImageRef) -> usize; fn CGImageGetHeight(image: CgImageRef) -> usize; fn CGImageRelease(image: CgImageRef); @@ -100,12 +108,18 @@ impl fmt::Display for CaptureError { impl std::error::Error for CaptureError {} +static NEXT_CAPTURE_ID: AtomicU64 = AtomicU64::new(1); + /// A captured primary-display frame. pub struct CapturedFrame { /// Coordinate descriptor for the captured display. - pub display: NormalizedDisplay, + pub display: NormalizedDisplay, /// PNG-encoded RGBA image bytes. - pub png: Vec, + pub png: Vec, + /// Stable hash of the display geometry used for stale-display checks. + pub display_epoch: u64, + /// Process-local opaque capture id. 
+ pub capture_id: u32, } /// Capture the current primary display as a PNG plus its coordinate descriptor. @@ -115,14 +129,19 @@ pub struct CapturedFrame { /// Screen Recording grant), a bitmap context cannot be created, or PNG encoding /// fails. pub fn capture_primary_display() -> Result { - // SAFETY: both calls are pure Core Graphics queries; `CGMainDisplayID` - // returns a valid id for the active primary display and `CGDisplayBounds` - // reads geometry for that id. - let (display_id, bounds) = unsafe { + // SAFETY: pure Core Graphics geometry queries for the active primary display; + // no image capture occurs before `CGDisplayCreateImage` below. + let (display_id, display) = unsafe { let id = CGMainDisplayID(); - (id, CGDisplayBounds(id)) + let bounds = CGDisplayBounds(id); + let pixels_wide = CGDisplayPixelsWide(id); + let pixels_high = CGDisplayPixelsHigh(id); + (id, display_descriptor(pixels_wide, pixels_high, bounds)) }; + let display_epoch = display_epoch(&display); + let capture_id = next_capture_id(); + // SAFETY: `display_id` is a valid primary-display id. The returned image is // released exactly once below regardless of the `frame_from_image` result. let image = unsafe { CGDisplayCreateImage(display_id) }; @@ -130,16 +149,27 @@ pub fn capture_primary_display() -> Result { return Err(CaptureError::CaptureFailed); } - let result = frame_from_image(image, bounds); + let result = frame_from_image(image, display, display_epoch, capture_id); // SAFETY: `image` is non-null (checked above) and not used after release. unsafe { CGImageRelease(image) }; result } +#[must_use] +pub fn current_display_epoch() -> u64 { + let display = current_display_descriptor(); + display_epoch(&display) +} + /// Convert a non-null `CGImage` into a [`CapturedFrame`]. Does not release /// `image`; the caller owns its lifetime. 
-fn frame_from_image(image: CgImageRef, bounds: CgRect) -> Result { +fn frame_from_image( + image: CgImageRef, + display: NormalizedDisplay, + display_epoch: u64, + capture_id: u32, +) -> Result { // SAFETY: `image` is non-null per the caller's check. let (width, height) = unsafe { (CGImageGetWidth(image), CGImageGetHeight(image)) }; if width == 0 || height == 0 { @@ -189,20 +219,8 @@ fn frame_from_image(image: CgImageRef, bounds: CgRect) -> Result f64 { if logical > 0.0 { pixels / logical } else { 1.0 } } +fn current_display_descriptor() -> NormalizedDisplay { + // SAFETY: pure Core Graphics geometry queries for the active primary display; + // no image capture or Screen Recording permission is involved. + unsafe { + let display_id = CGMainDisplayID(); + let bounds = CGDisplayBounds(display_id); + display_descriptor(CGDisplayPixelsWide(display_id), CGDisplayPixelsHigh(display_id), bounds) + } +} + +fn display_descriptor(width: usize, height: usize, bounds: CgRect) -> NormalizedDisplay { + let scale_x = derive_scale(width as f64, bounds.size.width); + let scale_y = derive_scale(height as f64, bounds.size.height); + NormalizedDisplay::new( + width as u32, + height as u32, + scale_x, + scale_y, + bounds.origin.x, + bounds.origin.y, + ) +} + +fn display_epoch(display: &NormalizedDisplay) -> u64 { + let mut hasher = DefaultHasher::new(); + display.width_px.hash(&mut hasher); + display.height_px.hash(&mut hasher); + display.scale_x.to_bits().hash(&mut hasher); + display.scale_y.to_bits().hash(&mut hasher); + display.origin_x.to_bits().hash(&mut hasher); + display.origin_y.to_bits().hash(&mut hasher); + hasher.finish() +} + +fn next_capture_id() -> u32 { + let id = NEXT_CAPTURE_ID.fetch_add(1, Ordering::Relaxed); + ((id - 1) % u64::from(u32::MAX) + 1) as u32 +} + fn encode_png(rgba: &[u8], width: u32, height: u32) -> Result, CaptureError> { use image::{ExtendedColorType, ImageEncoder, codecs::png::PngEncoder}; diff --git a/crates/pi-natives/src/computer/controller.rs 
b/crates/pi-natives/src/computer/controller.rs new file mode 100644 index 000000000..1faaa921b --- /dev/null +++ b/crates/pi-natives/src/computer/controller.rs @@ -0,0 +1,169 @@ +//! N-API controller surface for macOS computer-use. +//! +//! Side-effecting methods are thin adapters: they construct an [`InputAction`] +//! and delegate to [`execute_input`]. No direct input controller methods are +//! called from this module. + +use napi::bindgen_prelude::Uint8Array; +use napi_derive::napi; + +use crate::computer::{ + ComputerScreenshot, + capture::capture_primary_display, + executor::{ExecError, InputAction, MacDisplayContext, MacPermissionGate, execute_input}, + hotkey, + input::{MouseButton, guarded_controller}, + supervisor::Supervisor, +}; + +#[napi] +pub struct ComputerController; + +#[napi] +impl ComputerController { + #[napi(constructor)] + pub fn new() -> Self { + Self + } + + #[napi] + pub fn screenshot(&self) -> napi::Result { + let frame = + capture_primary_display().map_err(|err| napi::Error::from_reason(format!("{err}")))?; + Ok(ComputerScreenshot { + png: Uint8Array::from(frame.png), + width_px: frame.display.width_px, + height_px: frame.display.height_px, + scale_x: frame.display.scale_x, + scale_y: frame.display.scale_y, + origin_x: frame.display.origin_x, + origin_y: frame.display.origin_y, + display_epoch: frame.display_epoch as f64, + capture_id: frame.capture_id, + }) + } + + #[napi] + pub fn click( + &self, + expected_epoch: Option, + x: f64, + y: f64, + button: Option, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Click { x, y, button: parse_button(button)? }) + } + + #[napi(js_name = "doubleClick")] + pub fn double_click( + &self, + expected_epoch: Option, + x: f64, + y: f64, + button: Option, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::DoubleClick { x, y, button: parse_button(button)? 
}) + } + + #[napi] + pub fn move_(&self, expected_epoch: Option, x: f64, y: f64) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Move { x, y }) + } + + #[napi] + pub fn drag( + &self, + expected_epoch: Option, + x: f64, + y: f64, + to_x: f64, + to_y: f64, + button: Option, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Drag { + x, + y, + to_x, + to_y, + button: parse_button(button)?, + }) + } + + #[napi] + pub fn scroll( + &self, + expected_epoch: Option, + x: f64, + y: f64, + scroll_x: f64, + scroll_y: f64, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Scroll { x, y, scroll_x, scroll_y }) + } + + #[napi(js_name = "type")] + pub fn type_(&self, expected_epoch: Option, text: String) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Type { text }) + } + + #[napi] + pub fn keypress(&self, expected_epoch: Option, keys: Vec) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Keypress { keys }) + } + + #[napi] + pub fn wait(&self, expected_epoch: Option, ms: u32) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Wait { ms: u64::from(ms) }) + } + + fn execute(&self, expected_epoch: Option, action: InputAction) -> napi::Result<()> { + hotkey::start(); + let frame = + capture_primary_display().map_err(|err| napi::Error::from_reason(format!("{err}")))?; + let display = frame.display; + let mut controller = guarded_controller() + .map_err(|err| napi_error("COMPUTER_PERMISSION_REQUIRED", err.to_string()))?; + let cancel = || Supervisor::global().is_suspended(); + execute_input( + &action, + Supervisor::global(), + &MacPermissionGate, + &MacDisplayContext, + expected_epoch.map(epoch_from_f64), + &display, + &mut controller, + &cancel, + ) + .map_err(exec_error) + } +} + +fn parse_button(button: Option) -> napi::Result { + match button + .as_deref() + .unwrap_or("left") + .to_ascii_lowercase() + .as_str() + { + "left" => Ok(MouseButton::Left), + "right" => 
Ok(MouseButton::Right), + "center" | "middle" => Ok(MouseButton::Center), + other => Err(napi_error("COMPUTER_COORD_INVALID", format!("unknown mouse button: {other}"))), + } +} + +fn epoch_from_f64(value: f64) -> u64 { + if value.is_finite() && value >= 0.0 { + value as u64 + } else { + u64::MAX + } +} + +fn exec_error(err: ExecError) -> napi::Error { + napi_error(err.code(), err.to_string()) +} + +fn napi_error(code: &'static str, reason: String) -> napi::Error { + napi::Error::new(napi::Status::GenericFailure, format!("{code}: {reason}")) +} diff --git a/crates/pi-natives/src/computer/executor.rs b/crates/pi-natives/src/computer/executor.rs index 994496550..87a0131dc 100644 --- a/crates/pi-natives/src/computer/executor.rs +++ b/crates/pi-natives/src/computer/executor.rs @@ -128,6 +128,26 @@ pub trait DisplayContext { fn current_epoch(&self) -> u64; } +#[cfg(target_os = "macos")] +pub struct MacPermissionGate; + +#[cfg(target_os = "macos")] +impl PermissionGate for MacPermissionGate { + fn accessibility_granted(&self) -> bool { + crate::computer::permissions::accessibility_granted() + } +} + +#[cfg(target_os = "macos")] +pub struct MacDisplayContext; + +#[cfg(target_os = "macos")] +impl DisplayContext for MacDisplayContext { + fn current_epoch(&self) -> u64 { + crate::computer::capture::current_display_epoch() + } +} + /// Fail-closed gate run before any side-effecting input. fn gate( action: &InputAction, diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs index 53e5554c5..567a2daa9 100644 --- a/crates/pi-natives/src/computer/mod.rs +++ b/crates/pi-natives/src/computer/mod.rs @@ -21,8 +21,12 @@ //! -> pi-natives::computer (execute_action state machine + backend) //! 
``` +#[cfg(test)] +mod bypass_guard; #[cfg(target_os = "macos")] pub mod capture; +#[cfg(target_os = "macos")] +pub mod controller; pub mod coords; pub mod executor; #[cfg(target_os = "macos")] @@ -33,7 +37,9 @@ pub mod permissions; pub mod supervisor; #[cfg(target_os = "macos")] -pub use capture::{CaptureError, CapturedFrame, capture_primary_display}; +pub use capture::{CaptureError, CapturedFrame, capture_primary_display, current_display_epoch}; +#[cfg(target_os = "macos")] +pub use controller::ComputerController; pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; pub use input::{EventSink, InputController, InputError, MouseButton}; use napi::bindgen_prelude::Uint8Array; @@ -50,19 +56,23 @@ pub use supervisor::{Supervisor, SupervisorStatus}; #[napi(object)] pub struct ComputerScreenshot { /// PNG-encoded image bytes. - pub png: Uint8Array, + pub png: Uint8Array, /// Screenshot width in physical pixels. - pub width_px: u32, + pub width_px: u32, /// Screenshot height in physical pixels. - pub height_px: u32, + pub height_px: u32, /// Physical-pixels-per-logical-point along X. - pub scale_x: f64, + pub scale_x: f64, /// Physical-pixels-per-logical-point along Y. - pub scale_y: f64, + pub scale_y: f64, /// Logical origin X of the display (points). - pub origin_x: f64, + pub origin_x: f64, /// Logical origin Y of the display (points). - pub origin_y: f64, + pub origin_y: f64, + /// Stable hash of the display geometry used for stale-display checks. + pub display_epoch: f64, + /// Process-local opaque capture id. + pub capture_id: u32, } /// Capture the primary display for JS callers (macOS). 
@@ -80,13 +90,15 @@ pub fn computer_screenshot() -> napi::Result { let frame = capture::capture_primary_display() .map_err(|err| napi::Error::from_reason(format!("{err}")))?; Ok(ComputerScreenshot { - png: Uint8Array::from(frame.png), - width_px: frame.display.width_px, - height_px: frame.display.height_px, - scale_x: frame.display.scale_x, - scale_y: frame.display.scale_y, - origin_x: frame.display.origin_x, - origin_y: frame.display.origin_y, + png: Uint8Array::from(frame.png), + width_px: frame.display.width_px, + height_px: frame.display.height_px, + scale_x: frame.display.scale_x, + scale_y: frame.display.scale_y, + origin_x: frame.display.origin_x, + origin_y: frame.display.origin_y, + display_epoch: frame.display_epoch as f64, + capture_id: frame.capture_id, }) } #[cfg(not(target_os = "macos"))] diff --git a/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts b/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts index c450113d1..1ac172a44 100644 --- a/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts +++ b/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts @@ -801,13 +801,119 @@ function evidenceKindMatches(kind: string, words: string[]): boolean { type SurfaceFamily = "web" | "cli" | "native" | "api-package" | "algorithm-math" | "unknown"; +type UltragoalChangeStatus = "added" | "modified" | "deleted" | "renamed" | "copied" | "unknown"; +type UltragoalChangeCategory = + | "code" + | "generated-binding" + | "tool" + | "settings-registry" + | "prompt-doc-behavior" + | "docs-static" + | "other"; +interface UltragoalChangeSetPath extends JsonObject { + path: string; + status: UltragoalChangeStatus; + oldPath?: string; + category?: UltragoalChangeCategory; +} +interface UltragoalChangeSet extends JsonObject { + source: "checkpoint-git" | "review-pr" | "review-branch" | "review-worktree" | "review-spec"; + baseRef?: string; + headRef?: string; + mergeBase?: string; + paths: UltragoalChangeSetPath[]; + rawDiffStat?: string; + 
trusted: true; +} + +const COMPUTER_SURFACE_TOKENS = new Set(["computer", "computer-use", "desktop-input", "native-input"]); +const MANDATORY_COMPUTER_CASE_IDS = [ + "kill-switch-bypass", + "suspended-enforcement", + "permission-revoked", + "display-stale", + "out-of-bounds-drift", + "runaway-loop-halt", + "blast-radius", +] as const; + +function normalizeRepoPath(value: string): string { + return value.replaceAll("\\\\", "/").replace(/^\.\//, ""); +} + +function categorizeComputerChangePath(value: string): UltragoalChangeCategory { + const normalized = normalizeRepoPath(value); + if (normalized.startsWith("crates/pi-natives/src/computer/")) return "code"; + if (/^packages\/natives\/native\/index\.(?:d\.ts|js)$/.test(normalized)) return "generated-binding"; + if ( + normalized === "packages/coding-agent/src/tools/computer.ts" || + normalized.startsWith("packages/coding-agent/src/tools/computer/") + ) + return "tool"; + if ( + normalized === "packages/coding-agent/src/tools/index.ts" || + normalized === "packages/coding-agent/src/tools/renderers.ts" || + normalized === "packages/coding-agent/src/config/settings-schema.ts" + ) + return "settings-registry"; + if ( + normalized === "packages/coding-agent/src/prompts/tools/computer.md" || + normalized === "packages/coding-agent/src/defaults/gjc/skills/ultragoal/SKILL.md" || + normalized === "packages/coding-agent/src/prompts/agents/executor.md" + ) + return "prompt-doc-behavior"; + if (normalized === "docs/tools/computer.md" || normalized === "docs/computer-use/README.md") return "docs-static"; + return "other"; +} + +function isComputerChangePath(row: UltragoalChangeSetPath): boolean { + return categorizeComputerChangePath(row.path) !== "other" || (row.oldPath ? 
categorizeComputerChangePath(row.oldPath) !== "other" : false); +} + +function isDocsOnlyStaticComputerChangeSet(changeSet: UltragoalChangeSet | undefined): boolean { + if (!changeSet || changeSet.paths.length === 0) return false; + return changeSet.paths.every(row => { + const category = row.category ?? categorizeComputerChangePath(row.path); + const oldCategory = row.oldPath ? categorizeComputerChangePath(row.oldPath) : category; + return category === "docs-static" && oldCategory === "docs-static"; + }); +} + +function trustedChangeSetRequiresComputerSuite(changeSet: UltragoalChangeSet | undefined): boolean { + if (!changeSet || !changeSet.trusted) return false; + if (isDocsOnlyStaticComputerChangeSet(changeSet)) return false; + return changeSet.paths.some(isComputerChangePath); +} + +function executorQaDeclaresComputerTouching(executorQa: JsonObject): boolean { + if (executorQa.computerTouching === true) return true; + const surfaces = Array.isArray(executorQa.surfaces) ? executorQa.surfaces : []; + if (surfaces.some(value => typeof value === "string" && COMPUTER_SURFACE_TOKENS.has(normalizeSurfaceToken(value)))) return true; + const surfaceRows = Array.isArray(executorQa.surfaceEvidence) ? executorQa.surfaceEvidence : []; + return surfaceRows.some(row => { + const object = qualityGateObject(row); + const surface = object ? nonEmptyString(object.surface) : null; + return surface ? COMPUTER_SURFACE_TOKENS.has(normalizeSurfaceToken(surface)) : false; + }); +} + +function requiresComputerRedTeamSuite(executorQa: JsonObject, changeSet: UltragoalChangeSet | undefined): boolean { + if (trustedChangeSetRequiresComputerSuite(changeSet)) return true; + const declaredPaths = Array.isArray(executorQa.changedPaths) ? 
executorQa.changedPaths : []; + return declaredPaths.some(value => typeof value === "string" && categorizeComputerChangePath(value) !== "other"); +} + +function normalizeAdversarialCaseId(value: string): string { + return normalizeSurfaceToken(value).replace(/\s+/g, "-"); +} + export function normalizeSurfaceToken(value: string): string { return value.toLowerCase().replaceAll("_", "-").trim(); } export function surfaceFamily(value: string): SurfaceFamily { const normalized = normalizeSurfaceToken(value); - if (["native", "desktop", "tui"].some(word => normalized.includes(word))) return "native"; + if (["computer", "computer-use", "desktop-input", "native-input", "native", "desktop", "tui"].some(word => normalized.includes(word))) return "native"; if (["gui", "web", "browser", "ui", "visual"].some(word => normalized.includes(word))) return "web"; if (["cli", "terminal", "command"].some(word => normalized.includes(word))) return "cli"; if (["api", "package", "library", "sdk"].some(word => normalized.includes(word))) return "api-package"; @@ -1836,12 +1942,61 @@ function validateAdversarialCases( return idMap; } +async function validateMandatoryComputerAdversarialCases( + cwd: string, + contractCoverage: JsonObject[], + adversarialCases: Map, + artifactRefs: Map, +): Promise { + const linkedCaseIds = new Set(); + for (const [index, row] of contractCoverage.entries()) { + const ids = optionalStringLinks(row, "adversarialCaseRefs", `executorQa.contractCoverage[${index}]`); + for (const id of ids ?? 
[]) linkedCaseIds.add(normalizeAdversarialCaseId(id)); + } + for (const caseId of MANDATORY_COMPUTER_CASE_IDS) { + const row = adversarialCases.get(caseId); + if (!row) throw new Error(`COMPUTER_REDTEAM_CASE_MISSING: qualityGate executorQa.adversarialCases must include ${caseId}`); + if (optionalStatusField(row, `executorQa.adversarialCases.${caseId}`) === NOT_APPLICABLE_STATUS) { + throw new Error(`COMPUTER_REDTEAM_CASE_NOT_APPLICABLE: mandatory computer adversarial case ${caseId} must not be not_applicable`); + } + if (!linkedCaseIds.has(caseId)) { + throw new Error(`COMPUTER_REDTEAM_CASE_UNLINKED: mandatory computer adversarial case ${caseId} must be linked from contractCoverage.adversarialCaseRefs`); + } + const artifactIds = requireStringLinks(row.artifactRefs, `executorQa.adversarialCases.${caseId}.artifactRefs`); + let hasValidLiveNativeProof = false; + let sawInlineOnly = false; + let sawReceiptOnly = false; + let sawMetadataOnly = false; + for (const artifactId of artifactIds) { + const artifact = artifactRefs.get(artifactId); + if (!artifact) throw new Error(`qualityGate executorQa.adversarialCases.${caseId}.artifactRefs references unknown id ${artifactId}`); + const fieldName = `executorQa.artifactRefs.${artifactId}`; + if (artifact.inlineEvidence !== undefined && !nonEmptyString(artifact.path)) sawInlineOnly = true; + if ((artifact.verifiedReceipt !== undefined || artifact.receipt !== undefined) && !nonEmptyString(artifact.path)) sawReceiptOnly = true; + if (!nonEmptyString(artifact.path) && artifact.inlineEvidence === undefined && artifact.verifiedReceipt === undefined && artifact.receipt === undefined) sawMetadataOnly = true; + try { + await validateArtifactProof(cwd, artifact, fieldName, { surfaceFamily: "native", live: true }); + if (await validateStructuralArtifact(cwd, artifact, fieldName, { surfaceFamily: "native", live: true })) hasValidLiveNativeProof = true; + } catch { + // Preserve the explicit computer red-team error taxonomy below. 
+ } + } + if (!hasValidLiveNativeProof) { + if (sawInlineOnly) throw new Error(`COMPUTER_REDTEAM_INLINE_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`); + if (sawReceiptOnly) throw new Error(`COMPUTER_REDTEAM_RECEIPT_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`); + if (sawMetadataOnly) throw new Error(`COMPUTER_REDTEAM_ARTIFACT_METADATA_ONLY: mandatory computer adversarial case ${caseId} requires durable live structural native proof`); + throw new Error(`COMPUTER_REDTEAM_ARTIFACT_MISSING: mandatory computer adversarial case ${caseId} requires at least one valid live structural native proof artifact`); + } + } +} + + function validateContractCoverage( executorQa: JsonObject, surfaceEvidence: Map, adversarialCases: Map, artifactRefs: Map, -): void { +): JsonObject[] { const rows = requireObjectArray(executorQa.contractCoverage, "executorQa.contractCoverage"); buildRowIdMap(rows, "executorQa.contractCoverage"); let hasSuccessfulContractCoverage = false; @@ -1892,32 +2047,40 @@ function validateContractCoverage( "qualityGate executorQa.contractCoverage must include at least one row with status covered, passed, or verified", ); } + return rows; } async function validateExecutorQaRedTeamEvidenceInternal( cwd: string, executorQa: JsonObject, - _options: { mode?: "checkpoint" | "review" } = {}, + options: { mode?: "checkpoint" | "review"; changeSet?: UltragoalChangeSet } = {}, ): Promise { const artifactRefs = await validateArtifactRefs(cwd, executorQa); const surfaceEvidence = await validateSurfaceEvidence(cwd, executorQa, artifactRefs); const adversarialCases = validateAdversarialCases(executorQa, artifactRefs); - validateContractCoverage(executorQa, surfaceEvidence, adversarialCases, artifactRefs); + const contractCoverage = validateContractCoverage(executorQa, surfaceEvidence, adversarialCases, artifactRefs); + if (requiresComputerRedTeamSuite(executorQa, options.changeSet)) { + 
await validateMandatoryComputerAdversarialCases(cwd, contractCoverage, adversarialCases, artifactRefs); + } } -async function validateExecutorQaRedTeamEvidence(cwd: string, executorQa: JsonObject): Promise { - await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa, { mode: "checkpoint" }); +async function validateExecutorQaRedTeamEvidence( + cwd: string, + executorQa: JsonObject, + options: { changeSet?: UltragoalChangeSet } = {}, +): Promise { + await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa, { mode: "checkpoint", changeSet: options.changeSet }); } export async function validateExecutorQaRedTeamEvidenceForReview( cwd: string, executorQa: Record, - options: { mode?: "review" } = {}, + options: { mode?: "review"; changeSet?: UltragoalChangeSet } = {}, ): Promise { await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa as JsonObject, options); } -async function validateCompletionQualityGate(cwd: string, gate: JsonObject): Promise { +async function validateCompletionQualityGate(cwd: string, gate: JsonObject, options: { changeSet?: UltragoalChangeSet } = {}): Promise { const codeReview = qualityGateObject(gate.codeReview); if (codeReview) { throw new Error( @@ -1962,7 +2125,7 @@ async function validateCompletionQualityGate(cwd: string, gate: JsonObject): Pro } requireNonEmptyString(executorQa.evidence, "executorQa.evidence"); requireEmptyBlockers(executorQa.blockers, "executorQa.blockers"); - await validateExecutorQaRedTeamEvidence(cwd, executorQa); + await validateExecutorQaRedTeamEvidence(cwd, executorQa, { changeSet: options.changeSet }); if (iteration.status !== PASSED_STATUS || iteration.fullRerun !== true) { throw new Error("qualityGate iteration must be passed with fullRerun true"); } @@ -1973,7 +2136,7 @@ async function validateCompletionQualityGate(cwd: string, gate: JsonObject): Pro requireEmptyBlockers(iteration.blockers, "iteration.blockers"); } -async function readRequiredCompletionQualityGate(cwd: string, value: string | 
undefined): Promise { +async function readRequiredCompletionQualityGate(cwd: string, value: string | undefined, options: { changeSet?: UltragoalChangeSet } = {}): Promise { if (!value?.trim()) { throw new Error( "complete checkpoints require --quality-gate-json with architectReview, executorQa, and iteration evidence", @@ -1982,7 +2145,7 @@ async function readRequiredCompletionQualityGate(cwd: string, value: string | un const gate = await readStructuredValue(cwd, value); const gateObject = qualityGateObject(gate); if (!gateObject) throw new Error("qualityGate must be a JSON object"); - await validateCompletionQualityGate(cwd, gateObject); + await validateCompletionQualityGate(cwd, gateObject, { changeSet: options.changeSet }); return gate; } @@ -2085,9 +2248,10 @@ export async function checkpointUltragoalGoal(input: { // instead of silently dropping it. return plan; } + const changeSet = input.status === "complete" ? await computeCheckpointChangeSet(input.cwd) : undefined; const qualityGateJson = input.status === "complete" - ? await readRequiredCompletionQualityGate(input.cwd, input.qualityGateJson) + ? await readRequiredCompletionQualityGate(input.cwd, input.qualityGateJson, { changeSet }) : input.qualityGateJson ? await readStructuredValue(input.cwd, input.qualityGateJson) : undefined; @@ -2686,20 +2850,101 @@ async function resolveGitBase(cwd: string, branch?: string): Promise { } const mergeBase = await spawnText(["git", "merge-base", "HEAD", "origin/main"], { cwd, timeoutMs: 3000 }); if (mergeBase.ok && mergeBase.stdout.trim()) return mergeBase.stdout.trim(); - return "HEAD"; + return "HEAD~1"; +} + +function parseGitNameStatus(output: string): UltragoalChangeSetPath[] { + const rows: UltragoalChangeSetPath[] = []; + for (const line of output.split("\n")) { + const trimmed = line.trim(); + if (!trimmed) continue; + const parts = trimmed.split(/\s+/); + const statusCode = parts[0] ?? 
""; + let status: UltragoalChangeStatus = "unknown"; + if (statusCode.startsWith("A")) status = "added"; + else if (statusCode.startsWith("M")) status = "modified"; + else if (statusCode.startsWith("D")) status = "deleted"; + else if (statusCode.startsWith("R")) status = "renamed"; + else if (statusCode.startsWith("C")) status = "copied"; + const pathValue = status === "renamed" || status === "copied" ? parts[2] : parts[1]; + if (!pathValue) continue; + const oldPath = status === "renamed" || status === "copied" ? parts[1] : undefined; + rows.push({ path: normalizeRepoPath(pathValue), oldPath: oldPath ? normalizeRepoPath(oldPath) : undefined, status, category: categorizeComputerChangePath(pathValue) }); + } + return rows; +} + +function mergeChangeSetPaths(groups: UltragoalChangeSetPath[][]): UltragoalChangeSetPath[] { + const byKey = new Map(); + for (const row of groups.flat()) byKey.set(`${row.oldPath ?? ""}\u0000${row.path}`, row); + return [...byKey.values()]; +} + +async function computeCheckpointChangeSet(cwd: string): Promise { + const inGit = await spawnText(["git", "rev-parse", "--is-inside-work-tree"], { cwd, timeoutMs: 3000 }); + if (!inGit.ok || inGit.stdout.trim() !== "true") return undefined; + if (!(await Bun.file(path.join(cwd, ".git")).exists())) return undefined; + const baseRef = await resolveGitBase(cwd); + const base = baseRef; + const mergeBase = await spawnText(["git", "merge-base", "HEAD", baseRef], { cwd, timeoutMs: 3000 }); + const [committed, unstaged, staged, stat] = await Promise.all([ + spawnText(["git", "diff", "--name-status", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--name-status"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--cached", "--name-status"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--stat", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + ]); + if (!committed.ok && !unstaged.ok && !staged.ok) return undefined; + return { + source: "checkpoint-git", + baseRef, 
+ mergeBase: mergeBase.ok && mergeBase.stdout.trim() ? mergeBase.stdout.trim() : undefined, + headRef: "HEAD", + paths: mergeChangeSetPaths([parseGitNameStatus(committed.stdout), parseGitNameStatus(unstaged.stdout), parseGitNameStatus(staged.stdout)]), + rawDiffStat: stat.stdout, + trusted: true, + }; +} + +function parseUnifiedDiffPaths(diff: string): UltragoalChangeSetPath[] { + const paths: UltragoalChangeSetPath[] = []; + for (const line of diff.split("\n")) { + if (!line.startsWith("diff --git ")) continue; + const match = /^diff --git a\/(.+?) b\/(.+)$/.exec(line); + if (!match) continue; + const oldPath = normalizeRepoPath(match[1]!); + const newPath = normalizeRepoPath(match[2]!); + paths.push({ path: newPath, oldPath: oldPath === newPath ? undefined : oldPath, status: oldPath === newPath ? "modified" : "renamed", category: categorizeComputerChangePath(newPath) }); + } + return paths; +} + +function changeSetFromReviewSource(source: JsonObject): UltragoalChangeSet | undefined { + const kind = nonEmptyString(source.kind); + if (kind === "spec") return { source: "review-spec", paths: [], trusted: true }; + if (kind === "pr" && typeof source.diff === "string") return { source: "review-pr", paths: parseUnifiedDiffPaths(source.diff), rawDiffStat: source.diff, trusted: true }; + const local = qualityGateObject(source.local); + if (kind === "pr" && local) return changeSetFromReviewSource(local); + if (kind === "worktree") return { source: "review-worktree", paths: parseGitNameStatus(String(source.nameStatus ?? source.status ?? "")), rawDiffStat: String(source.diffStat ?? ""), trusted: true }; + if (kind === "branch" || kind === "pr-fallback") return { source: "review-branch", baseRef: nonEmptyString(source.base) ?? undefined, headRef: "HEAD", paths: parseGitNameStatus(String(source.nameStatus ?? "")), rawDiffStat: String(source.diffStat ?? 
""), trusted: true }; + return undefined; } async function localDiffSource(cwd: string, sourceKind: string, branch?: string): Promise { if (sourceKind === "worktree") { - const [status, diff] = await Promise.all([ + const [status, diff, unstaged, staged] = await Promise.all([ spawnText(["git", "status", "--short"], { cwd, timeoutMs: 5000 }), spawnText(["git", "diff", "--stat"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--name-status"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--cached", "--name-status"], { cwd, timeoutMs: 5000 }), ]); - return { kind: "worktree", status: status.stdout, diffStat: diff.stdout }; + return { kind: "worktree", status: status.stdout, diffStat: diff.stdout, nameStatus: `${unstaged.stdout}\n${staged.stdout}` }; } const base = await resolveGitBase(cwd, branch); - const diff = await spawnText(["git", "diff", "--stat", `${base}...HEAD`], { cwd, timeoutMs: 5000 }); - return { kind: sourceKind, base, branch, diffStat: diff.stdout }; + const [diff, nameStatus] = await Promise.all([ + spawnText(["git", "diff", "--stat", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--name-status", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + ]); + return { kind: sourceKind, base, branch, diffStat: diff.stdout, nameStatus: nameStatus.stdout }; } async function resolveReviewSource( @@ -2813,13 +3058,14 @@ export async function runUltragoalReview(cwd: string, args: readonly string[]): const mode = parseReviewMode(flagValue(args, "--mode")); const specPath = flagValue(args, "--spec"); const { contractStrength, source } = await resolveReviewSource(cwd, args, specPath); + const changeSet = changeSetFromReviewSource(source); const executorQa = await readOptionalExecutorQa( cwd, flagValue(args, "--executor-qa-json") ?? 
flagValue(args, "--executor-qa"), ); const findings: UltragoalReviewFinding[] = []; try { - await validateExecutorQaRedTeamEvidenceForReview(cwd, executorQa, { mode: "review" }); + await validateExecutorQaRedTeamEvidenceForReview(cwd, executorQa, { mode: "review", changeSet }); } catch (error) { findings.push(findingFromError(error)); } diff --git a/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts b/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts new file mode 100644 index 000000000..bf7902ff5 --- /dev/null +++ b/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts @@ -0,0 +1,262 @@ +import { afterEach, describe, expect, it } from "bun:test"; +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; +import { deflateSync } from "node:zlib"; + +import { createUltragoalPlan, runNativeUltragoalCommand, startNextUltragoalGoal } from "@gajae-code/coding-agent/gjc-runtime/ultragoal-runtime"; + +const tempRoots: string[] = []; + +async function tempDir(): Promise { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "gjc-computer-red-team-")); + tempRoots.push(root); + return root; +} + +afterEach(async () => { + await Promise.all(tempRoots.splice(0).map(dir => fs.rm(dir, { recursive: true, force: true }))); +}); + +async function runGit(cwd: string, args: string[]): Promise { + const proc = Bun.spawn(["git", ...args], { cwd, stdout: "pipe", stderr: "pipe" }); + const [stdout, stderr, exitCode] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text(), proc.exited]); + if (exitCode !== 0) throw new Error(`git ${args.join(" ")} failed: ${stdout}${stderr}`); +} + +async function initRepo(root: string): Promise { + await runGit(root, ["init"]); + await runGit(root, ["config", "user.email", "test@example.com"]); + await runGit(root, ["config", "user.name", "Test User"]); + await fs.writeFile(path.join(root, "README.md"), 
"base\n"); + await runGit(root, ["add", "README.md"]); + await runGit(root, ["commit", "-m", "base"]); + await runGit(root, ["branch", "-M", "main"]); +} + +const PNG_SIGNATURE = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); +const PNG_CRC_TABLE = new Uint32Array(256).map((_, index) => { + let crc = index; + for (let bit = 0; bit < 8; bit++) crc = crc & 1 ? 0xedb88320 ^ (crc >>> 1) : crc >>> 1; + return crc >>> 0; +}); + +function pngCrc32(bytes: Buffer): number { + let crc = 0xffffffff; + for (const byte of bytes) crc = PNG_CRC_TABLE[(crc ^ byte) & 0xff]! ^ (crc >>> 8); + return (crc ^ 0xffffffff) >>> 0; +} + +function pngChunk(type: string, data = Buffer.alloc(0)): Buffer { + const typeBytes = Buffer.from(type, "ascii"); + const length = Buffer.alloc(4); + length.writeUInt32BE(data.length, 0); + const crc = Buffer.alloc(4); + crc.writeUInt32BE(pngCrc32(Buffer.concat([typeBytes, data])), 0); + return Buffer.concat([length, typeBytes, data, crc]); +} + +function syntheticPng(): Buffer { + const width = 320; + const height = 180; + const ihdr = Buffer.alloc(13); + ihdr.writeUInt32BE(width, 0); + ihdr.writeUInt32BE(height, 4); + ihdr[8] = 8; + ihdr[9] = 2; + const raw = Buffer.alloc((width * 3 + 1) * height); + for (let y = 0; y < height; y++) { + const row = y * (width * 3 + 1); + raw[row] = 0; + for (let x = 0; x < width; x++) { + const offset = row + 1 + x * 3; + raw[offset] = x % 256; + raw[offset + 1] = y % 256; + raw[offset + 2] = (x + y) % 256; + } + } + return Buffer.concat([PNG_SIGNATURE, pngChunk("IHDR", ihdr), pngChunk("IDAT", deflateSync(raw)), pngChunk("IEND")]); +} + +let activeObjective = ""; +async function seedPlan(root: string): Promise { + const created = await createUltragoalPlan({ cwd: root, brief: "@goal computer gate fixture", gjcObjective: "fixture" }); + await runGit(root, ["add", ".gjc/ultragoal/goals.json", ".gjc/ultragoal/ledger.jsonl"]); + await runGit(root, ["commit", "-m", "plan"]); + activeObjective = 
created.gjcObjective; + await startNextUltragoalGoal({ cwd: root }); +} + +function goalSnapshot(): string { + return JSON.stringify({ goal: { threadId: "test-thread", objective: activeObjective, status: "active", createdAt: Date.now(), updatedAt: Date.now() } }); +} + +function artifact(kind = "native screenshot"): Record { + return { id: "surface-proof", kind, description: "live structural native proof", path: "artifacts/native.png" }; +} + +const CASES = [ + "kill-switch-bypass", + "suspended-enforcement", + "permission-revoked", + "display-stale", + "out-of-bounds-drift", + "runaway-loop-halt", + "blast-radius", +]; + +function executorQa(overrides: { cases?: Record[]; artifacts?: Record[]; computerTouching?: boolean; surface?: string } = {}): Record { + const cases = overrides.cases ?? CASES.map(id => ({ + id, + status: "passed", + contractRef: "computer-safety", + scenario: `${id} adversarial scenario exercises the computer safety boundary`, + expectedBehavior: "fail closed before unsafe desktop input can continue", + verdict: "passed", + artifactRefs: ["case-proof"], + })); + return { + status: "passed", + e2eStatus: "passed", + redTeamStatus: "passed", + evidence: "executor QA covered the requested contract with durable proof artifacts", + e2eCommands: ["bun test fixture"], + redTeamCommands: ["bun test fixture"], + changedPaths: overrides.computerTouching === true ? ["crates/pi-natives/src/computer/executor.rs"] : undefined, + computerTouching: overrides.computerTouching, + artifactRefs: overrides.artifacts ?? [artifact("native screenshot"), { ...artifact("native screenshot"), id: "case-proof" }], + surfaceEvidence: [{ + id: "surface-native", + contractRef: "computer-safety", + surface: overrides.surface ?? 
"native", + status: "passed", + invocation: "native fixture invocation", + verdict: "passed", + artifactRefs: ["surface-proof"], + }], + adversarialCases: cases, + contractCoverage: [{ + id: "coverage", + contractRef: "computer-safety", + status: "covered", + obligation: "all mandatory computer red-team cases are covered", + surfaceEvidenceRefs: ["surface-native"], + adversarialCaseRefs: cases.map(row => String(row.id)), + }], + blockers: [], + }; +} + +function qualityGate(qa: Record): string { + return JSON.stringify({ + architectReview: { + architectureStatus: "CLEAR", + productStatus: "CLEAR", + codeStatus: "CLEAR", + recommendation: "APPROVE", + commands: ["review"], + evidence: "architect review passed with no blockers", + blockers: [], + }, + executorQa: qa, + iteration: { + status: "passed", + fullRerun: true, + rerunCommands: ["bun test fixture"], + evidence: "targeted fixture rerun passed", + blockers: [], + }, + }); +} + +async function writeQaArtifacts(root: string): Promise { + await fs.mkdir(path.join(root, "artifacts"), { recursive: true }); + await fs.writeFile(path.join(root, "artifacts/native.png"), syntheticPng()); +} + +async function checkpoint(root: string, qa: Record): Promise { + const result = await runNativeUltragoalCommand([ + "checkpoint", + "--goal-id", "G001", + "--status", "complete", + "--evidence", "fixture complete", + "--gjc-goal-json", goalSnapshot(), + "--quality-gate-json", qualityGate(qa), + ], root); + return result.stderr + result.stdout; +} + +async function seedComputerChange(root: string, file = "crates/pi-natives/src/computer/executor.rs"): Promise { + await fs.mkdir(path.dirname(path.join(root, file)), { recursive: true }); + await fs.writeFile(path.join(root, file), "// computer change\n"); + await runGit(root, ["add", file]); +} + +describe("computer red-team fixture matrix", () => { + it("preserves non-computer validation when unchanged", async () => { + const root = await tempDir(); + await initRepo(root); + await 
seedPlan(root); + await writeQaArtifacts(root); + expect(await checkpoint(root, executorQa())).toContain("Checkpointed G001 as complete"); + }); + + it("fails computer code change missing a mandatory case", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root); + const message = await checkpoint(root, executorQa({ computerTouching: true, cases: (executorQa().adversarialCases as Record[]).filter(row => row.id !== "blast-radius") })).catch(error => String(error)); + expect(message).toContain("COMPUTER_REDTEAM_CASE_MISSING"); + }); + + it("fails not_applicable on a mandatory case", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root); + const cases = CASES.map(id => ({ id, status: id === "blast-radius" ? "not_applicable" : "passed", contractRef: "computer-safety", scenario: "scenario text", expectedBehavior: "expected behavior", verdict: "passed", artifactRefs: ["case-proof"] })); + const message = await checkpoint(root, executorQa({ cases })).catch(error => String(error)); + expect(message).toContain("not_applicable"); + }); + + it("fails mandatory case with inline-only metadata artifact", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await seedComputerChange(root); + await writeQaArtifacts(root); + const message = await checkpoint(root, executorQa({ computerTouching: true, artifacts: [artifact("native screenshot"), { id: "case-proof", kind: "native metadata", description: "inline only", inlineEvidence: "inline proof is not durable live structural evidence" }] })).catch(error => String(error)); + expect(message).toContain("COMPUTER_REDTEAM_INLINE_ONLY"); + }); + + it("passes full valid computer gate", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await 
writeQaArtifacts(root); + await seedComputerChange(root); + expect(await checkpoint(root, executorQa({ computerTouching: true }))).toContain("Checkpointed G001 as complete"); + }); + + it("does not trigger from declaration-only without trusted computer change", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + const qa = executorQa({ computerTouching: false, surface: "native" }); + expect(await checkpoint(root, qa)).toContain("Checkpointed G001 as complete"); + }); + + it("allows non-operational docs-only computer tiering", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root, "docs/computer-use/README.md"); + const qa = executorQa({ computerTouching: false, surface: "native" }); + expect(await checkpoint(root, qa)).toContain("Checkpointed G001 as complete"); + }); +}); diff --git a/packages/natives/native/index.d.ts b/packages/natives/native/index.d.ts index cfce95e09..568c23611 100644 --- a/packages/natives/native/index.d.ts +++ b/packages/natives/native/index.d.ts @@ -1,5 +1,18 @@ /* auto-generated by NAPI-RS */ /* eslint-disable */ +export declare class ComputerController { + constructor() + screenshot(): ComputerScreenshot + click(expectedEpoch: number | undefined | null, x: number, y: number, button?: string | undefined | null): void + doubleClick(expectedEpoch: number | undefined | null, x: number, y: number, button?: string | undefined | null): void + move(expectedEpoch: number | undefined | null, x: number, y: number): void + drag(expectedEpoch: number | undefined | null, x: number, y: number, toX: number, toY: number, button?: string | undefined | null): void + scroll(expectedEpoch: number | undefined | null, x: number, y: number, scrollX: number, scrollY: number): void + type(expectedEpoch: number | undefined | null, text: string): void + keypress(expectedEpoch: number | 
undefined | null, keys: Array): void + wait(expectedEpoch: number | undefined | null, ms: number): void +} + /** * Long-lived macOS appearance observer. * @@ -391,6 +404,10 @@ export interface ComputerScreenshot { originX: number /** Logical origin Y of the display (points). */ originY: number + /** Stable hash of the display geometry used for stale-display checks. */ + displayEpoch: number + /** Process-local opaque capture id. */ + captureId: number } /** A context line (before or after a match). */ diff --git a/packages/natives/native/index.js b/packages/natives/native/index.js index 0e744d4e6..30add87d9 100644 --- a/packages/natives/native/index.js +++ b/packages/natives/native/index.js @@ -17,6 +17,7 @@ const nativeBindings = loadNative(); nativeBindings.initNativeCrashDiagnostics?.(); // --- generated native exports (do not edit) --- // classes +export const ComputerController = nativeBindings.ComputerController; export const MacAppearanceObserver = nativeBindings.MacAppearanceObserver; export const MacOSPowerAssertion = nativeBindings.MacOSPowerAssertion; export const Process = nativeBindings.Process; diff --git a/packages/natives/test/computer.test.ts b/packages/natives/test/computer.test.ts index d8e4acda8..4f83a3681 100644 --- a/packages/natives/test/computer.test.ts +++ b/packages/natives/test/computer.test.ts @@ -1,8 +1,28 @@ import { describe, expect, it } from "bun:test"; -import { computerScreenshot } from "../native/index.js"; +import { ComputerController, computerScreenshot } from "../native/index.js"; const isMacOS = process.platform === "darwin"; +describe.if(isMacOS)("ComputerController napi binding", () => { + it("exists with expected methods", () => { + const controller = new ComputerController(); + expect(controller).toBeInstanceOf(ComputerController); + for (const method of [ + "screenshot", + "click", + "doubleClick", + "move", + "drag", + "scroll", + "type", + "keypress", + "wait", + ]) { + expect(typeof controller[method as keyof 
ComputerController]).toBe("function"); + } + }); +}); + // The native `computerScreenshot` binding is macOS-only and captures the real // primary display, so it requires the Screen Recording permission. Gate on // platform and skip gracefully when capture is unavailable in the environment. @@ -22,6 +42,8 @@ describe.if(isMacOS)("computer screenshot napi binding", () => { expect(shot.scaleX).toBeGreaterThan(0); expect(shot.scaleY).toBeGreaterThan(0); expect(shot.png.byteLength).toBeGreaterThan(0); + expect(shot.displayEpoch).toBeGreaterThan(0); + expect(shot.captureId).toBeGreaterThan(0); // PNG magic number: 89 50 4E 47 0D 0A 1A 0A. const sig = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]; From 0e562c02ff77cfb3227b2a953418db253668331b Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 16:22:04 +0000 Subject: [PATCH 15/23] fix(pi-natives): collapse display epoch guard --- crates/pi-natives/src/computer/executor.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/crates/pi-natives/src/computer/executor.rs b/crates/pi-natives/src/computer/executor.rs index 87a0131dc..87b0e7ede 100644 --- a/crates/pi-natives/src/computer/executor.rs +++ b/crates/pi-natives/src/computer/executor.rs @@ -166,12 +166,11 @@ fn gate( if !perms.accessibility_granted() { return Err(ExecError::PermissionRequired); } - if action.is_coordinate() { - if let Some(expected) = expected_epoch { - if display_ctx.current_epoch() != expected { - return Err(ExecError::DisplayStale); - } - } + if action.is_coordinate() + && let Some(expected) = expected_epoch + && display_ctx.current_epoch() != expected + { + return Err(ExecError::DisplayStale); } Ok(()) } From 6aa946a42b34bd713df4f95283c7999d010453d5 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 01:31:15 +0900 Subject: [PATCH 16/23] feat(computer-use): first-class TS computer tool + metadata-only catalog (G003) Adds packages/coding-agent/src/tools/computer.ts: an AgentTool with the exact 
OpenAI 9-action snake_case zod schema routing through the @gajae-code/natives ComputerController, AbortSignal/timeout propagation, and COMPUTER_* error mapping. Off-by-default + fail-closed: callable only on macOS when computer.enabled||computer.alwaysOn; disabled returns COMPUTER_DISABLED without constructing the controller or starting native resources. First-class WITHOUT unsafe default exposure via a separate BUILTIN_CAPABILITY_CATALOG (metadata-only) distinct from the callable BUILTIN_TOOLS, so a disabled computer is documented/listable but not in the session registry and not auto-activatable by search_tool_bm25. Adds prompt, bounded renderer, settings-schema entries, and docs/tools/computer.md. 9 tool tests pass (exact schema, camelCase rejection, gating, disabled->COMPUTER_DISABLED, dispatch mapping); biome clean. --- docs/tools/computer.md | 71 ++++ .../src/config/settings-schema.ts | 60 +++ .../src/prompts/tools/computer.md | 32 ++ packages/coding-agent/src/tools/computer.ts | 384 ++++++++++++++++++ .../coding-agent/src/tools/computer/render.ts | 68 ++++ packages/coding-agent/src/tools/index.ts | 24 ++ packages/coding-agent/src/tools/renderers.ts | 2 + .../coding-agent/src/tools/tool-timeouts.ts | 1 + .../coding-agent/test/tools/computer.test.ts | 178 ++++++++ 9 files changed, 820 insertions(+) create mode 100644 docs/tools/computer.md create mode 100644 packages/coding-agent/src/prompts/tools/computer.md create mode 100644 packages/coding-agent/src/tools/computer.ts create mode 100644 packages/coding-agent/src/tools/computer/render.ts create mode 100644 packages/coding-agent/test/tools/computer.test.ts diff --git a/docs/tools/computer.md b/docs/tools/computer.md new file mode 100644 index 000000000..f25b5307c --- /dev/null +++ b/docs/tools/computer.md @@ -0,0 +1,71 @@ +# computer + +> Explicitly enabled macOS desktop screenshot and input control through the native supervisor-gated computer controller. 
+ +## Source + +- Entry: `packages/coding-agent/src/tools/computer.ts` +- Model-facing prompt: `packages/coding-agent/src/prompts/tools/computer.md` +- Renderer: `packages/coding-agent/src/tools/computer/render.ts` +- Native controller: `@gajae-code/natives` `ComputerController` + +## Availability + +`computer` is first-class in the product catalog and documentation, but it is not a callable tool by default. + +Callable activation requires all of: + +1. macOS (`process.platform === "darwin"`), and +2. `computer.enabled` or `computer.alwaysOn` set to `true`. + +When disabled, every action including `screenshot` returns `COMPUTER_DISABLED`. Disabled catalog/listing paths do not construct `ComputerController`, start hotkeys, probe Screen Recording, probe Accessibility, capture screenshots, or expose the callable schema to `search_tool_bm25`. + +## Inputs + +The model action object uses an exact snake_case discriminated schema. CamelCase fields are rejected. + +### Shared fields + +| Field | Type | Required | Description | +| --- | --- | --- | --- | +| `action` | see actions below | Yes | Dispatch action. | +| `timeout` | `number` | No | Maximum action time in seconds. | +| `include_screenshot` | `boolean` | No | Request a bounded post-action screenshot when supported. | + +### Actions + +| Action | Required fields | Optional fields | +| --- | --- | --- | +| `screenshot` | none | shared | +| `click` | `x`, `y` | `button`, shared | +| `double_click` | `x`, `y` | `button`, shared | +| `move` | `x`, `y` | `button`, shared | +| `drag` | `x`, `y`, `to_x`, `to_y` | `button`, shared | +| `scroll` | `x`, `y`, `scroll_x`, `scroll_y` | shared | +| `type` | `text` | shared | +| `keypress` | `keys` | shared | +| `wait` | `ms` | shared | + +`button` is one of `left`, `right`, or `middle`. + +## Coordinate contract + +`x`, `y`, `to_x`, and `to_y` are screenshot pixels in the latest screenshot coordinate frame. They are not CSS pixels and not normalized fractions. 
The screenshot result records dimensions, scale, origin, display epoch, and capture id when supplied by native code. Coordinate actions must not clamp invalid coordinates; native code returns `COMPUTER_COORD_INVALID` or `COMPUTER_DISPLAY_STALE` before input when the coordinate/display contract cannot be satisfied. + +## Errors + +Stable computer error codes include: + +- `COMPUTER_DISABLED` +- `COMPUTER_SUSPENDED` +- `COMPUTER_SUPERVISOR_NOT_LIVE` +- `COMPUTER_PERMISSION_REQUIRED` +- `COMPUTER_DISPLAY_STALE` +- `COMPUTER_COORD_INVALID` +- `COMPUTER_CANCELLED` + +TS handles settings/platform exposure and UX mapping. Native `execute_action` remains the side-effect authority for supervisor state, permissions, display freshness, coordinate validation, cancellation, and release-all behavior. + +## Rendering + +The TUI renderer is bounded: it shows action, coordinates, scroll/key/wait summary, screenshot dimensions/byte count/capture id, supervisor status, and error code. It never renders raw screenshot base64. diff --git a/packages/coding-agent/src/config/settings-schema.ts b/packages/coding-agent/src/config/settings-schema.ts index 95f2726de..3a703320f 100644 --- a/packages/coding-agent/src/config/settings-schema.ts +++ b/packages/coding-agent/src/config/settings-schema.ts @@ -2111,6 +2111,66 @@ export const SETTINGS_SCHEMA = { }, }, + "computer.enabled": { + type: "boolean", + default: false, + ui: { + tab: "tools", + label: "Computer", + description: "Enable the macOS computer tool for this session. 
Off by default.", + }, + }, + + "computer.alwaysOn": { + type: "boolean", + default: false, + ui: { + tab: "tools", + label: "Computer Always On", + description: "Keep the macOS computer tool callable without per-session enablement.", + }, + }, + + "computer.autoScreenshot": { + type: "boolean", + default: false, + ui: { + tab: "tools", + label: "Computer Auto Screenshot", + description: "Automatically request bounded screenshots after computer actions when supported.", + }, + }, + + "computer.screenshotMaxBytes": { + type: "number", + default: 5_000_000, + ui: { + tab: "tools", + label: "Computer Screenshot Max Bytes", + description: "Maximum screenshot payload size for computer action results.", + }, + }, + + "computer.killSwitchHotkey": { + type: "string", + default: "Control+Option+Command+Escape", + ui: { + tab: "tools", + label: "Computer Kill Switch Hotkey", + description: "Native stop/suspend hotkey shown to users for computer-use sessions.", + }, + }, + + "computer.auditLog.enabled": { + type: "boolean", + default: true, + ui: { + tab: "tools", + label: "Computer Audit Log", + description: "Persist audit records for enabled computer-use actions.", + }, + }, + // Tool execution "tools.intentTracing": { type: "boolean", diff --git a/packages/coding-agent/src/prompts/tools/computer.md b/packages/coding-agent/src/prompts/tools/computer.md new file mode 100644 index 000000000..f8c3dbc4f --- /dev/null +++ b/packages/coding-agent/src/prompts/tools/computer.md @@ -0,0 +1,32 @@ +# computer + +Use `computer` only when the session has explicitly enabled macOS computer-use. It controls the real desktop and is off by default. + +## Safety contract + +- Disabled means disabled: when `computer.enabled` and `computer.alwaysOn` are both false, every action including `screenshot` fails with `COMPUTER_DISABLED` and captures nothing. +- The tool is macOS-only in v1. +- Native execution remains supervisor-gated. 
If the stop/suspend supervisor is unavailable, stale, or suspended, if a required permission is missing, if the display is stale, or if the action is cancelled, the action fails closed with a `COMPUTER_*` code. +- Respect the user's stop/suspend request immediately. Do not keep issuing desktop actions after a stop, a suspend, or an error. + +## Coordinate contract + +Coordinates are screenshot pixels, not CSS pixels and not normalized fractions. Use the latest successful `screenshot` dimensions and origin/scale metadata as the coordinate frame. Do not guess coordinates outside the screenshot bounds. + +## Actions + +The model action object uses exactly these snake_case actions and fields: + +- `screenshot` — capture the enabled desktop. +- `click` — `x`, `y`, optional `button` (`left`, `right`, `middle`). +- `double_click` — `x`, `y`, optional `button`. +- `move` — `x`, `y`, optional `button`. +- `drag` — `x`, `y`, `to_x`, `to_y`, optional `button`. +- `scroll` — `x`, `y`, `scroll_x`, `scroll_y`. +- `type` — `text`. +- `keypress` — `keys` string array. +- `wait` — `ms`. + +Shared optional fields: `timeout` seconds and `include_screenshot` for a bounded post-action screenshot when supported. + +Do not use camelCase names such as `doubleClick`, `toX`, `scrollX`, or `includeScreenshot` in the model action object. 
diff --git a/packages/coding-agent/src/tools/computer.ts b/packages/coding-agent/src/tools/computer.ts new file mode 100644 index 000000000..ca1fa2f79 --- /dev/null +++ b/packages/coding-agent/src/tools/computer.ts @@ -0,0 +1,384 @@ +import type { AgentTool, AgentToolContext, AgentToolResult, AgentToolUpdateCallback } from "@gajae-code/agent-core"; +import { prompt } from "@gajae-code/utils"; +import * as z from "zod/v4"; +import computerDescription from "../prompts/tools/computer.md" with { type: "text" }; +import type { ToolSession } from "./index"; +import { ToolAbortError, ToolError, throwIfAborted } from "./tool-errors"; +import { toolResult } from "./tool-result"; +import { clampTimeout } from "./tool-timeouts"; + +const buttonSchema = z.enum(["left", "right", "middle"]); +const shared = { + timeout: z.number().positive().optional().describe("Maximum time in seconds for this action."), + include_screenshot: z.boolean().optional().describe("Capture a bounded post-action screenshot when supported."), +}; + +const screenshotSchema = z.object({ action: z.literal("screenshot"), ...shared }).strict(); +const clickSchema = z + .object({ action: z.literal("click"), x: z.number(), y: z.number(), button: buttonSchema.optional(), ...shared }) + .strict(); +const doubleClickSchema = z + .object({ + action: z.literal("double_click"), + x: z.number(), + y: z.number(), + button: buttonSchema.optional(), + ...shared, + }) + .strict(); +const moveSchema = z + .object({ action: z.literal("move"), x: z.number(), y: z.number(), button: buttonSchema.optional(), ...shared }) + .strict(); +const dragSchema = z + .object({ + action: z.literal("drag"), + x: z.number(), + y: z.number(), + to_x: z.number(), + to_y: z.number(), + button: buttonSchema.optional(), + ...shared, + }) + .strict(); +const scrollSchema = z + .object({ + action: z.literal("scroll"), + x: z.number(), + y: z.number(), + scroll_x: z.number(), + scroll_y: z.number(), + ...shared, + }) + .strict(); +const typeSchema 
= z.object({ action: z.literal("type"), text: z.string(), ...shared }).strict(); +const keypressSchema = z + .object({ action: z.literal("keypress"), keys: z.array(z.string()).min(1), ...shared }) + .strict(); +const waitSchema = z.object({ action: z.literal("wait"), ms: z.number().int().nonnegative(), ...shared }).strict(); + +export const computerSchema = z.discriminatedUnion("action", [ + screenshotSchema, + clickSchema, + doubleClickSchema, + moveSchema, + dragSchema, + scrollSchema, + typeSchema, + keypressSchema, + waitSchema, +]); + +export type ComputerParams = z.infer; +export type ComputerActionName = ComputerParams["action"]; + +export interface ComputerScreenshotDetails { + widthPx: number; + heightPx: number; + scaleX?: number; + scaleY?: number; + originX?: number; + originY?: number; + displayEpoch?: string; + captureId?: string; + pngBytes?: number; +} + +export interface ComputerToolDetails { + action: ComputerActionName; + status: "success" | "disabled" | "error"; + code?: string; + message?: string; + x?: number; + y?: number; + toX?: number; + toY?: number; + scrollX?: number; + scrollY?: number; + button?: string; + keys?: string[]; + ms?: number; + screenshot?: ComputerScreenshotDetails; + supervisor?: string; +} + +type NativeController = { + screenshot?: (payload?: unknown, options?: { signal?: AbortSignal }) => Promise | NativeScreenshot; + click?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + doubleClick?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + move?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + drag?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + scroll?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + type?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + keypress?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | 
unknown; + wait?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; +}; + +type NativeScreenshot = { + png?: Uint8Array | Buffer | ArrayBuffer | string; + widthPx?: number; + heightPx?: number; + scaleX?: number; + scaleY?: number; + originX?: number; + originY?: number; + displayEpoch?: string; + captureId?: string; +}; + +export type ComputerControllerFactory = () => NativeController; + +export const COMPUTER_DISABLED_CODE = "COMPUTER_DISABLED"; + +const NATIVE_ERROR_CODES = new Set([ + "COMPUTER_SUSPENDED", + "COMPUTER_SUPERVISOR_NOT_LIVE", + "COMPUTER_PERMISSION_REQUIRED", + "COMPUTER_DISPLAY_STALE", + "COMPUTER_COORD_INVALID", + "COMPUTER_CANCELLED", +]); + +function createNativeComputerController(): NativeController { + const natives = require("@gajae-code/natives") as { ComputerController?: new () => NativeController }; + if (!natives.ComputerController) { + throw new ToolError("ComputerController is unavailable in @gajae-code/natives.", { + code: "COMPUTER_UNAVAILABLE", + }); + } + return new natives.ComputerController(); +} + +let controllerFactory: ComputerControllerFactory = createNativeComputerController; + +export function setComputerControllerFactoryForTests(factory: ComputerControllerFactory | undefined): void { + controllerFactory = factory ?? 
createNativeComputerController; +} + +export function isComputerSupportedPlatform(platform: NodeJS.Platform = process.platform): boolean { + return platform === "darwin"; +} + +export function isComputerEnabled(session: Pick): boolean { + return Boolean(session.settings.get("computer.enabled") || session.settings.get("computer.alwaysOn")); +} + +export function isComputerCallable( + session: Pick, + platform: NodeJS.Platform = process.platform, +): boolean { + return isComputerSupportedPlatform(platform) && isComputerEnabled(session); +} + +export class ComputerTool implements AgentTool { + readonly name = "computer"; + readonly label = "Computer"; + readonly loadMode = "discoverable"; + readonly summary = + "Control the explicitly enabled macOS desktop with screenshot, pointer, keyboard, scroll, and wait actions"; + readonly parameters = computerSchema; + readonly strict = true; + #description?: string; + + constructor(private readonly session: ToolSession) {} + + static createIf(session: ToolSession): ComputerTool | null { + return isComputerCallable(session) ? new ComputerTool(session) : null; + } + + get description(): string { + this.#description ??= prompt.render(computerDescription, {}); + return this.#description; + } + + async execute( + _toolCallId: string, + params: ComputerParams, + signal?: AbortSignal, + _onUpdate?: AgentToolUpdateCallback, + _ctx?: AgentToolContext, + ): Promise> { + const details = detailsFromParams(params); + if (!isComputerCallable(this.session)) { + details.status = "disabled"; + details.code = COMPUTER_DISABLED_CODE; + details.message = + "The computer tool is disabled. Enable computer.enabled or computer.alwaysOn on macOS to use it."; + return { ...toolResult(details).text(`${COMPUTER_DISABLED_CODE}: ${details.message}`).done(), isError: true }; + } + + try { + throwIfAborted(signal); + const timeoutSeconds = clampTimeout("computer", params.timeout); + const timeoutSignal = timeoutSeconds > 0 ? 
AbortSignal.timeout(timeoutSeconds * 1000) : undefined; + const combinedSignal = + signal && timeoutSignal ? AbortSignal.any([signal, timeoutSignal]) : (signal ?? timeoutSignal); + const result = await dispatchComputerAction(controllerFactory(), params, combinedSignal); + const screenshot = normalizeScreenshot(result); + if (screenshot) details.screenshot = screenshot; + details.status = "success"; + details.message = describeComputerSuccess(details); + return toolResult(details).text(details.message).done(); + } catch (error) { + if (error instanceof ToolAbortError) throw error; + const mapped = mapComputerError(error); + details.status = mapped.code === COMPUTER_DISABLED_CODE ? "disabled" : "error"; + details.code = mapped.code; + details.message = mapped.message; + return { ...toolResult(details).text(`${mapped.code}: ${mapped.message}`).done(), isError: true }; + } + } +} + +async function dispatchComputerAction( + controller: NativeController, + params: ComputerParams, + signal?: AbortSignal, +): Promise { + const options = { signal }; + switch (params.action) { + case "screenshot": + return controller.screenshot?.( + { timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + case "click": + return controller.click?.( + { + x: params.x, + y: params.y, + button: params.button ?? "left", + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "double_click": + return controller.doubleClick?.( + { + x: params.x, + y: params.y, + button: params.button ?? 
"left", + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "move": + return controller.move?.( + { + x: params.x, + y: params.y, + button: params.button, + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "drag": + return controller.drag?.( + { + x: params.x, + y: params.y, + toX: params.to_x, + toY: params.to_y, + button: params.button ?? "left", + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "scroll": + return controller.scroll?.( + { + x: params.x, + y: params.y, + scrollX: params.scroll_x, + scrollY: params.scroll_y, + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "type": + return controller.type?.( + { text: params.text, timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + case "keypress": + return controller.keypress?.( + { keys: params.keys, timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + case "wait": + return controller.wait?.( + { ms: params.ms, timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + } +} + +function detailsFromParams(params: ComputerParams): ComputerToolDetails { + const details: ComputerToolDetails = { action: params.action, status: "success" }; + if ("x" in params) details.x = params.x; + if ("y" in params) details.y = params.y; + if ("to_x" in params) details.toX = params.to_x; + if ("to_y" in params) details.toY = params.to_y; + if ("scroll_x" in params) details.scrollX = params.scroll_x; + if ("scroll_y" in params) details.scrollY = params.scroll_y; + if ("button" in params) details.button = params.button; + if ("keys" in params) details.keys = params.keys; + if ("ms" in params) details.ms = params.ms; + 
return details; +} + +function secondsToMs(seconds: number | undefined): number | undefined { + return typeof seconds === "number" ? seconds * 1000 : undefined; +} + +function normalizeScreenshot(value: unknown): ComputerScreenshotDetails | undefined { + const candidate = + value && typeof value === "object" && "screenshot" in value + ? (value as { screenshot?: unknown }).screenshot + : value; + if (!candidate || typeof candidate !== "object") return undefined; + const shot = candidate as NativeScreenshot; + if (typeof shot.widthPx !== "number" || typeof shot.heightPx !== "number") return undefined; + return { + widthPx: shot.widthPx, + heightPx: shot.heightPx, + scaleX: shot.scaleX, + scaleY: shot.scaleY, + originX: shot.originX, + originY: shot.originY, + displayEpoch: shot.displayEpoch, + captureId: shot.captureId, + pngBytes: getPngByteLength(shot.png), + }; +} + +function getPngByteLength(png: NativeScreenshot["png"]): number | undefined { + if (png === undefined) return undefined; + if (typeof png === "string") return Buffer.byteLength(png, "base64"); + if (png instanceof ArrayBuffer) return png.byteLength; + return png.byteLength; +} + +function mapComputerError(error: unknown): { code: string; message: string } { + if (error instanceof Error && error.name === "AbortError") { + return { code: "COMPUTER_CANCELLED", message: "Computer action was cancelled." }; + } + const maybe = error as { code?: unknown; message?: unknown }; + const rawCode = typeof maybe?.code === "string" ? maybe.code : undefined; + const code = + rawCode && (NATIVE_ERROR_CODES.has(rawCode) || rawCode.startsWith("COMPUTER_")) ? rawCode : "COMPUTER_ERROR"; + const message = + typeof maybe?.message === "string" && maybe.message.length > 0 ? 
maybe.message : "Computer action failed."; + return { code, message }; +} + +function describeComputerSuccess(details: ComputerToolDetails): string { + if (details.screenshot) { + return `Computer ${details.action} completed (${details.screenshot.widthPx}x${details.screenshot.heightPx}).`; + } + return `Computer ${details.action} completed.`; +} diff --git a/packages/coding-agent/src/tools/computer/render.ts b/packages/coding-agent/src/tools/computer/render.ts new file mode 100644 index 000000000..50b4e8c2d --- /dev/null +++ b/packages/coding-agent/src/tools/computer/render.ts @@ -0,0 +1,68 @@ +import type { Component } from "@gajae-code/tui"; +import { Text } from "@gajae-code/tui"; +import type { RenderResultOptions } from "../../extensibility/custom-tools/types"; +import type { Theme } from "../../modes/theme/theme"; +import type { ComputerToolDetails } from "../computer"; +import { formatBadge, formatErrorMessage } from "../render-utils"; + +function asRecord(value: unknown): Record { + return value && typeof value === "object" ? (value as Record) : {}; +} + +function summarizeArgs(args: unknown): string { + const input = asRecord(args); + const action = typeof input.action === "string" ? input.action : "computer"; + const parts = [action]; + if (typeof input.x === "number" && typeof input.y === "number") parts.push(`@ ${input.x},${input.y}`); + if (typeof input.to_x === "number" && typeof input.to_y === "number") parts.push(`→ ${input.to_x},${input.to_y}`); + if (typeof input.scroll_x === "number" || typeof input.scroll_y === "number") { + parts.push(`scroll ${input.scroll_x ?? 0},${input.scroll_y ?? 0}`); + } + if (Array.isArray(input.keys)) parts.push(`keys ${input.keys.join("+")}`); + if (typeof input.ms === "number") parts.push(`${input.ms}ms`); + return parts.join(" "); +} + +export function summarizeComputerDetails( + details: ComputerToolDetails | undefined, + isError: boolean, + theme: Theme, +): string { + if (!details) return isError ? 
"Computer action failed" : "Computer action completed"; + const parts = [details.action]; + if (details.x !== undefined && details.y !== undefined) parts.push(`@ ${details.x},${details.y}`); + if (details.toX !== undefined && details.toY !== undefined) parts.push(`→ ${details.toX},${details.toY}`); + if (details.scrollX !== undefined || details.scrollY !== undefined) + parts.push(`scroll ${details.scrollX ?? 0},${details.scrollY ?? 0}`); + if (details.screenshot) { + const shot = details.screenshot; + parts.push(`screenshot ${shot.widthPx}x${shot.heightPx}`); + if (shot.pngBytes !== undefined) parts.push(`${shot.pngBytes} bytes`); + if (shot.captureId) parts.push(`capture ${shot.captureId}`); + } + if (details.supervisor) parts.push(`supervisor ${details.supervisor}`); + if (details.code) parts.push(theme.fg(isError ? "error" : "muted", details.code)); + return parts.join(" "); +} + +export const computerToolRenderer = { + renderCall(args: unknown, _options: RenderResultOptions, theme: Theme): Component { + return new Text(`${formatBadge("computer", "accent", theme)} ${summarizeArgs(args)}`); + }, + renderResult( + result: { content: Array<{ type: string; text?: string }>; details?: unknown; isError?: boolean }, + _options: RenderResultOptions, + theme: Theme, + ): Component { + if (result.isError) { + const details = result.details as ComputerToolDetails | undefined; + return new Text( + formatErrorMessage(details?.message ?? 
result.content.find(c => c.type === "text")?.text, theme), + ); + } + return new Text( + `${formatBadge("computer", "success", theme)} ${summarizeComputerDetails(result.details as ComputerToolDetails | undefined, false, theme)}`, + ); + }, + mergeCallAndResult: true, +}; diff --git a/packages/coding-agent/src/tools/index.ts b/packages/coding-agent/src/tools/index.ts index 0156e1acd..fa12e455b 100644 --- a/packages/coding-agent/src/tools/index.ts +++ b/packages/coding-agent/src/tools/index.ts @@ -36,6 +36,7 @@ import { BashTool } from "./bash"; import { BrowserTool } from "./browser"; import { CalculatorTool } from "./calculator"; import { type CheckpointState, CheckpointTool, RewindTool } from "./checkpoint"; +import { ComputerTool, isComputerCallable } from "./computer"; import { CronCreateTool, CronDeleteTool, CronListTool } from "./cron"; import { DebugTool } from "./debug"; import { EvalTool } from "./eval"; @@ -73,6 +74,7 @@ export * from "./bash"; export * from "./browser"; export * from "./calculator"; export * from "./checkpoint"; +export * from "./computer"; export * from "./cron"; export * from "./debug"; export * from "./eval"; @@ -312,6 +314,26 @@ export function computeEssentialBuiltinNames(settings: Settings): string[] { * Hindsight memory helpers are intentionally excluded: memory is a private backend * integration, not a public gajae-code tool surface. 
*/ +export interface BuiltinCapabilityCatalogEntry { + name: string; + label: string; + summary: string; + docsPath: string; + callableBuiltin: boolean; + defaultEnabled: boolean; +} + +export const BUILTIN_CAPABILITY_CATALOG: readonly BuiltinCapabilityCatalogEntry[] = [ + { + name: "computer", + label: "Computer", + summary: "Explicitly enabled macOS desktop screenshot and input control; off by default and supervisor-gated.", + docsPath: "docs/tools/computer.md", + callableBuiltin: false, + defaultEnabled: false, + }, +] as const; + export const BUILTIN_TOOLS: Record = { read: s => new ReadTool(s), bash: s => new BashTool(s), @@ -330,6 +352,7 @@ export const BUILTIN_TOOLS: Record = { lsp: LspTool.createIf, inspect_image: s => new InspectImageTool(s), browser: s => new BrowserTool(s), + computer: ComputerTool.createIf, checkpoint: CheckpointTool.createIf, rewind: RewindTool.createIf, task: s => TaskTool.create(s), @@ -504,6 +527,7 @@ export async function createTools(session: ToolSession, toolNames?: string[]): P if (name === "calc") return session.settings.get("calc.enabled"); if (name === "skill") return session.settings.get("skill.enabled"); if (name === "browser") return session.settings.get("browser.enabled"); + if (name === "computer") return isComputerCallable(session); if (name === "checkpoint" || name === "rewind") return session.settings.get("checkpoint.enabled"); if (name === "irc") { if (!session.settings.get("irc.enabled")) return false; diff --git a/packages/coding-agent/src/tools/renderers.ts b/packages/coding-agent/src/tools/renderers.ts index d1d803f44..c23f965af 100644 --- a/packages/coding-agent/src/tools/renderers.ts +++ b/packages/coding-agent/src/tools/renderers.ts @@ -17,6 +17,7 @@ import { astGrepToolRenderer } from "./ast-grep"; import { bashToolRenderer } from "./bash"; import { browserToolRenderer } from "./browser/render"; import { calculatorToolRenderer } from "./calculator"; +import { computerToolRenderer } from "./computer/render"; 
import { debugToolRenderer } from "./debug"; import { evalToolRenderer } from "./eval"; import { findToolRenderer } from "./find"; @@ -52,6 +53,7 @@ export const toolRenderers: Record = { ast_edit: astEditToolRenderer as ToolRenderer, bash: bashToolRenderer as ToolRenderer, browser: browserToolRenderer as ToolRenderer, + computer: computerToolRenderer as ToolRenderer, recipe: recipeToolRenderer as ToolRenderer, debug: debugToolRenderer as ToolRenderer, eval: evalToolRenderer as ToolRenderer, diff --git a/packages/coding-agent/src/tools/tool-timeouts.ts b/packages/coding-agent/src/tools/tool-timeouts.ts index cbd6ddc54..0f91ccea6 100644 --- a/packages/coding-agent/src/tools/tool-timeouts.ts +++ b/packages/coding-agent/src/tools/tool-timeouts.ts @@ -11,6 +11,7 @@ export const TOOL_TIMEOUTS = { bash: { default: 300, min: 1, max: 3600 }, eval: { default: 30, min: 1, max: 600 }, browser: { default: 30, min: 1, max: 300 }, + computer: { default: 30, min: 1, max: 300 }, ssh: { default: 60, min: 1, max: 3600 }, fetch: { default: 20, min: 1, max: 45 }, lsp: { default: 20, min: 5, max: 60 }, diff --git a/packages/coding-agent/test/tools/computer.test.ts b/packages/coding-agent/test/tools/computer.test.ts new file mode 100644 index 000000000..73465821e --- /dev/null +++ b/packages/coding-agent/test/tools/computer.test.ts @@ -0,0 +1,178 @@ +import { afterEach, describe, expect, it } from "bun:test"; +import { Settings } from "@gajae-code/coding-agent/config/settings"; +import { + BUILTIN_CAPABILITY_CATALOG, + ComputerTool, + computerSchema, + createTools, + isComputerCallable, + setComputerControllerFactoryForTests, + type ToolSession, +} from "@gajae-code/coding-agent/tools"; +import { summarizeComputerDetails } from "@gajae-code/coding-agent/tools/computer/render"; +import { toolRenderers } from "@gajae-code/coding-agent/tools/renderers"; + +function createSession(settings = Settings.isolated()): ToolSession { + return { + cwd: "/tmp/test", + hasUI: false, + getSessionFile: 
() => null, + getSessionSpawns: () => "*", + settings, + }; +} + +function textOf(result: { content: Array<{ type: string; text?: string }> }): string { + return result.content.map(c => c.text ?? "").join("\n"); +} + +describe("computer tool schema", () => { + const validCases = [ + { action: "screenshot" }, + { action: "click", x: 1, y: 2, button: "left" }, + { action: "double_click", x: 1, y: 2, button: "right" }, + { action: "move", x: 1, y: 2, button: "middle" }, + { action: "drag", x: 1, y: 2, to_x: 3, to_y: 4 }, + { action: "scroll", x: 1, y: 2, scroll_x: 0, scroll_y: -10 }, + { action: "type", text: "hello" }, + { action: "keypress", keys: ["Meta", "K"] }, + { action: "wait", ms: 250 }, + ]; + + it("accepts exactly the nine OpenAI snake_case actions", () => { + expect(validCases.map(value => computerSchema.parse(value).action)).toEqual([ + "screenshot", + "click", + "double_click", + "move", + "drag", + "scroll", + "type", + "keypress", + "wait", + ]); + }); + + it("rejects camelCase actions and fields", () => { + expect(() => computerSchema.parse({ action: "doubleClick", x: 1, y: 2 })).toThrow(); + expect(() => computerSchema.parse({ action: "drag", x: 1, y: 2, toX: 3, toY: 4 })).toThrow(); + expect(() => computerSchema.parse({ action: "scroll", x: 1, y: 2, scrollX: 0, scrollY: 1 })).toThrow(); + expect(() => computerSchema.parse({ action: "screenshot", includeScreenshot: true })).toThrow(); + }); +}); + +describe("computer tool gating", () => { + afterEach(() => setComputerControllerFactoryForTests(undefined)); + + it("is metadata-only by default and not callable/discoverable", async () => { + const session = createSession(Settings.isolated({ "tools.discoveryMode": "all" })); + const tools = await createTools(session); + const names = tools.map(t => t.name); + expect(names).not.toContain("computer"); + expect(BUILTIN_CAPABILITY_CATALOG.find(entry => entry.name === "computer")).toMatchObject({ + callableBuiltin: false, + defaultEnabled: false, + }); + const 
discoverable = tools.filter(t => t.loadMode === "discoverable").map(t => t.name); + expect(discoverable).not.toContain("computer"); + }); + + it("is callable with per-session enable or alwaysOn on macOS", async () => { + const enabledNames = (await createTools(createSession(Settings.isolated({ "computer.enabled": true })))).map( + t => t.name, + ); + const alwaysOnNames = (await createTools(createSession(Settings.isolated({ "computer.alwaysOn": true })))).map( + t => t.name, + ); + expect(enabledNames).toContain("computer"); + expect(alwaysOnNames).toContain("computer"); + }); + + it("is absent on non-macOS even when settings enable it", () => { + expect(isComputerCallable(createSession(Settings.isolated({ "computer.enabled": true })), "linux")).toBe(false); + }); + + it("returns COMPUTER_DISABLED without constructing native controller when directly invoked while disabled", async () => { + let constructed = false; + setComputerControllerFactoryForTests(() => { + constructed = true; + return {}; + }); + const tool = new ComputerTool(createSession()); + const result = await tool.execute("call", { action: "screenshot" }); + expect(result.isError).toBe(true); + expect(result.details?.code).toBe("COMPUTER_DISABLED"); + expect(textOf(result)).toContain("COMPUTER_DISABLED"); + expect(constructed).toBe(false); + }); +}); + +describe("computer tool dispatch", () => { + afterEach(() => setComputerControllerFactoryForTests(undefined)); + + it("maps snake_case model actions to native controller methods and forwards AbortSignal", async () => { + const calls: Array<{ method: string; payload: unknown; signal?: AbortSignal }> = []; + setComputerControllerFactoryForTests(() => ({ + screenshot: (payload, options) => { + calls.push({ method: "screenshot", payload, signal: options?.signal }); + return { widthPx: 20, heightPx: 10, png: new Uint8Array([1, 2, 3]), captureId: "cap-1" }; + }, + doubleClick: (payload, options) => calls.push({ method: "doubleClick", payload, signal: 
options?.signal }), + drag: (payload, options) => calls.push({ method: "drag", payload, signal: options?.signal }), + scroll: (payload, options) => calls.push({ method: "scroll", payload, signal: options?.signal }), + })); + const tool = new ComputerTool(createSession(Settings.isolated({ "computer.enabled": true }))); + const controller = new AbortController(); + const shot = await tool.execute("shot", { action: "screenshot", timeout: 2 }, controller.signal); + await tool.execute("dbl", { action: "double_click", x: 1, y: 2, button: "right" }, controller.signal); + await tool.execute("drag", { action: "drag", x: 1, y: 2, to_x: 3, to_y: 4 }, controller.signal); + await tool.execute("scroll", { action: "scroll", x: 1, y: 2, scroll_x: 5, scroll_y: -6 }, controller.signal); + + expect(shot.details?.screenshot).toMatchObject({ widthPx: 20, heightPx: 10, pngBytes: 3, captureId: "cap-1" }); + expect(calls.map(call => call.method)).toEqual(["screenshot", "doubleClick", "drag", "scroll"]); + expect(calls[1].payload).toMatchObject({ x: 1, y: 2, button: "right" }); + expect(calls[2].payload).toMatchObject({ x: 1, y: 2, toX: 3, toY: 4, button: "left" }); + expect(calls[3].payload).toMatchObject({ x: 1, y: 2, scrollX: 5, scrollY: -6 }); + expect(calls.every(call => call.signal instanceof AbortSignal)).toBe(true); + }); + + it("maps native COMPUTER_* errors into bounded tool errors", async () => { + setComputerControllerFactoryForTests(() => ({ + click: () => { + const error = new Error("supervisor is not live") as Error & { code: string }; + error.code = "COMPUTER_SUPERVISOR_NOT_LIVE"; + throw error; + }, + })); + const tool = new ComputerTool(createSession(Settings.isolated({ "computer.enabled": true }))); + const result = await tool.execute("click", { action: "click", x: 1, y: 2 }); + expect(result.isError).toBe(true); + expect(result.details?.code).toBe("COMPUTER_SUPERVISOR_NOT_LIVE"); + expect(textOf(result)).toContain("supervisor is not live"); + }); +}); + 
+describe("computer renderer", () => { + it("renders bounded output without raw screenshot data", () => { + const renderer = toolRenderers.computer; + expect(renderer).toBeDefined(); + const fakeTheme = { + fg: (_name: string, text: string) => text, + format: { bracketLeft: "[", bracketRight: "]" }, + styledSymbol: () => "!", + sep: { dot: " · " }, + } as never; + const output = summarizeComputerDetails( + { + action: "screenshot", + status: "success", + screenshot: { widthPx: 640, heightPx: 480, pngBytes: 1234, captureId: "cap-1" }, + }, + false, + fakeTheme, + ); + expect(output).toContain("640x480"); + expect(output).toContain("1234 bytes"); + expect(output).not.toContain("iVBOR"); + }); +}); From be82f07ec3c86546a6870ee6b17e1771da5587be Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 16:59:55 +0000 Subject: [PATCH 17/23] style(computer-use): format ultragoal fixtures --- .../src/gjc-runtime/ultragoal-runtime.ts | 140 ++++++++++++--- .../computer-red-team-fixtures.test.ts | 160 +++++++++++++----- 2 files changed, 232 insertions(+), 68 deletions(-) diff --git a/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts b/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts index 1ac172a44..df04e5014 100644 --- a/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts +++ b/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts @@ -867,7 +867,10 @@ function categorizeComputerChangePath(value: string): UltragoalChangeCategory { } function isComputerChangePath(row: UltragoalChangeSetPath): boolean { - return categorizeComputerChangePath(row.path) !== "other" || (row.oldPath ? categorizeComputerChangePath(row.oldPath) !== "other" : false); + return ( + categorizeComputerChangePath(row.path) !== "other" || + (row.oldPath ? 
categorizeComputerChangePath(row.oldPath) !== "other" : false) + ); } function isDocsOnlyStaticComputerChangeSet(changeSet: UltragoalChangeSet | undefined): boolean { @@ -888,7 +891,8 @@ function trustedChangeSetRequiresComputerSuite(changeSet: UltragoalChangeSet | u function executorQaDeclaresComputerTouching(executorQa: JsonObject): boolean { if (executorQa.computerTouching === true) return true; const surfaces = Array.isArray(executorQa.surfaces) ? executorQa.surfaces : []; - if (surfaces.some(value => typeof value === "string" && COMPUTER_SURFACE_TOKENS.has(normalizeSurfaceToken(value)))) return true; + if (surfaces.some(value => typeof value === "string" && COMPUTER_SURFACE_TOKENS.has(normalizeSurfaceToken(value)))) + return true; const surfaceRows = Array.isArray(executorQa.surfaceEvidence) ? executorQa.surfaceEvidence : []; return surfaceRows.some(row => { const object = qualityGateObject(row); @@ -913,7 +917,12 @@ export function normalizeSurfaceToken(value: string): string { export function surfaceFamily(value: string): SurfaceFamily { const normalized = normalizeSurfaceToken(value); - if (["computer", "computer-use", "desktop-input", "native-input", "native", "desktop", "tui"].some(word => normalized.includes(word))) return "native"; + if ( + ["computer", "computer-use", "desktop-input", "native-input", "native", "desktop", "tui"].some(word => + normalized.includes(word), + ) + ) + return "native"; if (["gui", "web", "browser", "ui", "visual"].some(word => normalized.includes(word))) return "web"; if (["cli", "terminal", "command"].some(word => normalized.includes(word))) return "cli"; if (["api", "package", "library", "sdk"].some(word => normalized.includes(word))) return "api-package"; @@ -1955,12 +1964,19 @@ async function validateMandatoryComputerAdversarialCases( } for (const caseId of MANDATORY_COMPUTER_CASE_IDS) { const row = adversarialCases.get(caseId); - if (!row) throw new Error(`COMPUTER_REDTEAM_CASE_MISSING: qualityGate 
executorQa.adversarialCases must include ${caseId}`); + if (!row) + throw new Error( + `COMPUTER_REDTEAM_CASE_MISSING: qualityGate executorQa.adversarialCases must include ${caseId}`, + ); if (optionalStatusField(row, `executorQa.adversarialCases.${caseId}`) === NOT_APPLICABLE_STATUS) { - throw new Error(`COMPUTER_REDTEAM_CASE_NOT_APPLICABLE: mandatory computer adversarial case ${caseId} must not be not_applicable`); + throw new Error( + `COMPUTER_REDTEAM_CASE_NOT_APPLICABLE: mandatory computer adversarial case ${caseId} must not be not_applicable`, + ); } if (!linkedCaseIds.has(caseId)) { - throw new Error(`COMPUTER_REDTEAM_CASE_UNLINKED: mandatory computer adversarial case ${caseId} must be linked from contractCoverage.adversarialCaseRefs`); + throw new Error( + `COMPUTER_REDTEAM_CASE_UNLINKED: mandatory computer adversarial case ${caseId} must be linked from contractCoverage.adversarialCaseRefs`, + ); } const artifactIds = requireStringLinks(row.artifactRefs, `executorQa.adversarialCases.${caseId}.artifactRefs`); let hasValidLiveNativeProof = false; @@ -1969,28 +1985,52 @@ async function validateMandatoryComputerAdversarialCases( let sawMetadataOnly = false; for (const artifactId of artifactIds) { const artifact = artifactRefs.get(artifactId); - if (!artifact) throw new Error(`qualityGate executorQa.adversarialCases.${caseId}.artifactRefs references unknown id ${artifactId}`); + if (!artifact) + throw new Error( + `qualityGate executorQa.adversarialCases.${caseId}.artifactRefs references unknown id ${artifactId}`, + ); const fieldName = `executorQa.artifactRefs.${artifactId}`; if (artifact.inlineEvidence !== undefined && !nonEmptyString(artifact.path)) sawInlineOnly = true; - if ((artifact.verifiedReceipt !== undefined || artifact.receipt !== undefined) && !nonEmptyString(artifact.path)) sawReceiptOnly = true; - if (!nonEmptyString(artifact.path) && artifact.inlineEvidence === undefined && artifact.verifiedReceipt === undefined && artifact.receipt === undefined) 
sawMetadataOnly = true; + if ( + (artifact.verifiedReceipt !== undefined || artifact.receipt !== undefined) && + !nonEmptyString(artifact.path) + ) + sawReceiptOnly = true; + if ( + !nonEmptyString(artifact.path) && + artifact.inlineEvidence === undefined && + artifact.verifiedReceipt === undefined && + artifact.receipt === undefined + ) + sawMetadataOnly = true; try { await validateArtifactProof(cwd, artifact, fieldName, { surfaceFamily: "native", live: true }); - if (await validateStructuralArtifact(cwd, artifact, fieldName, { surfaceFamily: "native", live: true })) hasValidLiveNativeProof = true; + if (await validateStructuralArtifact(cwd, artifact, fieldName, { surfaceFamily: "native", live: true })) + hasValidLiveNativeProof = true; } catch { // Preserve the explicit computer red-team error taxonomy below. } } if (!hasValidLiveNativeProof) { - if (sawInlineOnly) throw new Error(`COMPUTER_REDTEAM_INLINE_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`); - if (sawReceiptOnly) throw new Error(`COMPUTER_REDTEAM_RECEIPT_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`); - if (sawMetadataOnly) throw new Error(`COMPUTER_REDTEAM_ARTIFACT_METADATA_ONLY: mandatory computer adversarial case ${caseId} requires durable live structural native proof`); - throw new Error(`COMPUTER_REDTEAM_ARTIFACT_MISSING: mandatory computer adversarial case ${caseId} requires at least one valid live structural native proof artifact`); + if (sawInlineOnly) + throw new Error( + `COMPUTER_REDTEAM_INLINE_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`, + ); + if (sawReceiptOnly) + throw new Error( + `COMPUTER_REDTEAM_RECEIPT_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`, + ); + if (sawMetadataOnly) + throw new Error( + `COMPUTER_REDTEAM_ARTIFACT_METADATA_ONLY: mandatory computer adversarial case ${caseId} requires durable live 
structural native proof`, + ); + throw new Error( + `COMPUTER_REDTEAM_ARTIFACT_MISSING: mandatory computer adversarial case ${caseId} requires at least one valid live structural native proof artifact`, + ); } } } - function validateContractCoverage( executorQa: JsonObject, surfaceEvidence: Map, @@ -2069,7 +2109,10 @@ async function validateExecutorQaRedTeamEvidence( executorQa: JsonObject, options: { changeSet?: UltragoalChangeSet } = {}, ): Promise { - await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa, { mode: "checkpoint", changeSet: options.changeSet }); + await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa, { + mode: "checkpoint", + changeSet: options.changeSet, + }); } export async function validateExecutorQaRedTeamEvidenceForReview( @@ -2080,7 +2123,11 @@ export async function validateExecutorQaRedTeamEvidenceForReview( await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa as JsonObject, options); } -async function validateCompletionQualityGate(cwd: string, gate: JsonObject, options: { changeSet?: UltragoalChangeSet } = {}): Promise { +async function validateCompletionQualityGate( + cwd: string, + gate: JsonObject, + options: { changeSet?: UltragoalChangeSet } = {}, +): Promise { const codeReview = qualityGateObject(gate.codeReview); if (codeReview) { throw new Error( @@ -2136,7 +2183,11 @@ async function validateCompletionQualityGate(cwd: string, gate: JsonObject, opti requireEmptyBlockers(iteration.blockers, "iteration.blockers"); } -async function readRequiredCompletionQualityGate(cwd: string, value: string | undefined, options: { changeSet?: UltragoalChangeSet } = {}): Promise { +async function readRequiredCompletionQualityGate( + cwd: string, + value: string | undefined, + options: { changeSet?: UltragoalChangeSet } = {}, +): Promise { if (!value?.trim()) { throw new Error( "complete checkpoints require --quality-gate-json with architectReview, executorQa, and iteration evidence", @@ -2869,7 +2920,12 @@ function 
parseGitNameStatus(output: string): UltragoalChangeSetPath[] { const pathValue = status === "renamed" || status === "copied" ? parts[2] : parts[1]; if (!pathValue) continue; const oldPath = status === "renamed" || status === "copied" ? parts[1] : undefined; - rows.push({ path: normalizeRepoPath(pathValue), oldPath: oldPath ? normalizeRepoPath(oldPath) : undefined, status, category: categorizeComputerChangePath(pathValue) }); + rows.push({ + path: normalizeRepoPath(pathValue), + oldPath: oldPath ? normalizeRepoPath(oldPath) : undefined, + status, + category: categorizeComputerChangePath(pathValue), + }); } return rows; } @@ -2899,7 +2955,11 @@ async function computeCheckpointChangeSet(cwd: string): Promise { async function runGit(cwd: string, args: string[]): Promise { const proc = Bun.spawn(["git", ...args], { cwd, stdout: "pipe", stderr: "pipe" }); - const [stdout, stderr, exitCode] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text(), proc.exited]); + const [stdout, stderr, exitCode] = await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + proc.exited, + ]); if (exitCode !== 0) throw new Error(`git ${args.join(" ")} failed: ${stdout}${stderr}`); } @@ -80,7 +88,11 @@ function syntheticPng(): Buffer { let activeObjective = ""; async function seedPlan(root: string): Promise { - const created = await createUltragoalPlan({ cwd: root, brief: "@goal computer gate fixture", gjcObjective: "fixture" }); + const created = await createUltragoalPlan({ + cwd: root, + brief: "@goal computer gate fixture", + gjcObjective: "fixture", + }); await runGit(root, ["add", ".gjc/ultragoal/goals.json", ".gjc/ultragoal/ledger.jsonl"]); await runGit(root, ["commit", "-m", "plan"]); activeObjective = created.gjcObjective; @@ -88,7 +100,15 @@ async function seedPlan(root: string): Promise { } function goalSnapshot(): string { - return JSON.stringify({ goal: { threadId: "test-thread", objective: activeObjective, status: 
"active", createdAt: Date.now(), updatedAt: Date.now() } }); + return JSON.stringify({ + goal: { + threadId: "test-thread", + objective: activeObjective, + status: "active", + createdAt: Date.now(), + updatedAt: Date.now(), + }, + }); } function artifact(kind = "native screenshot"): Record { @@ -105,16 +125,25 @@ const CASES = [ "blast-radius", ]; -function executorQa(overrides: { cases?: Record[]; artifacts?: Record[]; computerTouching?: boolean; surface?: string } = {}): Record { - const cases = overrides.cases ?? CASES.map(id => ({ - id, - status: "passed", - contractRef: "computer-safety", - scenario: `${id} adversarial scenario exercises the computer safety boundary`, - expectedBehavior: "fail closed before unsafe desktop input can continue", - verdict: "passed", - artifactRefs: ["case-proof"], - })); +function executorQa( + overrides: { + cases?: Record[]; + artifacts?: Record[]; + computerTouching?: boolean; + surface?: string; + } = {}, +): Record { + const cases = + overrides.cases ?? + CASES.map(id => ({ + id, + status: "passed", + contractRef: "computer-safety", + scenario: `${id} adversarial scenario exercises the computer safety boundary`, + expectedBehavior: "fail closed before unsafe desktop input can continue", + verdict: "passed", + artifactRefs: ["case-proof"], + })); return { status: "passed", e2eStatus: "passed", @@ -124,25 +153,32 @@ function executorQa(overrides: { cases?: Record[]; artifacts?: redTeamCommands: ["bun test fixture"], changedPaths: overrides.computerTouching === true ? ["crates/pi-natives/src/computer/executor.rs"] : undefined, computerTouching: overrides.computerTouching, - artifactRefs: overrides.artifacts ?? [artifact("native screenshot"), { ...artifact("native screenshot"), id: "case-proof" }], - surfaceEvidence: [{ - id: "surface-native", - contractRef: "computer-safety", - surface: overrides.surface ?? 
"native", - status: "passed", - invocation: "native fixture invocation", - verdict: "passed", - artifactRefs: ["surface-proof"], - }], + artifactRefs: overrides.artifacts ?? [ + artifact("native screenshot"), + { ...artifact("native screenshot"), id: "case-proof" }, + ], + surfaceEvidence: [ + { + id: "surface-native", + contractRef: "computer-safety", + surface: overrides.surface ?? "native", + status: "passed", + invocation: "native fixture invocation", + verdict: "passed", + artifactRefs: ["surface-proof"], + }, + ], adversarialCases: cases, - contractCoverage: [{ - id: "coverage", - contractRef: "computer-safety", - status: "covered", - obligation: "all mandatory computer red-team cases are covered", - surfaceEvidenceRefs: ["surface-native"], - adversarialCaseRefs: cases.map(row => String(row.id)), - }], + contractCoverage: [ + { + id: "coverage", + contractRef: "computer-safety", + status: "covered", + obligation: "all mandatory computer red-team cases are covered", + surfaceEvidenceRefs: ["surface-native"], + adversarialCaseRefs: cases.map(row => String(row.id)), + }, + ], blockers: [], }; } @@ -175,14 +211,22 @@ async function writeQaArtifacts(root: string): Promise { } async function checkpoint(root: string, qa: Record): Promise { - const result = await runNativeUltragoalCommand([ - "checkpoint", - "--goal-id", "G001", - "--status", "complete", - "--evidence", "fixture complete", - "--gjc-goal-json", goalSnapshot(), - "--quality-gate-json", qualityGate(qa), - ], root); + const result = await runNativeUltragoalCommand( + [ + "checkpoint", + "--goal-id", + "G001", + "--status", + "complete", + "--evidence", + "fixture complete", + "--gjc-goal-json", + goalSnapshot(), + "--quality-gate-json", + qualityGate(qa), + ], + root, + ); return result.stderr + result.stdout; } @@ -207,7 +251,15 @@ describe("computer red-team fixture matrix", () => { await seedPlan(root); await writeQaArtifacts(root); await seedComputerChange(root); - const message = await 
checkpoint(root, executorQa({ computerTouching: true, cases: (executorQa().adversarialCases as Record[]).filter(row => row.id !== "blast-radius") })).catch(error => String(error)); + const message = await checkpoint( + root, + executorQa({ + computerTouching: true, + cases: (executorQa().adversarialCases as Record[]).filter( + row => row.id !== "blast-radius", + ), + }), + ).catch(error => String(error)); expect(message).toContain("COMPUTER_REDTEAM_CASE_MISSING"); }); @@ -217,7 +269,15 @@ describe("computer red-team fixture matrix", () => { await seedPlan(root); await writeQaArtifacts(root); await seedComputerChange(root); - const cases = CASES.map(id => ({ id, status: id === "blast-radius" ? "not_applicable" : "passed", contractRef: "computer-safety", scenario: "scenario text", expectedBehavior: "expected behavior", verdict: "passed", artifactRefs: ["case-proof"] })); + const cases = CASES.map(id => ({ + id, + status: id === "blast-radius" ? "not_applicable" : "passed", + contractRef: "computer-safety", + scenario: "scenario text", + expectedBehavior: "expected behavior", + verdict: "passed", + artifactRefs: ["case-proof"], + })); const message = await checkpoint(root, executorQa({ cases })).catch(error => String(error)); expect(message).toContain("not_applicable"); }); @@ -228,7 +288,21 @@ describe("computer red-team fixture matrix", () => { await seedPlan(root); await seedComputerChange(root); await writeQaArtifacts(root); - const message = await checkpoint(root, executorQa({ computerTouching: true, artifacts: [artifact("native screenshot"), { id: "case-proof", kind: "native metadata", description: "inline only", inlineEvidence: "inline proof is not durable live structural evidence" }] })).catch(error => String(error)); + const message = await checkpoint( + root, + executorQa({ + computerTouching: true, + artifacts: [ + artifact("native screenshot"), + { + id: "case-proof", + kind: "native metadata", + description: "inline only", + inlineEvidence: "inline proof 
is not durable live structural evidence", + }, + ], + }), + ).catch(error => String(error)); expect(message).toContain("COMPUTER_REDTEAM_INLINE_ONLY"); }); From fd5dce09e3f292decbcfbb9876a608a97ddeeef4 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Mon, 15 Jun 2026 17:36:18 +0000 Subject: [PATCH 18/23] fix(computer-use): repair TypeScript check errors (slice 1) - ComputerToolDetails: add optional meta field so it satisfies the ToolResultBuilder DetailsWithMeta weak-type constraint; toolResult() now infers ComputerToolDetails instead of falling back to DetailsWithMeta - render.ts: type summarizeComputerDetails parts as string[] so dynamic summary strings are not constrained to the action literal union - computer-red-team-fixtures.test.ts: drop removed gjcObjective input option from createUltragoalPlan; guard possibly-undefined stderr/stdout --- packages/coding-agent/src/tools/computer.ts | 2 ++ packages/coding-agent/src/tools/computer/render.ts | 2 +- .../test/gjc-runtime/computer-red-team-fixtures.test.ts | 3 +-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/coding-agent/src/tools/computer.ts b/packages/coding-agent/src/tools/computer.ts index ca1fa2f79..6a0fe246f 100644 --- a/packages/coding-agent/src/tools/computer.ts +++ b/packages/coding-agent/src/tools/computer.ts @@ -3,6 +3,7 @@ import { prompt } from "@gajae-code/utils"; import * as z from "zod/v4"; import computerDescription from "../prompts/tools/computer.md" with { type: "text" }; import type { ToolSession } from "./index"; +import type { OutputMeta } from "./output-meta"; import { ToolAbortError, ToolError, throwIfAborted } from "./tool-errors"; import { toolResult } from "./tool-result"; import { clampTimeout } from "./tool-timeouts"; @@ -99,6 +100,7 @@ export interface ComputerToolDetails { ms?: number; screenshot?: ComputerScreenshotDetails; supervisor?: string; + meta?: OutputMeta; } type NativeController = { diff --git a/packages/coding-agent/src/tools/computer/render.ts 
b/packages/coding-agent/src/tools/computer/render.ts index 50b4e8c2d..592d6c1a3 100644 --- a/packages/coding-agent/src/tools/computer/render.ts +++ b/packages/coding-agent/src/tools/computer/render.ts @@ -29,7 +29,7 @@ export function summarizeComputerDetails( theme: Theme, ): string { if (!details) return isError ? "Computer action failed" : "Computer action completed"; - const parts = [details.action]; + const parts: string[] = [details.action]; if (details.x !== undefined && details.y !== undefined) parts.push(`@ ${details.x},${details.y}`); if (details.toX !== undefined && details.toY !== undefined) parts.push(`→ ${details.toX},${details.toY}`); if (details.scrollX !== undefined || details.scrollY !== undefined) diff --git a/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts b/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts index 2615b1af9..ba95ef0a8 100644 --- a/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts +++ b/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts @@ -91,7 +91,6 @@ async function seedPlan(root: string): Promise { const created = await createUltragoalPlan({ cwd: root, brief: "@goal computer gate fixture", - gjcObjective: "fixture", }); await runGit(root, ["add", ".gjc/ultragoal/goals.json", ".gjc/ultragoal/ledger.jsonl"]); await runGit(root, ["commit", "-m", "plan"]); @@ -227,7 +226,7 @@ async function checkpoint(root: string, qa: Record): Promise { From cf3f4fd3e52f0ca70ab87db9a858ab70469385a8 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 10:01:37 +0900 Subject: [PATCH 19/23] test(computer-use): add G005 all-nine + kill-switch acceptance drill Adds the COMPUTER_USE_MACOS_TEXTEDIT_ALL_NINE manual-acceptance drill as an #[ignore] live test (crates/pi-natives/src/computer/input.rs live_tests): drives all nine primitives (screenshot/move/click/type/keypress/ double_click/drag/scroll/wait) through the production gated path 
(execute_input + Supervisor + Mac providers) against the focused app, then waits for a human Control+Option+Command+Escape press and asserts the kill-switch latches and blocks further input until reset. Ignored by default (needs macOS + grants + a human keypress); run with: cargo test -p pi-natives computer::input::live_tests::all_nine_acceptance_drill -- --ignored --nocapture Note: the drill refreshes the supervisor heartbeat per action as a stand-in for a periodic listener heartbeat (follow-up: tick heartbeat from the hotkey listener thread). --- crates/pi-natives/src/computer/input.rs | 89 +++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/crates/pi-natives/src/computer/input.rs b/crates/pi-natives/src/computer/input.rs index 61b538d69..fb018fa1f 100644 --- a/crates/pi-natives/src/computer/input.rs +++ b/crates/pi-natives/src/computer/input.rs @@ -710,4 +710,93 @@ mod live_tests { assert!(!controller.has_held_buttons()); let _ = MouseButton::Left; // keep the import meaningful for future click tests } + + /// G005 acceptance drill: drives all nine primitives through the gated + /// execute_input path against the focused frontmost app, then waits for a + /// human kill-switch press and proves input is blocked afterward. 
+ #[test] + #[ignore = "live G005: drives the focused app + needs a human hotkey press"] + fn all_nine_acceptance_drill() { + use std::{thread::sleep, time::Duration}; + + use crate::computer::{ + capture::capture_primary_display, + executor::{InputAction, MacDisplayContext, MacPermissionGate, execute_input}, + hotkey, + supervisor::Supervisor, + }; + + assert!(hotkey::start(), "kill-switch hotkey listener must be live"); + let frame = capture_primary_display().expect("Screen Recording granted"); // primitive 1: screenshot + let display = frame.display; + let perms = MacPermissionGate; + let dctx = MacDisplayContext; + let cancel = || false; + let cx = f64::from(display.width_px) * 0.5; + let cy = f64::from(display.height_px) * 0.42; + + let mut act = |action: InputAction| { + // Stand in for the listener's periodic heartbeat so input_allowed stays fresh. + Supervisor::global().heartbeat(); + let mut controller = guarded_controller().expect("Accessibility granted"); + execute_input( + &action, + Supervisor::global(), + &perms, + &dctx, + None, + &display, + &mut controller, + &cancel, + ) + .expect("gated action should succeed"); + sleep(Duration::from_millis(350)); + }; + + act(InputAction::Move { x: cx, y: cy }); // 2 move + act(InputAction::Click { x: cx, y: cy, button: MouseButton::Left }); // 3 click (focus body) + act(InputAction::Type { text: "COMPUTER_USE_E2E gajae ".to_string() }); // 4 type + act(InputAction::Keypress { keys: vec!["return".to_string()] }); // 5 keypress + act(InputAction::Type { text: "line two alpha beta gamma delta epsilon".to_string() }); + act(InputAction::DoubleClick { x: cx, y: cy, button: MouseButton::Left }); // 6 double_click + act(InputAction::Drag { + x: cx - 120.0, + y: cy, + to_x: cx + 120.0, + to_y: cy, + button: MouseButton::Left, + }); // 7 drag + act(InputAction::Scroll { x: cx, y: cy, scroll_x: 0.0, scroll_y: -120.0 }); // 8 scroll + act(InputAction::Wait { ms: 300 }); // 9 wait + + println!(">>> KILL-SWITCH DRILL: press 
Control+Option+Command+Escape now (within ~10s) <<<"); + for _ in 0..50 { + if Supervisor::global().is_suspended() { + break; + } + sleep(Duration::from_millis(200)); + } + assert!( + Supervisor::global().is_suspended(), + "kill-switch should latch after you press the hotkey" + ); + + // Prove input is blocked after the kill-switch, until a user-only reset. + let mut controller = guarded_controller().expect("Accessibility granted"); + let blocked = execute_input( + &InputAction::Move { x: cx, y: cy }, + Supervisor::global(), + &perms, + &dctx, + None, + &display, + &mut controller, + &cancel, + ); + assert!(blocked.is_err(), "input must be blocked while suspended"); + Supervisor::global().reset(); + println!( + "G005 PASS: all nine primitives executed; kill-switch latched and blocked further input." + ); + } } From 30aee963e430c2fe6b40b1e95903e6d6933229fa Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 15:12:30 +0900 Subject: [PATCH 20/23] test(computer-use): persist durable G005 acceptance artifacts + widen kill-switch window The all-nine acceptance drill now writes g005-before.png, g005-after-killswitch.png, and g005-manifest.json to .gjc/ultragoal/artifacts/g005 (override COMPUTER_USE_ACCEPTANCE_DIR), so the human-run drill produces the durable live native proof the G004 mandatory computer red-team suite requires on disk. Widen the kill-switch wait from ~10s to ~60s for manual operation and drop an unnecessary mut on the act closure. 
--- crates/pi-natives/src/computer/input.rs | 61 +++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/crates/pi-natives/src/computer/input.rs b/crates/pi-natives/src/computer/input.rs index fb018fa1f..7cc02308d 100644 --- a/crates/pi-natives/src/computer/input.rs +++ b/crates/pi-natives/src/computer/input.rs @@ -711,6 +711,16 @@ mod live_tests { let _ = MouseButton::Left; // keep the import meaningful for future click tests } + /// Durable output directory for G005 live-acceptance artifacts. Override + /// with `COMPUTER_USE_ACCEPTANCE_DIR`; defaults to + /// `.gjc/ultragoal/artifacts/g005` under the repository root. + fn acceptance_artifacts_dir() -> std::path::PathBuf { + if let Ok(dir) = std::env::var("COMPUTER_USE_ACCEPTANCE_DIR") { + return std::path::PathBuf::from(dir); + } + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../.gjc/ultragoal/artifacts/g005") + } + /// G005 acceptance drill: drives all nine primitives through the gated /// execute_input path against the focused frontmost app, then waits for a /// human kill-switch press and proves input is blocked afterward. @@ -735,7 +745,13 @@ mod live_tests { let cx = f64::from(display.width_px) * 0.5; let cy = f64::from(display.height_px) * 0.42; - let mut act = |action: InputAction| { + // Persist the pre-input frame as durable live-proof (primitive 1). + let artifacts = acceptance_artifacts_dir(); + std::fs::create_dir_all(&artifacts).expect("create acceptance artifacts dir"); + std::fs::write(artifacts.join("g005-before.png"), &frame.png) + .expect("write before screenshot"); + + let act = |action: InputAction| { // Stand in for the listener's periodic heartbeat so input_allowed stays fresh. 
Supervisor::global().heartbeat(); let mut controller = guarded_controller().expect("Accessibility granted"); @@ -769,8 +785,8 @@ mod live_tests { act(InputAction::Scroll { x: cx, y: cy, scroll_x: 0.0, scroll_y: -120.0 }); // 8 scroll act(InputAction::Wait { ms: 300 }); // 9 wait - println!(">>> KILL-SWITCH DRILL: press Control+Option+Command+Escape now (within ~10s) <<<"); - for _ in 0..50 { + println!(">>> KILL-SWITCH DRILL: press Control+Option+Command+Escape now (within ~60s) <<<"); + for _ in 0..300 { if Supervisor::global().is_suspended() { break; } @@ -794,6 +810,45 @@ mod live_tests { &cancel, ); assert!(blocked.is_err(), "input must be blocked while suspended"); + + // Capture + persist the post-kill-switch frame and a transcript so the + // G004 mandatory computer red-team suite has durable native proof on disk. + let after = capture_primary_display().expect("Screen Recording granted"); + std::fs::write(artifacts.join("g005-after-killswitch.png"), &after.png) + .expect("write post-kill-switch screenshot"); + let manifest = serde_json::json!({ + "schemaVersion": 1, + "kind": "computer-use-acceptance", + "surface": "native", + "hotkey": "Control+Option+Command+Escape", + "display": { + "widthPx": display.width_px, + "heightPx": display.height_px, + "epoch": frame.display_epoch + }, + "primitives": [ + "screenshot", + "move", + "click", + "type", + "keypress", + "double_click", + "drag", + "scroll", + "wait" + ], + "killSwitch": { "latched": true, "blockedFurtherInput": true }, + "artifacts": { + "before": "g005-before.png", + "afterKillSwitch": "g005-after-killswitch.png" + } + }); + std::fs::write( + artifacts.join("g005-manifest.json"), + serde_json::to_vec_pretty(&manifest).expect("serialize manifest"), + ) + .expect("write acceptance manifest"); + println!("G005 artifacts written to {}", artifacts.display()); Supervisor::global().reset(); println!( "G005 PASS: all nine primitives executed; kill-switch latched and blocked further input." 
From d4026bad308ba5b3944637dbe7c95cc7c9dad1eb Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 15:44:51 +0900 Subject: [PATCH 21/23] feat(computer-use): do not load the computer tool at all on Windows Add isComputerLoadablePlatform (true everywhere except win32) and gate both BUILTIN_CAPABILITY_CATALOG and the BUILTIN_TOOLS computer factory on it: macOS stays callable, Linux stays listable (support planned via #712), Windows is fully absent (not registered, not advertised). Also fix a pre-existing initial-tools metadata test by constructing ComputerTool directly for loadMode coverage, mirroring how other createIf-gated tools (Ask/Ssh/Job/Recipe/Irc) are handled. --- packages/coding-agent/src/tools/computer.ts | 8 ++++++ packages/coding-agent/src/tools/index.ts | 27 ++++++++++--------- .../test/tool-discovery/initial-tools.test.ts | 2 ++ .../coding-agent/test/tools/computer.test.ts | 17 +++++++++--- 4 files changed, 38 insertions(+), 16 deletions(-) diff --git a/packages/coding-agent/src/tools/computer.ts b/packages/coding-agent/src/tools/computer.ts index 6a0fe246f..f673522ab 100644 --- a/packages/coding-agent/src/tools/computer.ts +++ b/packages/coding-agent/src/tools/computer.ts @@ -160,6 +160,14 @@ export function isComputerSupportedPlatform(platform: NodeJS.Platform = process. return platform === "darwin"; } +/** + * Whether the computer capability is loaded/advertised at all on this platform. + * macOS is callable; Linux is listable (support planned); Windows is fully absent. 
+ */ +export function isComputerLoadablePlatform(platform: NodeJS.Platform = process.platform): boolean { + return platform !== "win32"; +} + export function isComputerEnabled(session: Pick): boolean { return Boolean(session.settings.get("computer.enabled") || session.settings.get("computer.alwaysOn")); } diff --git a/packages/coding-agent/src/tools/index.ts b/packages/coding-agent/src/tools/index.ts index fa12e455b..c9281fa6d 100644 --- a/packages/coding-agent/src/tools/index.ts +++ b/packages/coding-agent/src/tools/index.ts @@ -36,7 +36,7 @@ import { BashTool } from "./bash"; import { BrowserTool } from "./browser"; import { CalculatorTool } from "./calculator"; import { type CheckpointState, CheckpointTool, RewindTool } from "./checkpoint"; -import { ComputerTool, isComputerCallable } from "./computer"; +import { ComputerTool, isComputerCallable, isComputerLoadablePlatform } from "./computer"; import { CronCreateTool, CronDeleteTool, CronListTool } from "./cron"; import { DebugTool } from "./debug"; import { EvalTool } from "./eval"; @@ -323,16 +323,19 @@ export interface BuiltinCapabilityCatalogEntry { defaultEnabled: boolean; } -export const BUILTIN_CAPABILITY_CATALOG: readonly BuiltinCapabilityCatalogEntry[] = [ - { - name: "computer", - label: "Computer", - summary: "Explicitly enabled macOS desktop screenshot and input control; off by default and supervisor-gated.", - docsPath: "docs/tools/computer.md", - callableBuiltin: false, - defaultEnabled: false, - }, -] as const; +export const BUILTIN_CAPABILITY_CATALOG: readonly BuiltinCapabilityCatalogEntry[] = isComputerLoadablePlatform() + ? 
[ + { + name: "computer", + label: "Computer", + summary: + "Explicitly enabled macOS desktop screenshot and input control; off by default and supervisor-gated.", + docsPath: "docs/tools/computer.md", + callableBuiltin: false, + defaultEnabled: false, + }, + ] + : []; export const BUILTIN_TOOLS: Record = { read: s => new ReadTool(s), @@ -352,7 +355,7 @@ export const BUILTIN_TOOLS: Record = { lsp: LspTool.createIf, inspect_image: s => new InspectImageTool(s), browser: s => new BrowserTool(s), - computer: ComputerTool.createIf, + ...(isComputerLoadablePlatform() ? { computer: ComputerTool.createIf } : {}), checkpoint: CheckpointTool.createIf, rewind: RewindTool.createIf, task: s => TaskTool.create(s), diff --git a/packages/coding-agent/test/tool-discovery/initial-tools.test.ts b/packages/coding-agent/test/tool-discovery/initial-tools.test.ts index 307a42f60..938347add 100644 --- a/packages/coding-agent/test/tool-discovery/initial-tools.test.ts +++ b/packages/coding-agent/test/tool-discovery/initial-tools.test.ts @@ -6,6 +6,7 @@ import type { ToolSession } from "../../src/tools/index"; import { AskTool, BUILTIN_TOOLS, + ComputerTool, computeEssentialBuiltinNames, createTools, DEFAULT_ESSENTIAL_TOOL_NAMES, @@ -90,6 +91,7 @@ async function getToolMetadata(): Promise [tool.name, { loadMode: tool.loadMode, summary: tool.summary }])); for (const tool of [ new AskTool({ ...toolSession, hasUI: true }), + new ComputerTool(toolSession), new SshTool(toolSession, [], new Map(), ""), new JobTool(toolSession), new RecipeTool(toolSession, []), diff --git a/packages/coding-agent/test/tools/computer.test.ts b/packages/coding-agent/test/tools/computer.test.ts index 73465821e..59ee2c36c 100644 --- a/packages/coding-agent/test/tools/computer.test.ts +++ b/packages/coding-agent/test/tools/computer.test.ts @@ -6,6 +6,7 @@ import { computerSchema, createTools, isComputerCallable, + isComputerLoadablePlatform, setComputerControllerFactoryForTests, type ToolSession, } from 
"@gajae-code/coding-agent/tools"; @@ -69,10 +70,12 @@ describe("computer tool gating", () => { const tools = await createTools(session); const names = tools.map(t => t.name); expect(names).not.toContain("computer"); - expect(BUILTIN_CAPABILITY_CATALOG.find(entry => entry.name === "computer")).toMatchObject({ - callableBuiltin: false, - defaultEnabled: false, - }); + const catalogEntry = BUILTIN_CAPABILITY_CATALOG.find(entry => entry.name === "computer"); + if (isComputerLoadablePlatform()) { + expect(catalogEntry).toMatchObject({ callableBuiltin: false, defaultEnabled: false }); + } else { + expect(catalogEntry).toBeUndefined(); + } const discoverable = tools.filter(t => t.loadMode === "discoverable").map(t => t.name); expect(discoverable).not.toContain("computer"); }); @@ -92,6 +95,12 @@ describe("computer tool gating", () => { expect(isComputerCallable(createSession(Settings.isolated({ "computer.enabled": true })), "linux")).toBe(false); }); + it("is loadable on macOS and Linux but not loaded at all on Windows", () => { + expect(isComputerLoadablePlatform("darwin")).toBe(true); + expect(isComputerLoadablePlatform("linux")).toBe(true); + expect(isComputerLoadablePlatform("win32")).toBe(false); + }); + it("returns COMPUTER_DISABLED without constructing native controller when directly invoked while disabled", async () => { let constructed = false; setComputerControllerFactoryForTests(() => { From 0e996ba4daee8044c60d326999ac0062a4b60344 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 07:11:35 +0000 Subject: [PATCH 22/23] test(computer): isolate macOS availability in tool tests --- packages/coding-agent/src/tools/computer.ts | 13 +++++++++++-- .../test/tool-discovery/initial-tools.test.ts | 8 ++++++++ packages/coding-agent/test/tools/computer.test.ts | 14 ++++++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/packages/coding-agent/src/tools/computer.ts b/packages/coding-agent/src/tools/computer.ts index f673522ab..10344f0b5 
100644 --- a/packages/coding-agent/src/tools/computer.ts +++ b/packages/coding-agent/src/tools/computer.ts @@ -151,12 +151,21 @@ function createNativeComputerController(): NativeController { } let controllerFactory: ComputerControllerFactory = createNativeComputerController; +let platformOverrideForTests: NodeJS.Platform | undefined; export function setComputerControllerFactoryForTests(factory: ComputerControllerFactory | undefined): void { controllerFactory = factory ?? createNativeComputerController; } -export function isComputerSupportedPlatform(platform: NodeJS.Platform = process.platform): boolean { +export function setComputerPlatformForTests(platform: NodeJS.Platform | undefined): void { + platformOverrideForTests = platform; +} + +function currentComputerPlatform(): NodeJS.Platform { + return platformOverrideForTests ?? process.platform; +} + +export function isComputerSupportedPlatform(platform: NodeJS.Platform = currentComputerPlatform()): boolean { return platform === "darwin"; } @@ -174,7 +183,7 @@ export function isComputerEnabled(session: Pick): boole export function isComputerCallable( session: Pick, - platform: NodeJS.Platform = process.platform, + platform: NodeJS.Platform = currentComputerPlatform(), ): boolean { return isComputerSupportedPlatform(platform) && isComputerEnabled(session); } diff --git a/packages/coding-agent/test/tool-discovery/initial-tools.test.ts b/packages/coding-agent/test/tool-discovery/initial-tools.test.ts index 938347add..3c31a1a44 100644 --- a/packages/coding-agent/test/tool-discovery/initial-tools.test.ts +++ b/packages/coding-agent/test/tool-discovery/initial-tools.test.ts @@ -5,6 +5,7 @@ import { AgentRegistry, MAIN_AGENT_ID } from "../../src/registry/agent-registry" import type { ToolSession } from "../../src/tools/index"; import { AskTool, + BUILTIN_CAPABILITY_CATALOG, BUILTIN_TOOLS, ComputerTool, computeEssentialBuiltinNames, @@ -99,6 +100,13 @@ async function getToolMetadata(): Promise entry.name === "computer"); + 
if (computerCapability) { + metadata.set("computer", { loadMode: "discoverable", summary: computerCapability.summary }); + } return metadata; } describe("BUILTIN_TOOLS public factory map", () => { diff --git a/packages/coding-agent/test/tools/computer.test.ts b/packages/coding-agent/test/tools/computer.test.ts index 59ee2c36c..28fdcbf9c 100644 --- a/packages/coding-agent/test/tools/computer.test.ts +++ b/packages/coding-agent/test/tools/computer.test.ts @@ -8,6 +8,7 @@ import { isComputerCallable, isComputerLoadablePlatform, setComputerControllerFactoryForTests, + setComputerPlatformForTests, type ToolSession, } from "@gajae-code/coding-agent/tools"; import { summarizeComputerDetails } from "@gajae-code/coding-agent/tools/computer/render"; @@ -63,7 +64,10 @@ describe("computer tool schema", () => { }); describe("computer tool gating", () => { - afterEach(() => setComputerControllerFactoryForTests(undefined)); + afterEach(() => { + setComputerControllerFactoryForTests(undefined); + setComputerPlatformForTests(undefined); + }); it("is metadata-only by default and not callable/discoverable", async () => { const session = createSession(Settings.isolated({ "tools.discoveryMode": "all" })); @@ -81,6 +85,7 @@ describe("computer tool gating", () => { }); it("is callable with per-session enable or alwaysOn on macOS", async () => { + setComputerPlatformForTests("darwin"); const enabledNames = (await createTools(createSession(Settings.isolated({ "computer.enabled": true })))).map( t => t.name, ); @@ -117,9 +122,13 @@ describe("computer tool gating", () => { }); describe("computer tool dispatch", () => { - afterEach(() => setComputerControllerFactoryForTests(undefined)); + afterEach(() => { + setComputerControllerFactoryForTests(undefined); + setComputerPlatformForTests(undefined); + }); it("maps snake_case model actions to native controller methods and forwards AbortSignal", async () => { + setComputerPlatformForTests("darwin"); const calls: Array<{ method: string; payload: 
unknown; signal?: AbortSignal }> = []; setComputerControllerFactoryForTests(() => ({ screenshot: (payload, options) => { @@ -146,6 +155,7 @@ describe("computer tool dispatch", () => { }); it("maps native COMPUTER_* errors into bounded tool errors", async () => { + setComputerPlatformForTests("darwin"); setComputerControllerFactoryForTests(() => ({ click: () => { const error = new Error("supervisor is not live") as Error & { code: string }; From b4df5c658413b4a1532ad508f57d70dd9837eae4 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Tue, 16 Jun 2026 07:29:41 +0000 Subject: [PATCH 23/23] test(natives): avoid static macOS computer imports on Linux --- packages/natives/test/computer.test.ts | 28 +++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/packages/natives/test/computer.test.ts b/packages/natives/test/computer.test.ts index 4f83a3681..ce3a52291 100644 --- a/packages/natives/test/computer.test.ts +++ b/packages/natives/test/computer.test.ts @@ -1,10 +1,27 @@ import { describe, expect, it } from "bun:test"; -import { ComputerController, computerScreenshot } from "../native/index.js"; const isMacOS = process.platform === "darwin"; +type NativeComputerModule = { + ComputerController: new () => Record; + computerScreenshot: () => { + widthPx: number; + heightPx: number; + scaleX: number; + scaleY: number; + png: Uint8Array; + displayEpoch: number; + captureId: number; + }; +}; + +async function loadNativeComputerModule(): Promise { + return (await import("../native/index.js")) as unknown as NativeComputerModule; +} + describe.if(isMacOS)("ComputerController napi binding", () => { - it("exists with expected methods", () => { + it("exists with expected methods", async () => { + const { ComputerController } = await loadNativeComputerModule(); const controller = new ComputerController(); expect(controller).toBeInstanceOf(ComputerController); for (const method of [ @@ -18,7 +35,7 @@ describe.if(isMacOS)("ComputerController napi 
binding", () => { "keypress", "wait", ]) { - expect(typeof controller[method as keyof ComputerController]).toBe("function"); + expect(typeof controller[method]).toBe("function"); } }); }); @@ -27,8 +44,9 @@ describe.if(isMacOS)("ComputerController napi binding", () => { // primary display, so it requires the Screen Recording permission. Gate on // platform and skip gracefully when capture is unavailable in the environment. describe.if(isMacOS)("computer screenshot napi binding", () => { - it("returns a decodable PNG whose dimensions match the descriptor", () => { - let shot: ReturnType; + it("returns a decodable PNG whose dimensions match the descriptor", async () => { + const { computerScreenshot } = await loadNativeComputerModule(); + let shot: ReturnType; try { shot = computerScreenshot(); } catch (err) {