diff --git a/crates/pi-natives/src/computer/bypass_guard.rs b/crates/pi-natives/src/computer/bypass_guard.rs new file mode 100644 index 000000000..09a1ad437 --- /dev/null +++ b/crates/pi-natives/src/computer/bypass_guard.rs @@ -0,0 +1,44 @@ +#[cfg(test)] +mod tests { + use std::{fs, path::Path}; + + const SIDE_EFFECT_METHODS: &[&str] = + &[".click(", ".double_click(", ".drag(", ".scroll(", ".type_text(", ".keypress("]; + + #[test] + fn input_controller_side_effect_methods_stay_behind_executor() { + let computer_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("src/computer"); + let mut violations = Vec::new(); + + for entry in fs::read_dir(&computer_dir).expect("computer module directory is readable") { + let entry = entry.expect("computer module entry is readable"); + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) != Some("rs") { + continue; + } + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or_default(); + if file_name == "bypass_guard.rs" { + continue; + } + let source = fs::read_to_string(&path).expect("computer module source is readable"); + for method in SIDE_EFFECT_METHODS { + if !source.contains(method) { + continue; + } + if file_name != "input.rs" && file_name != "executor.rs" { + violations.push(format!("{file_name} references {method}")); + } + } + } + + assert!( + violations.is_empty(), + "InputController side-effect methods must be referenced only in input.rs and \ + executor.rs: {}", + violations.join(", ") + ); + } +} diff --git a/crates/pi-natives/src/computer/capture.rs b/crates/pi-natives/src/computer/capture.rs new file mode 100644 index 000000000..b8dc0a79f --- /dev/null +++ b/crates/pi-natives/src/computer/capture.rs @@ -0,0 +1,309 @@ +//! Primary-display screen capture (macOS). +//! +//! # Overview +//! Read-only capture of the current primary display into a PNG plus the +//! [`NormalizedDisplay`] descriptor whose pixel dimensions define the action +//! 
coordinate space (see [`super::coords`]). The display scale is derived from +//! the captured physical pixel size versus the logical display bounds, so the +//! coordinate contract stays correct on Retina/HiDPI. +//! +//! Capture requires the macOS Screen Recording (TCC) permission. When it is not +//! granted, `CGDisplayCreateImage` returns null and this surfaces +//! [`CaptureError::CaptureFailed`] rather than silently returning a black +//! frame. +//! +//! Implemented with raw CoreGraphics FFI (no extra crates); the buffer is owned +//! Rust memory and every Core Graphics handle is released exactly once. + +use std::{ + collections::hash_map::DefaultHasher, + ffi::c_void, + fmt, + hash::{Hash, Hasher}, + sync::atomic::{AtomicU64, Ordering}, +}; + +use crate::computer::coords::NormalizedDisplay; + +#[repr(C)] +#[derive(Clone, Copy)] +struct CgPoint { + x: f64, + y: f64, +} + +#[repr(C)] +#[derive(Clone, Copy)] +struct CgSize { + width: f64, + height: f64, +} + +#[repr(C)] +#[derive(Clone, Copy)] +struct CgRect { + origin: CgPoint, + size: CgSize, +} + +type CgDirectDisplayId = u32; +type CgImageRef = *mut c_void; +type CgColorSpaceRef = *mut c_void; +type CgContextRef = *mut c_void; + +/// `kCGImageAlphaPremultipliedLast` (1) | `kCGBitmapByteOrder32Big` (4 << 12) +/// yields an RGBA8888 byte layout. 
+const RGBA_BITMAP_INFO: u32 = 1 | (4 << 12); +const BITS_PER_COMPONENT: usize = 8; +const BYTES_PER_PIXEL: usize = 4; + +#[link(name = "CoreGraphics", kind = "framework")] +unsafe extern "C" { + fn CGMainDisplayID() -> CgDirectDisplayId; + fn CGDisplayBounds(display: CgDirectDisplayId) -> CgRect; + fn CGDisplayCreateImage(display: CgDirectDisplayId) -> CgImageRef; + fn CGDisplayPixelsWide(display: CgDirectDisplayId) -> usize; + fn CGDisplayPixelsHigh(display: CgDirectDisplayId) -> usize; + fn CGImageGetWidth(image: CgImageRef) -> usize; + fn CGImageGetHeight(image: CgImageRef) -> usize; + fn CGImageRelease(image: CgImageRef); + fn CGColorSpaceCreateDeviceRGB() -> CgColorSpaceRef; + fn CGColorSpaceRelease(space: CgColorSpaceRef); + fn CGBitmapContextCreate( + data: *mut c_void, + width: usize, + height: usize, + bits_per_component: usize, + bytes_per_row: usize, + space: CgColorSpaceRef, + bitmap_info: u32, + ) -> CgContextRef; + fn CGContextDrawImage(context: CgContextRef, rect: CgRect, image: CgImageRef); + fn CGContextRelease(context: CgContextRef); +} + +/// Reason a primary-display capture failed. +#[derive(Debug, Clone)] +pub enum CaptureError { + /// `CGDisplayCreateImage` returned null or a zero-sized image — commonly the + /// Screen Recording permission is not granted. + CaptureFailed, + /// A Core Graphics color space or bitmap context could not be created. + ContextFailed, + /// The captured frame could not be PNG-encoded. 
+ Encode(String), +} + +impl fmt::Display for CaptureError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::CaptureFailed => { + write!(f, "screen capture failed; the Screen Recording permission may not be granted") + }, + Self::ContextFailed => write!(f, "failed to create a Core Graphics bitmap context"), + Self::Encode(reason) => write!(f, "failed to encode captured frame as PNG: {reason}"), + } + } +} + +impl std::error::Error for CaptureError {} + +static NEXT_CAPTURE_ID: AtomicU64 = AtomicU64::new(1); + +/// A captured primary-display frame. +pub struct CapturedFrame { + /// Coordinate descriptor for the captured display. + pub display: NormalizedDisplay, + /// PNG-encoded RGBA image bytes. + pub png: Vec, + /// Stable hash of the display geometry used for stale-display checks. + pub display_epoch: u64, + /// Process-local opaque capture id. + pub capture_id: u32, +} + +/// Capture the current primary display as a PNG plus its coordinate descriptor. +/// +/// # Errors +/// Returns [`CaptureError`] when the OS capture call fails (often a missing +/// Screen Recording grant), a bitmap context cannot be created, or PNG encoding +/// fails. +pub fn capture_primary_display() -> Result { + // SAFETY: pure Core Graphics geometry queries for the active primary display; + // no image capture occurs before `CGDisplayCreateImage` below. + let (display_id, display) = unsafe { + let id = CGMainDisplayID(); + let bounds = CGDisplayBounds(id); + let pixels_wide = CGDisplayPixelsWide(id); + let pixels_high = CGDisplayPixelsHigh(id); + (id, display_descriptor(pixels_wide, pixels_high, bounds)) + }; + + let display_epoch = display_epoch(&display); + let capture_id = next_capture_id(); + + // SAFETY: `display_id` is a valid primary-display id. The returned image is + // released exactly once below regardless of the `frame_from_image` result. 
+ let image = unsafe { CGDisplayCreateImage(display_id) }; + if image.is_null() { + return Err(CaptureError::CaptureFailed); + } + + let result = frame_from_image(image, display, display_epoch, capture_id); + + // SAFETY: `image` is non-null (checked above) and not used after release. + unsafe { CGImageRelease(image) }; + result +} + +#[must_use] +pub fn current_display_epoch() -> u64 { + let display = current_display_descriptor(); + display_epoch(&display) +} + +/// Convert a non-null `CGImage` into a [`CapturedFrame`]. Does not release +/// `image`; the caller owns its lifetime. +fn frame_from_image( + image: CgImageRef, + display: NormalizedDisplay, + display_epoch: u64, + capture_id: u32, +) -> Result { + // SAFETY: `image` is non-null per the caller's check. + let (width, height) = unsafe { (CGImageGetWidth(image), CGImageGetHeight(image)) }; + if width == 0 || height == 0 { + return Err(CaptureError::CaptureFailed); + } + + let bytes_per_row = width * BYTES_PER_PIXEL; + let mut buffer = vec![0u8; bytes_per_row * height]; + + // SAFETY: device RGB color space; released on every path below. + let space = unsafe { CGColorSpaceCreateDeviceRGB() }; + if space.is_null() { + return Err(CaptureError::ContextFailed); + } + + // SAFETY: `buffer` is exactly `bytes_per_row * height` bytes, matching the + // dimensions/stride passed here; `space` is non-null. + let context = unsafe { + CGBitmapContextCreate( + buffer.as_mut_ptr().cast::(), + width, + height, + BITS_PER_COMPONENT, + bytes_per_row, + space, + RGBA_BITMAP_INFO, + ) + }; + if context.is_null() { + // SAFETY: `space` is non-null and released exactly once here. + unsafe { CGColorSpaceRelease(space) }; + return Err(CaptureError::ContextFailed); + } + + let rect = CgRect { + origin: CgPoint { x: 0.0, y: 0.0 }, + size: CgSize { width: width as f64, height: height as f64 }, + }; + // SAFETY: `context` and `image` are non-null; `rect` matches the buffer the + // context was created over, so the draw stays in bounds. 
+ unsafe { CGContextDrawImage(context, rect, image) }; + + // SAFETY: both handles are non-null and released exactly once; not used after. + unsafe { + CGContextRelease(context); + CGColorSpaceRelease(space); + } + + let png = encode_png(&buffer, width as u32, height as u32)?; + + Ok(CapturedFrame { display, png, display_epoch, capture_id }) +} + +/// Scale = physical pixels / logical points, defaulting to `1.0` when the +/// logical extent is not positive. +fn derive_scale(pixels: f64, logical: f64) -> f64 { + if logical > 0.0 { pixels / logical } else { 1.0 } +} + +fn current_display_descriptor() -> NormalizedDisplay { + // SAFETY: pure Core Graphics geometry queries for the active primary display; + // no image capture or Screen Recording permission is involved. + unsafe { + let display_id = CGMainDisplayID(); + let bounds = CGDisplayBounds(display_id); + display_descriptor(CGDisplayPixelsWide(display_id), CGDisplayPixelsHigh(display_id), bounds) + } +} + +fn display_descriptor(width: usize, height: usize, bounds: CgRect) -> NormalizedDisplay { + let scale_x = derive_scale(width as f64, bounds.size.width); + let scale_y = derive_scale(height as f64, bounds.size.height); + NormalizedDisplay::new( + width as u32, + height as u32, + scale_x, + scale_y, + bounds.origin.x, + bounds.origin.y, + ) +} + +fn display_epoch(display: &NormalizedDisplay) -> u64 { + let mut hasher = DefaultHasher::new(); + display.width_px.hash(&mut hasher); + display.height_px.hash(&mut hasher); + display.scale_x.to_bits().hash(&mut hasher); + display.scale_y.to_bits().hash(&mut hasher); + display.origin_x.to_bits().hash(&mut hasher); + display.origin_y.to_bits().hash(&mut hasher); + hasher.finish() +} + +fn next_capture_id() -> u32 { + let id = NEXT_CAPTURE_ID.fetch_add(1, Ordering::Relaxed); + ((id - 1) % u64::from(u32::MAX) + 1) as u32 +} + +fn encode_png(rgba: &[u8], width: u32, height: u32) -> Result, CaptureError> { + use image::{ExtendedColorType, ImageEncoder, 
codecs::png::PngEncoder}; + + let mut out = Vec::new(); + PngEncoder::new(&mut out) + .write_image(rgba, width, height, ExtendedColorType::Rgba8) + .map_err(|err| CaptureError::Encode(err.to_string()))?; + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::capture_primary_display; + + /// Exercises the real OS capture path, so it is ignored by default and run + /// explicitly (`cargo test -p pi-natives --ignored`) on a macOS host with + /// Screen Recording granted. + #[test] + #[ignore = "captures the real primary display; needs macOS + Screen Recording grant"] + fn captures_non_uniform_primary_display() { + let frame = capture_primary_display() + .expect("capture should succeed when Screen Recording is granted"); + assert!(frame.display.width_px > 0 && frame.display.height_px > 0); + + let decoded = image::load_from_memory(&frame.png).expect("captured bytes decode as PNG"); + assert_eq!(decoded.width(), frame.display.width_px); + assert_eq!(decoded.height(), frame.display.height_px); + + let rgba = decoded.to_rgba8(); + let first = rgba.pixels().next().copied(); + let non_uniform = rgba.pixels().any(|pixel| Some(*pixel) != first); + assert!( + non_uniform, + "captured frame is uniform (black/blank) — Screen Recording likely not granted" + ); + + std::fs::write("/tmp/computer-capture-evidence.png", &frame.png).ok(); + } +} diff --git a/crates/pi-natives/src/computer/controller.rs b/crates/pi-natives/src/computer/controller.rs new file mode 100644 index 000000000..1faaa921b --- /dev/null +++ b/crates/pi-natives/src/computer/controller.rs @@ -0,0 +1,169 @@ +//! N-API controller surface for macOS computer-use. +//! +//! Side-effecting methods are thin adapters: they construct an [`InputAction`] +//! and delegate to [`execute_input`]. No direct input controller methods are +//! called from this module. 
+ +use napi::bindgen_prelude::Uint8Array; +use napi_derive::napi; + +use crate::computer::{ + ComputerScreenshot, + capture::capture_primary_display, + executor::{ExecError, InputAction, MacDisplayContext, MacPermissionGate, execute_input}, + hotkey, + input::{MouseButton, guarded_controller}, + supervisor::Supervisor, +}; + +#[napi] +pub struct ComputerController; + +#[napi] +impl ComputerController { + #[napi(constructor)] + pub fn new() -> Self { + Self + } + + #[napi] + pub fn screenshot(&self) -> napi::Result { + let frame = + capture_primary_display().map_err(|err| napi::Error::from_reason(format!("{err}")))?; + Ok(ComputerScreenshot { + png: Uint8Array::from(frame.png), + width_px: frame.display.width_px, + height_px: frame.display.height_px, + scale_x: frame.display.scale_x, + scale_y: frame.display.scale_y, + origin_x: frame.display.origin_x, + origin_y: frame.display.origin_y, + display_epoch: frame.display_epoch as f64, + capture_id: frame.capture_id, + }) + } + + #[napi] + pub fn click( + &self, + expected_epoch: Option, + x: f64, + y: f64, + button: Option, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Click { x, y, button: parse_button(button)? }) + } + + #[napi(js_name = "doubleClick")] + pub fn double_click( + &self, + expected_epoch: Option, + x: f64, + y: f64, + button: Option, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::DoubleClick { x, y, button: parse_button(button)? 
}) + } + + #[napi] + pub fn move_(&self, expected_epoch: Option, x: f64, y: f64) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Move { x, y }) + } + + #[napi] + pub fn drag( + &self, + expected_epoch: Option, + x: f64, + y: f64, + to_x: f64, + to_y: f64, + button: Option, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Drag { + x, + y, + to_x, + to_y, + button: parse_button(button)?, + }) + } + + #[napi] + pub fn scroll( + &self, + expected_epoch: Option, + x: f64, + y: f64, + scroll_x: f64, + scroll_y: f64, + ) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Scroll { x, y, scroll_x, scroll_y }) + } + + #[napi(js_name = "type")] + pub fn type_(&self, expected_epoch: Option, text: String) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Type { text }) + } + + #[napi] + pub fn keypress(&self, expected_epoch: Option, keys: Vec) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Keypress { keys }) + } + + #[napi] + pub fn wait(&self, expected_epoch: Option, ms: u32) -> napi::Result<()> { + self.execute(expected_epoch, InputAction::Wait { ms: u64::from(ms) }) + } + + fn execute(&self, expected_epoch: Option, action: InputAction) -> napi::Result<()> { + hotkey::start(); + let frame = + capture_primary_display().map_err(|err| napi::Error::from_reason(format!("{err}")))?; + let display = frame.display; + let mut controller = guarded_controller() + .map_err(|err| napi_error("COMPUTER_PERMISSION_REQUIRED", err.to_string()))?; + let cancel = || Supervisor::global().is_suspended(); + execute_input( + &action, + Supervisor::global(), + &MacPermissionGate, + &MacDisplayContext, + expected_epoch.map(epoch_from_f64), + &display, + &mut controller, + &cancel, + ) + .map_err(exec_error) + } +} + +fn parse_button(button: Option) -> napi::Result { + match button + .as_deref() + .unwrap_or("left") + .to_ascii_lowercase() + .as_str() + { + "left" => Ok(MouseButton::Left), + "right" => 
Ok(MouseButton::Right), + "center" | "middle" => Ok(MouseButton::Center), + other => Err(napi_error("COMPUTER_COORD_INVALID", format!("unknown mouse button: {other}"))), + } +} + +fn epoch_from_f64(value: f64) -> u64 { + if value.is_finite() && value >= 0.0 { + value as u64 + } else { + u64::MAX + } +} + +fn exec_error(err: ExecError) -> napi::Error { + napi_error(err.code(), err.to_string()) +} + +fn napi_error(code: &'static str, reason: String) -> napi::Error { + napi::Error::new(napi::Status::GenericFailure, format!("{code}: {reason}")) +} diff --git a/crates/pi-natives/src/computer/coords.rs b/crates/pi-natives/src/computer/coords.rs new file mode 100644 index 000000000..152779fc7 --- /dev/null +++ b/crates/pi-natives/src/computer/coords.rs @@ -0,0 +1,267 @@ +//! Coordinate contract for the native computer-use tool. +//! +//! # Overview +//! The computer-use tool exposes a single *normalized virtual display* to the +//! model: the dimensions of the returned screenshot (in pixels) define the +//! action coordinate space. Every model-supplied `x`/`y` is a pixel in that +//! screenshot. macOS input injection (`CGEvent`) operates in *logical points*, +//! not physical pixels, so on Retina/HiDPI displays a screenshot pixel and a +//! logical point differ by the display scale factor. This module owns the one +//! authoritative transform from screenshot pixels to macOS logical points, plus +//! strict bounds rejection. +//! +//! It is deliberately framework-free (no `CoreGraphics`, no napi) so the +//! coordinate math is unit-testable without a display or granted permissions. +//! The native capture/input backend that produces [`NormalizedDisplay`] values +//! lands in a later slice (see `docs/computer-use/`). +//! +//! # Example +//! ``` +//! use pi_natives::computer::coords::NormalizedDisplay; +//! +//! // A 200x100-point Retina display captured at 2x => 400x200 screenshot px. +//! let display = NormalizedDisplay::new(400, 200, 2.0, 2.0, 0.0, 0.0); +//! 
let point = display.to_logical_point(100.0, 50.0).unwrap(); +//! assert!((point.x - 50.0).abs() < 0.5); +//! assert!((point.y - 25.0).abs() < 0.5); +//! ``` + +use core::fmt; + +/// A point in macOS logical (point) coordinate space, suitable for `CGEvent` +/// injection by the native input backend. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct LogicalPoint { + /// Logical X (points), including the display's logical origin. + pub x: f64, + /// Logical Y (points), including the display's logical origin. + pub y: f64, +} + +/// Reason a screenshot-space pixel could not be mapped to a logical point. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum CoordError { + /// The pixel coordinate is outside the normalized display bounds, or not a + /// finite number. Side-effecting actions must reject rather than clamp. + OutOfBounds { + /// Offending X pixel. + x: f64, + /// Offending Y pixel. + y: f64, + /// Normalized display width in pixels. + width_px: u32, + /// Normalized display height in pixels. + height_px: u32, + }, + /// The display descriptor has a non-positive or non-finite scale factor, so + /// no correct transform exists. + InvalidScale { + /// Offending X scale. + scale_x: f64, + /// Offending Y scale. + scale_y: f64, + }, +} + +impl fmt::Display for CoordError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Self::OutOfBounds { x, y, width_px, height_px } => write!( + f, + "pixel ({x}, {y}) is out of bounds for a {width_px}x{height_px} normalized display" + ), + Self::InvalidScale { scale_x, scale_y } => { + write!(f, "invalid display scale ({scale_x}, {scale_y}); must be finite and > 0") + }, + } + } +} + +impl std::error::Error for CoordError {} + +/// Descriptor of the single normalized virtual display whose screenshot pixels +/// define the action coordinate space. 
+/// +/// `scale_x`/`scale_y` are the per-axis ratios of physical screenshot pixels to +/// logical points (typically `1.0` on non-Retina and `2.0` on Retina). +/// `origin_x`/`origin_y` are the display's logical origin, preserved so the +/// transform stays correct for non-zero display origins. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct NormalizedDisplay { + /// Screenshot width in physical pixels. + pub width_px: u32, + /// Screenshot height in physical pixels. + pub height_px: u32, + /// Physical-pixels-per-logical-point along X. + pub scale_x: f64, + /// Physical-pixels-per-logical-point along Y. + pub scale_y: f64, + /// Logical origin X of the display (points). + pub origin_x: f64, + /// Logical origin Y of the display (points). + pub origin_y: f64, +} + +impl NormalizedDisplay { + /// Construct a descriptor from raw capture geometry. + #[must_use] + pub const fn new( + width_px: u32, + height_px: u32, + scale_x: f64, + scale_y: f64, + origin_x: f64, + origin_y: f64, + ) -> Self { + Self { width_px, height_px, scale_x, scale_y, origin_x, origin_y } + } + + /// Whether both scale factors are finite and strictly positive. + #[must_use] + pub fn has_valid_scale(&self) -> bool { + self.scale_x.is_finite() + && self.scale_x > 0.0 + && self.scale_y.is_finite() + && self.scale_y > 0.0 + } + + /// Whether `(x, y)` is a finite pixel inside `[0, width_px) x [0, + /// height_px)`. + #[must_use] + pub fn contains(&self, x: f64, y: f64) -> bool { + // `Range::contains` is false for NaN, so non-finite pixels are rejected too. + (0.0..f64::from(self.width_px)).contains(&x) && (0.0..f64::from(self.height_px)).contains(&y) + } + + /// Map a screenshot-space pixel to a macOS logical point. + /// + /// # Errors + /// Returns [`CoordError::InvalidScale`] when the descriptor's scale is not + /// finite and positive, or [`CoordError::OutOfBounds`] when `(x, y)` is not + /// a finite pixel inside the display bounds. 
+ pub fn to_logical_point(&self, x: f64, y: f64) -> Result { + if !self.has_valid_scale() { + return Err(CoordError::InvalidScale { scale_x: self.scale_x, scale_y: self.scale_y }); + } + if !self.contains(x, y) { + return Err(CoordError::OutOfBounds { + x, + y, + width_px: self.width_px, + height_px: self.height_px, + }); + } + Ok(LogicalPoint { x: self.origin_x + x / self.scale_x, y: self.origin_y + y / self.scale_y }) + } +} + +#[cfg(test)] +mod tests { + use super::{CoordError, NormalizedDisplay}; + + /// Logical points must match the expected value well within the 0.5-point + /// accuracy tolerance the plan requires. + const TOLERANCE: f64 = 0.5; + + fn assert_close(actual: f64, expected: f64) { + assert!((actual - expected).abs() < TOLERANCE, "expected ~{expected}, got {actual}"); + } + + #[test] + fn identity_scale_zero_origin() { + let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0); + let p = display.to_logical_point(40.0, 60.0).unwrap(); + assert_close(p.x, 40.0); + assert_close(p.y, 60.0); + } + + #[test] + fn retina_scale_halves_pixels() { + let display = NormalizedDisplay::new(400, 200, 2.0, 2.0, 0.0, 0.0); + let p = display.to_logical_point(100.0, 50.0).unwrap(); + assert_close(p.x, 50.0); + assert_close(p.y, 25.0); + } + + #[test] + fn fractional_scale() { + let display = NormalizedDisplay::new(300, 150, 1.5, 1.5, 0.0, 0.0); + let p = display.to_logical_point(150.0, 75.0).unwrap(); + assert_close(p.x, 100.0); + assert_close(p.y, 50.0); + } + + #[test] + fn non_zero_origin_is_preserved() { + let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 10.0, 20.0); + let p = display.to_logical_point(5.0, 5.0).unwrap(); + assert_close(p.x, 15.0); + assert_close(p.y, 25.0); + } + + #[test] + fn anisotropic_scale_per_axis() { + let display = NormalizedDisplay::new(200, 100, 2.0, 1.0, 0.0, 0.0); + let p = display.to_logical_point(100.0, 40.0).unwrap(); + assert_close(p.x, 50.0); + assert_close(p.y, 40.0); + } + + #[test] + fn 
top_left_edge_is_inside() { + let display = NormalizedDisplay::new(100, 100, 2.0, 2.0, 0.0, 0.0); + assert!(display.to_logical_point(0.0, 0.0).is_ok()); + } + + #[test] + fn bottom_right_inclusive_pixel_is_inside() { + let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0); + assert!(display.to_logical_point(99.0, 99.0).is_ok()); + } + + #[test] + fn width_height_pixel_is_out_of_bounds() { + let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0); + assert!(matches!(display.to_logical_point(100.0, 0.0), Err(CoordError::OutOfBounds { .. }))); + assert!(matches!(display.to_logical_point(0.0, 100.0), Err(CoordError::OutOfBounds { .. }))); + } + + #[test] + fn negative_pixel_is_out_of_bounds() { + let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0); + assert!(matches!(display.to_logical_point(-1.0, 10.0), Err(CoordError::OutOfBounds { .. }))); + } + + #[test] + fn non_finite_pixel_is_out_of_bounds() { + let display = NormalizedDisplay::new(100, 100, 1.0, 1.0, 0.0, 0.0); + assert!(matches!( + display.to_logical_point(f64::NAN, 10.0), + Err(CoordError::OutOfBounds { .. }) + )); + assert!(matches!( + display.to_logical_point(10.0, f64::INFINITY), + Err(CoordError::OutOfBounds { .. }) + )); + } + + #[test] + fn invalid_scale_is_rejected() { + for (sx, sy) in [(0.0, 1.0), (1.0, -2.0), (f64::NAN, 1.0)] { + let display = NormalizedDisplay::new(100, 100, sx, sy, 0.0, 0.0); + assert!(matches!( + display.to_logical_point(10.0, 10.0), + Err(CoordError::InvalidScale { .. }) + )); + } + } + + #[test] + fn invalid_scale_takes_priority_over_bounds() { + let display = NormalizedDisplay::new(100, 100, 0.0, 1.0, 0.0, 0.0); + assert!(matches!( + display.to_logical_point(999.0, 999.0), + Err(CoordError::InvalidScale { .. 
}) + )); + } +} diff --git a/crates/pi-natives/src/computer/executor.rs b/crates/pi-natives/src/computer/executor.rs new file mode 100644 index 000000000..87b0e7ede --- /dev/null +++ b/crates/pi-natives/src/computer/executor.rs @@ -0,0 +1,462 @@ +//! Central supervisor-gated execution for computer-use input. +//! +//! # Single side-effect authority +//! Every side-effecting input action passes [`execute_input`] before the +//! [`InputController`] touches the OS. The gate is fail-closed: it requires the +//! supervisor stop-path live + fresh + not-suspended, Accessibility granted, +//! and (for coordinate actions) a matching display epoch. `release_all` runs on +//! every non-success exit and whenever suspension is observed mid-flight, so a +//! partial drag never leaves a button held. Screenshot is read-only (see +//! [`super::capture`]) and is intentionally NOT gated here. +//! +//! The gate logic is OS-agnostic and unit-tested with a fake permission gate, +//! fake display context, a real [`Supervisor`], and a recording [`EventSink`]; +//! macOS supplies the concrete permission/display providers. + +use super::{ + coords::{CoordError, NormalizedDisplay}, + input::{EventSink, InputController, InputError, MouseButton}, + supervisor::Supervisor, +}; + +/// A side-effecting computer-use action (the 8 input primitives). Screenshot is +/// handled by the read-only capture path, not this executor. +#[derive(Debug, Clone, PartialEq)] +pub enum InputAction { + /// Move + click. + Click { x: f64, y: f64, button: MouseButton }, + /// Move + double click. + DoubleClick { x: f64, y: f64, button: MouseButton }, + /// Move the cursor. + Move { x: f64, y: f64 }, + /// Press, drag, release. + Drag { x: f64, y: f64, to_x: f64, to_y: f64, button: MouseButton }, + /// Move + scroll by logical deltas. + Scroll { x: f64, y: f64, scroll_x: f64, scroll_y: f64 }, + /// Type a unicode string. + Type { text: String }, + /// Press/release named keys in order. 
+ Keypress { keys: Vec }, + /// Abort-aware wait. + Wait { ms: u64 }, +} + +impl InputAction { + /// Whether the action targets a screenshot-space coordinate (and so needs a + /// fresh, matching display epoch). + #[must_use] + pub const fn is_coordinate(&self) -> bool { + matches!( + self, + Self::Click { .. } + | Self::DoubleClick { .. } + | Self::Move { .. } + | Self::Drag { .. } + | Self::Scroll { .. } + ) + } +} + +/// Reason an action was rejected or failed. Each maps to a stable error code so +/// the TS tool can surface consistent, actionable messages. +#[derive(Debug, Clone, PartialEq)] +pub enum ExecError { + /// Kill-switch latched; input stays off until a user-only reset. + Suspended, + /// The global stop path is not live/fresh; input is disabled fail-closed. + SupervisorNotLive, + /// Accessibility is not granted; no input may be injected. + PermissionRequired, + /// The display changed since the screenshot the coordinates came from. + DisplayStale, + /// A coordinate was out of bounds / non-finite / invalid scale. + Coord(CoordError), + /// The action was cancelled (AbortSignal/timeout/supervisor stop). + Cancelled, + /// A key name was not recognized. + UnknownKey(String), +} + +impl ExecError { + /// Stable error code string for the TS surface. 
+ #[must_use] + pub const fn code(&self) -> &'static str { + match self { + Self::Suspended => "COMPUTER_SUSPENDED", + Self::SupervisorNotLive => "COMPUTER_SUPERVISOR_NOT_LIVE", + Self::PermissionRequired => "COMPUTER_PERMISSION_REQUIRED", + Self::DisplayStale => "COMPUTER_DISPLAY_STALE", + Self::Coord(_) => "COMPUTER_COORD_INVALID", + Self::Cancelled => "COMPUTER_CANCELLED", + Self::UnknownKey(_) => "COMPUTER_UNKNOWN_KEY", + } + } +} + +impl From for ExecError { + fn from(value: InputError) -> Self { + match value { + InputError::Coord(err) => Self::Coord(err), + InputError::UnknownKey(key) => Self::UnknownKey(key), + } + } +} + +impl std::fmt::Display for ExecError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Coord(err) => write!(f, "{}: {err}", self.code()), + Self::UnknownKey(key) => write!(f, "{}: {key}", self.code()), + _ => write!(f, "{}", self.code()), + } + } +} + +impl std::error::Error for ExecError {} + +/// Provides the current Accessibility (input) grant state. macOS implements +/// this over `permissions::accessibility_granted`; tests inject a fake. +pub trait PermissionGate { + /// Whether Accessibility is currently granted. + fn accessibility_granted(&self) -> bool; +} + +/// Provides the current display epoch so coordinate actions can reject stale +/// screenshots. macOS implements this over the capture/display descriptor. +pub trait DisplayContext { + /// The current display epoch (hash of topology/scale/origin). 
+ fn current_epoch(&self) -> u64; +} + +#[cfg(target_os = "macos")] +pub struct MacPermissionGate; + +#[cfg(target_os = "macos")] +impl PermissionGate for MacPermissionGate { + fn accessibility_granted(&self) -> bool { + crate::computer::permissions::accessibility_granted() + } +} + +#[cfg(target_os = "macos")] +pub struct MacDisplayContext; + +#[cfg(target_os = "macos")] +impl DisplayContext for MacDisplayContext { + fn current_epoch(&self) -> u64 { + crate::computer::capture::current_display_epoch() + } +} + +/// Fail-closed gate run before any side-effecting input. +fn gate( + action: &InputAction, + supervisor: &Supervisor, + perms: &P, + display_ctx: &D, + expected_epoch: Option, +) -> Result<(), ExecError> { + let status = supervisor.status(); + if status.suspended { + return Err(ExecError::Suspended); + } + if !status.hotkey_live || !status.heartbeat_fresh { + return Err(ExecError::SupervisorNotLive); + } + if !perms.accessibility_granted() { + return Err(ExecError::PermissionRequired); + } + if action.is_coordinate() + && let Some(expected) = expected_epoch + && display_ctx.current_epoch() != expected + { + return Err(ExecError::DisplayStale); + } + Ok(()) +} + +/// Execute a side-effecting input action through the fail-closed gate. +/// +/// `cancelled` is polled before and (for multi-step actions) reflected via the +/// controller; on any error or observed suspension, `release_all` runs so no +/// mouse button or modifier is left held. +/// +/// # Errors +/// Returns [`ExecError`] when the gate rejects (suspended / not-live / +/// permission / stale display), the action is cancelled, or the controller +/// reports a coordinate/key error. 
+pub fn execute_input( + action: &InputAction, + supervisor: &Supervisor, + perms: &P, + display_ctx: &D, + expected_epoch: Option, + display: &NormalizedDisplay, + controller: &mut InputController, + cancelled: &dyn Fn() -> bool, +) -> Result<(), ExecError> +where + S: EventSink, + P: PermissionGate, + D: DisplayContext, +{ + gate(action, supervisor, perms, display_ctx, expected_epoch)?; + if cancelled() { + return Err(ExecError::Cancelled); + } + + let result = dispatch(action, display, controller, cancelled); + + // release_all on any failure, or if the kill-switch latched mid-action. + if result.is_err() || supervisor.is_suspended() { + controller.release_all(); + } + result +} + +fn dispatch( + action: &InputAction, + display: &NormalizedDisplay, + controller: &mut InputController, + cancelled: &dyn Fn() -> bool, +) -> Result<(), ExecError> { + match action { + InputAction::Click { x, y, button } => controller + .click(display, *x, *y, *button) + .map_err(Into::into), + InputAction::DoubleClick { x, y, button } => controller + .double_click(display, *x, *y, *button) + .map_err(Into::into), + InputAction::Move { x, y } => controller.move_to(display, *x, *y).map_err(Into::into), + InputAction::Drag { x, y, to_x, to_y, button } => controller + .drag(display, *x, *y, *to_x, *to_y, *button) + .map_err(Into::into), + InputAction::Scroll { x, y, scroll_x, scroll_y } => controller + .scroll(display, *x, *y, *scroll_x, *scroll_y) + .map_err(Into::into), + InputAction::Type { text } => { + controller.type_text(text); + Ok(()) + }, + InputAction::Keypress { keys } => controller.keypress(keys).map_err(Into::into), + InputAction::Wait { ms } => wait_abortable(*ms, cancelled), + } +} + +/// Sleep up to `ms`, checking `cancelled` periodically. 
+fn wait_abortable(ms: u64, cancelled: &dyn Fn() -> bool) -> Result<(), ExecError> { + use std::time::{Duration, Instant}; + let deadline = Instant::now() + Duration::from_millis(ms); + while Instant::now() < deadline { + if cancelled() { + return Err(ExecError::Cancelled); + } + std::thread::sleep(Duration::from_millis(ms.min(10))); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::{DisplayContext, ExecError, InputAction, PermissionGate, execute_input}; + use crate::computer::{ + coords::{LogicalPoint, NormalizedDisplay}, + input::{EventSink, InputController, MouseButton, SinkOp}, + supervisor::Supervisor, + }; + + struct FakePerms { + granted: bool, + } + impl PermissionGate for FakePerms { + fn accessibility_granted(&self) -> bool { + self.granted + } + } + + struct FakeDisplay { + epoch: u64, + } + impl DisplayContext for FakeDisplay { + fn current_epoch(&self) -> u64 { + self.epoch + } + } + + #[derive(Default)] + struct RecordingSink { + ops: Vec, + } + impl EventSink for RecordingSink { + fn move_cursor(&mut self, to: LogicalPoint) { + self.ops.push(SinkOp::Move(to)); + } + + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool) { + self.ops.push(SinkOp::Button { at, button, down }); + } + + fn scroll(&mut self, dx: f64, dy: f64) { + self.ops.push(SinkOp::Scroll { dx, dy }); + } + + fn type_unicode(&mut self, text: &str) { + self.ops.push(SinkOp::TypeUnicode(text.to_string())); + } + + fn key(&mut self, code: u16, down: bool) { + self.ops.push(SinkOp::Key { code, down }); + } + } + + fn display() -> NormalizedDisplay { + NormalizedDisplay::new(200, 100, 2.0, 2.0, 0.0, 0.0) + } + + fn live_supervisor() -> Supervisor { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat(); + s + } + + fn never_cancel() -> impl Fn() -> bool { + || false + } + + fn run( + action: &InputAction, + sup: &Supervisor, + granted: bool, + expected_epoch: Option, + current_epoch: u64, + ) -> (Result<(), ExecError>, Vec) { + let mut 
controller = InputController::new(RecordingSink::default()); + let perms = FakePerms { granted }; + let disp_ctx = FakeDisplay { epoch: current_epoch }; + let cancel = never_cancel(); + let res = execute_input( + action, + sup, + &perms, + &disp_ctx, + expected_epoch, + &display(), + &mut controller, + &cancel, + ); + (res, controller.into_sink().ops) + } + + #[test] + fn suspended_rejects_before_any_sink_op() { + let sup = live_supervisor(); + sup.trigger_stop(); + let (res, ops) = run(&InputAction::Move { x: 10.0, y: 10.0 }, &sup, true, None, 0); + assert_eq!(res, Err(ExecError::Suspended)); + assert!(ops.is_empty(), "no events when suspended"); + } + + #[test] + fn not_live_rejects() { + let sup = Supervisor::new(); // hotkey not live + let (res, ops) = run( + &InputAction::Click { x: 1.0, y: 1.0, button: MouseButton::Left }, + &sup, + true, + None, + 0, + ); + assert_eq!(res, Err(ExecError::SupervisorNotLive)); + assert!(ops.is_empty()); + } + + #[test] + fn missing_accessibility_rejects() { + let sup = live_supervisor(); + let (res, ops) = run(&InputAction::Move { x: 1.0, y: 1.0 }, &sup, false, None, 0); + assert_eq!(res, Err(ExecError::PermissionRequired)); + assert!(ops.is_empty()); + } + + #[test] + fn stale_display_epoch_rejects_coordinate_action() { + let sup = live_supervisor(); + let (res, ops) = run( + &InputAction::Click { x: 1.0, y: 1.0, button: MouseButton::Left }, + &sup, + true, + Some(7), + 9, + ); + assert_eq!(res, Err(ExecError::DisplayStale)); + assert!(ops.is_empty()); + } + + #[test] + fn matching_epoch_allows_action() { + let sup = live_supervisor(); + let (res, ops) = run( + &InputAction::Click { x: 100.0, y: 50.0, button: MouseButton::Left }, + &sup, + true, + Some(7), + 7, + ); + assert!(res.is_ok()); + assert!(!ops.is_empty()); + } + + #[test] + fn out_of_bounds_coordinate_errors_and_releases() { + let sup = live_supervisor(); + // drag to out-of-bounds: press happens then error -> release_all leaves nothing + // held. 
+ let action = InputAction::Drag { + x: 0.0, + y: 0.0, + to_x: 999.0, + to_y: 0.0, + button: MouseButton::Left, + }; + let (res, ops) = run(&action, &sup, true, None, 0); + assert!(matches!(res, Err(ExecError::Coord(_)))); + let downs = ops + .iter() + .filter(|o| matches!(o, SinkOp::Button { down: true, .. })) + .count(); + let ups = ops + .iter() + .filter(|o| matches!(o, SinkOp::Button { down: false, .. })) + .count(); + assert_eq!(downs, ups, "every press is released after the error path"); + } + + #[test] + fn type_and_keypress_pass_the_gate() { + let sup = live_supervisor(); + let (res, ops) = run(&InputAction::Type { text: "hi".to_string() }, &sup, true, None, 0); + assert!(res.is_ok()); + assert_eq!(ops, vec![SinkOp::TypeUnicode("hi".to_string())]); + + let (res2, ops2) = + run(&InputAction::Keypress { keys: vec!["enter".to_string()] }, &sup, true, None, 0); + assert!(res2.is_ok()); + assert_eq!(ops2.len(), 2); // key down + up + } + + #[test] + fn wait_zero_is_ok() { + let sup = live_supervisor(); + let (res, _) = run(&InputAction::Wait { ms: 0 }, &sup, true, None, 0); + assert!(res.is_ok()); + } + + #[test] + fn error_codes_are_stable() { + assert_eq!(ExecError::Suspended.code(), "COMPUTER_SUSPENDED"); + assert_eq!(ExecError::SupervisorNotLive.code(), "COMPUTER_SUPERVISOR_NOT_LIVE"); + assert_eq!(ExecError::PermissionRequired.code(), "COMPUTER_PERMISSION_REQUIRED"); + assert_eq!(ExecError::DisplayStale.code(), "COMPUTER_DISPLAY_STALE"); + } +} diff --git a/crates/pi-natives/src/computer/hotkey.rs b/crates/pi-natives/src/computer/hotkey.rs new file mode 100644 index 000000000..81ada58ac --- /dev/null +++ b/crates/pi-natives/src/computer/hotkey.rs @@ -0,0 +1,251 @@ +//! Global kill-switch hotkey listener (macOS). +//! +//! Runs a listen-only `CGEventTap` for key-down events on a dedicated thread +//! that owns its own `CFRunLoop`. When the configured hotkey +//! (Control+Option+Command+Escape by default) is seen, it latches +//! 
[`Supervisor::trigger_stop`] on the process-global supervisor — independent +//! of the model's tool path, so the model cannot bypass it. +//! +//! The listener marks the supervisor's stop path live on successful tap +//! creation and clears it on teardown, so input gating fails closed if the tap +//! cannot start. Verified by a synthetic-injection self-test (post the hotkey, +//! observe the latch) plus a real key press by a human for the final drill. + +use std::{ + ffi::c_void, + sync::OnceLock, + thread, + time::{Duration, Instant}, +}; + +use super::supervisor::Supervisor; + +type CfMachPortRef = *mut c_void; +type CfRunLoopSourceRef = *mut c_void; +type CfRunLoopRef = *const c_void; +type CfAllocatorRef = *const c_void; +type CfStringRef = *const c_void; +type CgEventRef = *mut c_void; +type CgEventTapProxy = *mut c_void; +type CgEventTapCallBack = extern "C" fn( + proxy: CgEventTapProxy, + event_type: u32, + event: CgEventRef, + user_info: *mut c_void, +) -> CgEventRef; + +// CGEventTap placement/options/location. +const SESSION_EVENT_TAP: u32 = 1; // kCGSessionEventTap +const HEAD_INSERT: u32 = 0; // kCGHeadInsertEventTap +const LISTEN_ONLY: u32 = 1; // kCGEventTapOptionListenOnly +const EVENT_KEY_DOWN: u32 = 10; // kCGEventKeyDown +const KEYCODE_FIELD: u32 = 9; // kCGKeyboardEventKeycode +const KEY_DOWN_MASK: u64 = 1 << EVENT_KEY_DOWN; // CGEventMaskBit(kCGEventKeyDown) + +// Default hotkey: Control+Option+Command+Escape — distinctive, unlikely to +// collide. 
+const HOTKEY_KEYCODE: i64 = 53; // Escape +const FLAG_CONTROL: u64 = 0x0004_0000; +const FLAG_OPTION: u64 = 0x0008_0000; +const FLAG_COMMAND: u64 = 0x0010_0000; +const HOTKEY_MODS: u64 = FLAG_CONTROL | FLAG_OPTION | FLAG_COMMAND; + +#[link(name = "CoreGraphics", kind = "framework")] +unsafe extern "C" { + fn CGEventTapCreate( + tap: u32, + place: u32, + options: u32, + events_of_interest: u64, + callback: CgEventTapCallBack, + user_info: *mut c_void, + ) -> CfMachPortRef; + fn CGEventTapEnable(tap: CfMachPortRef, enable: bool); + fn CGEventGetIntegerValueField(event: CgEventRef, field: u32) -> i64; + fn CGEventGetFlags(event: CgEventRef) -> u64; +} + +#[link(name = "CoreFoundation", kind = "framework")] +unsafe extern "C" { + static kCFRunLoopCommonModes: CfStringRef; + fn CFMachPortCreateRunLoopSource( + allocator: CfAllocatorRef, + port: CfMachPortRef, + order: isize, + ) -> CfRunLoopSourceRef; + fn CFRunLoopGetCurrent() -> CfRunLoopRef; + fn CFRunLoopAddSource(rl: CfRunLoopRef, source: CfRunLoopSourceRef, mode: CfStringRef); + fn CFRunLoopRun(); + fn CFRelease(cf: *const c_void); +} + +const fn matches_hotkey(keycode: i64, flags: u64) -> bool { + keycode == HOTKEY_KEYCODE && (flags & HOTKEY_MODS) == HOTKEY_MODS +} + +extern "C" fn tap_callback( + _proxy: CgEventTapProxy, + event_type: u32, + event: CgEventRef, + _user_info: *mut c_void, +) -> CgEventRef { + if event_type == EVENT_KEY_DOWN && !event.is_null() { + // SAFETY: `event` is a valid key event provided by the tap for the + // duration of this callback; we only read fields and return it unchanged. + let (keycode, flags) = + unsafe { (CGEventGetIntegerValueField(event, KEYCODE_FIELD), CGEventGetFlags(event)) }; + if matches_hotkey(keycode, flags) { + Supervisor::global().trigger_stop(); + } + } + // Listen-only: pass the event through untouched. + event +} + +static STARTED: OnceLock = OnceLock::new(); + +/// Start the global hotkey listener once (idempotent). 
+/// +/// Spawns a dedicated `CFRunLoop` thread; on successful tap creation the +/// supervisor's stop path is marked live. Returns whether the listener is +/// (now) live. +pub fn start() -> bool { + let first = STARTED.set(true).is_ok(); + if first { + thread::Builder::new() + .name("computer-killswitch".into()) + .spawn(run_listener) + .ok(); + } + wait_until_live(Duration::from_secs(1)) +} + +fn run_listener() { + // SAFETY: a listen-only key-down session tap; the returned mach port and + // run-loop source are added to this thread's run loop, which then runs for + // the process lifetime. Handles are released only on the (non-returning) + // teardown path below. + unsafe { + let tap = CGEventTapCreate( + SESSION_EVENT_TAP, + HEAD_INSERT, + LISTEN_ONLY, + KEY_DOWN_MASK, + tap_callback, + std::ptr::null_mut(), + ); + if tap.is_null() { + Supervisor::global().set_hotkey_live(false); + return; + } + let source = CFMachPortCreateRunLoopSource(std::ptr::null(), tap, 0); + if source.is_null() { + CFRelease(tap.cast_const()); + Supervisor::global().set_hotkey_live(false); + return; + } + CFRunLoopAddSource(CFRunLoopGetCurrent(), source, kCFRunLoopCommonModes); + CGEventTapEnable(tap, true); + Supervisor::global().set_hotkey_live(true); + CFRunLoopRun(); + // Unreached in normal operation; if the run loop ever returns, fail closed. 
+ Supervisor::global().set_hotkey_live(false); + CFRelease(source.cast_const()); + CFRelease(tap.cast_const()); + } +} + +fn wait_until_live(timeout: Duration) -> bool { + let deadline = Instant::now() + timeout; + loop { + if Supervisor::global().status().hotkey_live { + return true; + } + if Instant::now() >= deadline { + return false; + } + thread::sleep(Duration::from_millis(20)); + } +} + +#[cfg(test)] +mod tests { + use super::{HOTKEY_KEYCODE, HOTKEY_MODS, matches_hotkey}; + + #[test] + fn matches_only_the_full_hotkey_combo() { + assert!(matches_hotkey(HOTKEY_KEYCODE, HOTKEY_MODS)); + assert!(matches_hotkey(HOTKEY_KEYCODE, HOTKEY_MODS | 0x1)); // extra bits ok + assert!(!matches_hotkey(HOTKEY_KEYCODE, 0)); // no modifiers + assert!(!matches_hotkey(HOTKEY_KEYCODE, 0x0004_0000)); // only control + assert!(!matches_hotkey(0, HOTKEY_MODS)); // wrong key + } +} + +#[cfg(all(test, target_os = "macos"))] +mod live_tests { + use std::{ffi::c_void, thread, time::Duration}; + + use super::{HOTKEY_KEYCODE, HOTKEY_MODS, start}; + use crate::computer::{permissions::accessibility_granted, supervisor::Supervisor}; + + type CgEventSourceRef = *mut c_void; + type CgEventRef = *mut c_void; + + #[link(name = "CoreGraphics", kind = "framework")] + unsafe extern "C" { + fn CGEventSourceCreate(state_id: u32) -> CgEventSourceRef; + fn CGEventCreateKeyboardEvent( + source: CgEventSourceRef, + keycode: u16, + key_down: bool, + ) -> CgEventRef; + fn CGEventSetFlags(event: CgEventRef, flags: u64); + fn CGEventPost(tap: u32, event: CgEventRef); + fn CFRelease(cf: *const c_void); + } + + fn post_hotkey() { + // SAFETY: creates, flags, posts, and releases a synthetic key event. 
+ unsafe { + let source = CGEventSourceCreate(0); + for down in [true, false] { + let event = CGEventCreateKeyboardEvent(source, HOTKEY_KEYCODE as u16, down); + if event.is_null() { + continue; + } + CGEventSetFlags(event, HOTKEY_MODS); + CGEventPost(0, event); + CFRelease(event.cast_const()); + } + if !source.is_null() { + CFRelease(source.cast_const()); + } + } + } + + /// Starts the listener and posts a synthetic hotkey, proving the tap latches + /// the supervisor. Requires Accessibility/Input-Monitoring; ignored by + /// default. + #[test] + #[ignore = "starts a global event tap and posts a synthetic hotkey; needs macOS + grants"] + fn synthetic_hotkey_triggers_stop() { + assert!(accessibility_granted(), "Accessibility must be granted"); + let live = start(); + assert!(live, "hotkey listener should report live (tap created)"); + + Supervisor::global().reset(); + assert!(!Supervisor::global().is_suspended()); + + post_hotkey(); + // Give the tap callback time to fire on its run-loop thread. + for _ in 0..50 { + if Supervisor::global().is_suspended() { + break; + } + thread::sleep(Duration::from_millis(20)); + } + assert!(Supervisor::global().is_suspended(), "synthetic hotkey should latch trigger_stop"); + Supervisor::global().reset(); + } +} diff --git a/crates/pi-natives/src/computer/input.rs b/crates/pi-natives/src/computer/input.rs new file mode 100644 index 000000000..7cc02308d --- /dev/null +++ b/crates/pi-natives/src/computer/input.rs @@ -0,0 +1,857 @@ +//! macOS native input injection for computer-use. +//! +//! # Safety model +//! Input is **runtime-gated**: [`InputController::guarded`] refuses to +//! construct unless Accessibility is granted (see [`super::permissions`]), so +//! no event can be posted while the TCC gate is closed. This module is also +//! **not** wired to napi or the model surface yet — per the approved plan, +//! input is exposed only after the kill-switch supervisor is proven live. +//! +//! # Testability +//! 
All event *orchestration* (action → low-level event sequence, held +//! button/modifier tracking, coordinate transforms, release-all cleanup) lives +//! in [`InputController`] over an [`EventSink`] trait. Unit tests drive a +//! [`RecordingSink`] to assert exact sequences without posting real OS events. +//! Only [`MacEventSink`] performs `CGEvent` FFI; its live behavior is verified +//! in a granted `gjc` session, not from a non-TCC-trusted test binary. + +use super::coords::{CoordError, LogicalPoint, NormalizedDisplay}; + +/// A mouse button. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MouseButton { + /// Primary (left) button. + Left, + /// Secondary (right) button. + Right, + /// Tertiary (middle) button. + Center, +} + +/// One low-level event recorded by [`RecordingSink`] for tests. +#[derive(Debug, Clone, PartialEq)] +pub enum SinkOp { + /// Move the cursor to a logical point. + Move(LogicalPoint), + /// Press or release `button` at a logical point. + Button { at: LogicalPoint, button: MouseButton, down: bool }, + /// Scroll by logical deltas (`dx`, `dy`). + Scroll { dx: f64, dy: f64 }, + /// Type a unicode string. + TypeUnicode(String), + /// Press or release a virtual key code. + Key { code: u16, down: bool }, +} + +/// Sink for low-level input events. The real implementation posts `CGEvent`s; +/// the test implementation records them. +pub trait EventSink { + /// Move the cursor. + fn move_cursor(&mut self, to: LogicalPoint); + /// Press or release a mouse button at a point. + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool); + /// Scroll by logical deltas. + fn scroll(&mut self, dx: f64, dy: f64); + /// Type a unicode string. + fn type_unicode(&mut self, text: &str); + /// Press or release a virtual key code. + fn key(&mut self, code: u16, down: bool); +} + +/// Error from an input action. +#[derive(Debug, Clone, PartialEq)] +pub enum InputError { + /// A coordinate could not be mapped to a logical point. 
+ Coord(CoordError), + /// A key name was not recognized. + UnknownKey(String), +} + +impl From for InputError { + fn from(value: CoordError) -> Self { + Self::Coord(value) + } +} + +impl std::fmt::Display for InputError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Coord(err) => write!(f, "{err}"), + Self::UnknownKey(key) => write!(f, "unknown key name: {key}"), + } + } +} + +impl std::error::Error for InputError {} + +/// Resolve a named key (or single character) to a macOS virtual key code. +/// Returns `None` for unrecognized names. +#[must_use] +pub fn key_code_for(name: &str) -> Option { + let code = match name.to_ascii_lowercase().as_str() { + "return" | "enter" => 36, + "tab" => 48, + "space" => 49, + "delete" | "backspace" => 51, + "escape" | "esc" => 53, + "left" | "arrowleft" => 123, + "right" | "arrowright" => 124, + "down" | "arrowdown" => 125, + "up" | "arrowup" => 126, + _ => return None, + }; + Some(code) +} + +/// Orchestrates input actions over an [`EventSink`], tracking held buttons so +/// [`InputController::release_all`] can clean up after an abort or error. +pub struct InputController { + sink: S, + cursor: LogicalPoint, + held_buttons: Vec, +} + +impl InputController { + /// Construct a controller over `sink`. Prefer [`InputController::guarded`] + /// for any path that posts real events. + pub const fn new(sink: S) -> Self { + Self { sink, cursor: LogicalPoint { x: 0.0, y: 0.0 }, held_buttons: Vec::new() } + } + + /// The most recent cursor position. + #[must_use] + pub const fn cursor(&self) -> LogicalPoint { + self.cursor + } + + /// Whether any mouse button is currently held. + #[must_use] + pub const fn has_held_buttons(&self) -> bool { + !self.held_buttons.is_empty() + } + + /// Consume the controller and return the underlying sink (e.g. to inspect + /// recorded events in tests). 
+ #[must_use] + pub fn into_sink(self) -> S { + self.sink + } + + fn press(&mut self, at: LogicalPoint, button: MouseButton) { + self.sink.mouse_button(at, button, true); + if !self.held_buttons.contains(&button) { + self.held_buttons.push(button); + } + } + + fn release(&mut self, at: LogicalPoint, button: MouseButton) { + self.sink.mouse_button(at, button, false); + self.held_buttons.retain(|held| *held != button); + } + + /// Move the cursor to a screenshot-space pixel on `display`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn move_to( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + ) -> Result<(), InputError> { + let point = display.to_logical_point(x, y)?; + self.cursor = point; + self.sink.move_cursor(point); + Ok(()) + } + + /// Move to `(x, y)` and click `button`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn click( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + button: MouseButton, + ) -> Result<(), InputError> { + self.move_to(display, x, y)?; + let at = self.cursor; + self.press(at, button); + self.release(at, button); + Ok(()) + } + + /// Double-click `button` at `(x, y)`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn double_click( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + button: MouseButton, + ) -> Result<(), InputError> { + self.click(display, x, y, button)?; + let at = self.cursor; + self.press(at, button); + self.release(at, button); + Ok(()) + } + + /// Press at `(from_x, from_y)`, drag to `(to_x, to_y)`, and release. + /// Releases the button on the error path so no button is left held. + /// + /// # Errors + /// Returns [`InputError::Coord`] when either pixel is out of bounds. 
+ pub fn drag( + &mut self, + display: &NormalizedDisplay, + from_x: f64, + from_y: f64, + to_x: f64, + to_y: f64, + button: MouseButton, + ) -> Result<(), InputError> { + self.move_to(display, from_x, from_y)?; + let start = self.cursor; + self.press(start, button); + match display.to_logical_point(to_x, to_y) { + Ok(end) => { + self.cursor = end; + self.sink.move_cursor(end); + self.release(end, button); + Ok(()) + }, + Err(err) => { + // Out-of-bounds destination: release the held button before erroring. + self.release(start, button); + Err(InputError::Coord(err)) + }, + } + } + + /// Scroll by logical deltas after moving to `(x, y)`. + /// + /// # Errors + /// Returns [`InputError::Coord`] when the pixel is out of bounds. + pub fn scroll( + &mut self, + display: &NormalizedDisplay, + x: f64, + y: f64, + dx: f64, + dy: f64, + ) -> Result<(), InputError> { + self.move_to(display, x, y)?; + self.sink.scroll(dx, dy); + Ok(()) + } + + /// Type a unicode string. + pub fn type_text(&mut self, text: &str) { + self.sink.type_unicode(text); + } + + /// Press and release each named key in order. + /// + /// # Errors + /// Returns [`InputError::UnknownKey`] when a name is unrecognized; keys + /// before the failure have already been sent. + pub fn keypress(&mut self, keys: &[String]) -> Result<(), InputError> { + for name in keys { + let code = key_code_for(name).ok_or_else(|| InputError::UnknownKey(name.clone()))?; + self.sink.key(code, true); + self.sink.key(code, false); + } + Ok(()) + } + + /// Release every held mouse button (idempotent). Run on abort/error paths + /// so a partial drag never leaves a button stuck. + pub fn release_all(&mut self) { + let at = self.cursor; + let held: Vec = self.held_buttons.drain(..).collect(); + for button in held { + self.sink.mouse_button(at, button, false); + } + } +} + +#[cfg(target_os = "macos")] +pub use mac::{MacEventSink, current_cursor_position, guarded_controller}; + +#[cfg(target_os = "macos")] +mod mac { + //! 
Real CGEvent-backed [`EventSink`] (macOS). Live behavior is verified in a + //! granted `gjc` session; construction is gated on Accessibility. + + use std::ffi::c_void; + + use super::{EventSink, InputController, MouseButton}; + use crate::computer::{ + coords::LogicalPoint, + permissions::{PermissionError, require_accessibility_for_input}, + }; + + #[repr(C)] + #[derive(Clone, Copy)] + struct CgPoint { + x: f64, + y: f64, + } + + type CgEventSourceRef = *mut c_void; + type CgEventRef = *mut c_void; + + // CGEventType values. + const LEFT_DOWN: u32 = 1; + const LEFT_UP: u32 = 2; + const RIGHT_DOWN: u32 = 3; + const RIGHT_UP: u32 = 4; + const MOUSE_MOVED: u32 = 5; + const OTHER_DOWN: u32 = 25; + const OTHER_UP: u32 = 26; + + // CGMouseButton values. + const BTN_LEFT: u32 = 0; + const BTN_RIGHT: u32 = 1; + const BTN_CENTER: u32 = 2; + + // kCGEventSourceStateCombinedSessionState / kCGHIDEventTap. + const SOURCE_COMBINED_SESSION: u32 = 0; + const HID_EVENT_TAP: u32 = 0; + // kCGScrollEventUnitPixel. 
+ const SCROLL_UNIT_PIXEL: u32 = 0; + + #[link(name = "CoreGraphics", kind = "framework")] + unsafe extern "C" { + fn CGEventSourceCreate(state_id: u32) -> CgEventSourceRef; + fn CGEventCreateMouseEvent( + source: CgEventSourceRef, + mouse_type: u32, + position: CgPoint, + button: u32, + ) -> CgEventRef; + fn CGEventCreateScrollWheelEvent( + source: CgEventSourceRef, + units: u32, + wheel_count: u32, + wheel1: i32, + wheel2: i32, + ) -> CgEventRef; + fn CGEventCreateKeyboardEvent( + source: CgEventSourceRef, + keycode: u16, + key_down: bool, + ) -> CgEventRef; + fn CGEventKeyboardSetUnicodeString(event: CgEventRef, length: usize, string: *const u16); + fn CGEventPost(tap: u32, event: CgEventRef); + fn CGEventCreate(source: CgEventSourceRef) -> CgEventRef; + fn CGEventGetLocation(event: CgEventRef) -> CgPoint; + fn CGWarpMouseCursorPosition(new_cursor_position: CgPoint) -> i32; + fn CFRelease(cf: *const c_void); + } + + const fn button_codes(button: MouseButton, down: bool) -> (u32, u32) { + match button { + MouseButton::Left => (if down { LEFT_DOWN } else { LEFT_UP }, BTN_LEFT), + MouseButton::Right => (if down { RIGHT_DOWN } else { RIGHT_UP }, BTN_RIGHT), + MouseButton::Center => (if down { OTHER_DOWN } else { OTHER_UP }, BTN_CENTER), + } + } + + /// CGEvent-backed sink. Holds an event source for the session. + pub struct MacEventSink { + source: CgEventSourceRef, + } + + impl MacEventSink { + fn new() -> Self { + // SAFETY: `CGEventSourceCreate` returns an owned source (or null, + // which CGEvent creation tolerates); released on drop. + let source = unsafe { CGEventSourceCreate(SOURCE_COMBINED_SESSION) }; + Self { source } + } + + fn post_mouse(&self, at: LogicalPoint, event_type: u32, button: u32) { + let position = CgPoint { x: at.x, y: at.y }; + // SAFETY: `source` is the owned event source; the created event is + // posted and released exactly once. 
+ unsafe { + let event = CGEventCreateMouseEvent(self.source, event_type, position, button); + if !event.is_null() { + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + } + + impl Drop for MacEventSink { + fn drop(&mut self) { + if !self.source.is_null() { + // SAFETY: `source` is owned, non-null, and not used after release. + unsafe { CFRelease(self.source.cast_const()) }; + } + } + } + + impl EventSink for MacEventSink { + fn move_cursor(&mut self, to: LogicalPoint) { + // `CGWarpMouseCursorPosition` reliably relocates the hardware cursor + // (a bare mouseMoved event does not); the moved event then notifies + // apps of the hover at the new point. + let position = CgPoint { x: to.x, y: to.y }; + // SAFETY: pure Core Graphics cursor warp to a point; no ownership. + unsafe { CGWarpMouseCursorPosition(position) }; + self.post_mouse(to, MOUSE_MOVED, BTN_LEFT); + } + + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool) { + let (event_type, code) = button_codes(button, down); + self.post_mouse(at, event_type, code); + } + + fn scroll(&mut self, dx: f64, dy: f64) { + // SAFETY: created scroll event is posted and released exactly once. + unsafe { + let event = CGEventCreateScrollWheelEvent( + self.source, + SCROLL_UNIT_PIXEL, + 2, + dy as i32, + dx as i32, + ); + if !event.is_null() { + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + + fn type_unicode(&mut self, text: &str) { + let utf16: Vec = text.encode_utf16().collect(); + // SAFETY: down/up keyboard events are created, populated with the + // UTF-16 buffer (valid for the call), posted, and released once each. 
+ unsafe { + for down in [true, false] { + let event = CGEventCreateKeyboardEvent(self.source, 0, down); + if event.is_null() { + continue; + } + CGEventKeyboardSetUnicodeString(event, utf16.len(), utf16.as_ptr()); + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + + fn key(&mut self, code: u16, down: bool) { + // SAFETY: created keyboard event is posted and released exactly once. + unsafe { + let event = CGEventCreateKeyboardEvent(self.source, code, down); + if !event.is_null() { + CGEventPost(HID_EVENT_TAP, event); + CFRelease(event.cast_const()); + } + } + } + } + + /// Construct an [`InputController`] backed by real `CGEvent`s — only when + /// Accessibility is granted. + /// + /// # Errors + /// Returns [`PermissionError`] when Accessibility is not granted; no event + /// source is created and no input can be posted. + pub fn guarded_controller() -> Result, PermissionError> { + require_accessibility_for_input()?; + Ok(InputController::new(MacEventSink::new())) + } + + /// Read the current global cursor position in logical points (top-left + /// origin). Used to verify mouse-move injection without clicking. + #[must_use] + pub fn current_cursor_position() -> LogicalPoint { + // SAFETY: `CGEventCreate(null)` returns an event whose location is the + // current cursor; it is released after the read. 
+ unsafe { + let event = CGEventCreate(std::ptr::null_mut()); + if event.is_null() { + return LogicalPoint { x: 0.0, y: 0.0 }; + } + let location = CGEventGetLocation(event); + CFRelease(event.cast_const()); + LogicalPoint { x: location.x, y: location.y } + } + } +} + +#[cfg(test)] +mod tests { + use super::{EventSink, InputController, InputError, MouseButton, SinkOp, key_code_for}; + use crate::computer::coords::{LogicalPoint, NormalizedDisplay}; + + #[derive(Default)] + struct RecordingSink { + ops: Vec, + } + + impl EventSink for RecordingSink { + fn move_cursor(&mut self, to: LogicalPoint) { + self.ops.push(SinkOp::Move(to)); + } + + fn mouse_button(&mut self, at: LogicalPoint, button: MouseButton, down: bool) { + self.ops.push(SinkOp::Button { at, button, down }); + } + + fn scroll(&mut self, dx: f64, dy: f64) { + self.ops.push(SinkOp::Scroll { dx, dy }); + } + + fn type_unicode(&mut self, text: &str) { + self.ops.push(SinkOp::TypeUnicode(text.to_string())); + } + + fn key(&mut self, code: u16, down: bool) { + self.ops.push(SinkOp::Key { code, down }); + } + } + + fn display() -> NormalizedDisplay { + // 200x100 physical px at 2x => clicks map to logical /2. + NormalizedDisplay::new(200, 100, 2.0, 2.0, 0.0, 0.0) + } + + #[test] + fn click_moves_then_presses_and_releases_at_logical_point() { + let mut c = InputController::new(RecordingSink::default()); + c.click(&display(), 100.0, 50.0, MouseButton::Left).unwrap(); + let at = LogicalPoint { x: 50.0, y: 25.0 }; + assert_eq!(c.into_ops(), vec![ + SinkOp::Move(at), + SinkOp::Button { at, button: MouseButton::Left, down: true }, + SinkOp::Button { at, button: MouseButton::Left, down: false }, + ]); + } + + #[test] + fn double_click_emits_two_press_release_pairs() { + let mut c = InputController::new(RecordingSink::default()); + c.double_click(&display(), 10.0, 10.0, MouseButton::Left) + .unwrap(); + let downs = c + .ops_ref() + .iter() + .filter(|op| matches!(op, SinkOp::Button { down: true, .. 
})) + .count(); + let ups = c + .ops_ref() + .iter() + .filter(|op| matches!(op, SinkOp::Button { down: false, .. })) + .count(); + assert_eq!((downs, ups), (2, 2)); + assert!(!c.has_held_buttons()); + } + + #[test] + fn drag_releases_button_and_leaves_none_held() { + let mut c = InputController::new(RecordingSink::default()); + c.drag(&display(), 0.0, 0.0, 100.0, 50.0, MouseButton::Left) + .unwrap(); + assert!(!c.has_held_buttons()); + let ops = c.into_ops(); + assert_eq!(ops.first(), Some(&SinkOp::Move(LogicalPoint { x: 0.0, y: 0.0 }))); + assert_eq!( + ops.last(), + Some(&SinkOp::Button { + at: LogicalPoint { x: 50.0, y: 25.0 }, + button: MouseButton::Left, + down: false, + }) + ); + } + + #[test] + fn drag_to_out_of_bounds_releases_the_held_button() { + let mut c = InputController::new(RecordingSink::default()); + let err = c + .drag(&display(), 0.0, 0.0, 999.0, 0.0, MouseButton::Left) + .unwrap_err(); + assert!(matches!(err, InputError::Coord(_))); + // Button was pressed then released on the error path; none left held. + assert!(!c.has_held_buttons()); + let releases = c + .ops_ref() + .iter() + .filter(|op| matches!(op, SinkOp::Button { down: false, .. })) + .count(); + assert_eq!(releases, 1); + } + + #[test] + fn release_all_releases_a_stuck_button() { + let mut c = InputController::new(RecordingSink::default()); + // Press without releasing by starting a drag whose destination is invalid + // is covered above; here force a held state via a press through click then + // simulate a held button using a manual press path. + c.move_to(&display(), 10.0, 10.0).unwrap(); + c.press_for_test(MouseButton::Left); + assert!(c.has_held_buttons()); + c.release_all(); + assert!(!c.has_held_buttons()); + assert!(matches!(c.ops_ref().last(), Some(SinkOp::Button { down: false, .. }))); + // release_all is idempotent. 
+ c.release_all(); + assert!(!c.has_held_buttons()); + } + + #[test] + fn move_out_of_bounds_errors_without_emitting_move() { + let mut c = InputController::new(RecordingSink::default()); + let err = c.move_to(&display(), 200.0, 0.0).unwrap_err(); + assert!(matches!(err, InputError::Coord(_))); + assert!(c.ops_ref().is_empty()); + } + + #[test] + fn keypress_maps_names_and_rejects_unknown() { + let mut c = InputController::new(RecordingSink::default()); + c.keypress(&["enter".to_string(), "tab".to_string()]) + .unwrap(); + assert_eq!(c.ops_ref(), &[ + SinkOp::Key { code: 36, down: true }, + SinkOp::Key { code: 36, down: false }, + SinkOp::Key { code: 48, down: true }, + SinkOp::Key { code: 48, down: false }, + ]); + let err = c + .keypress(&["definitely-not-a-key".to_string()]) + .unwrap_err(); + assert!(matches!(err, InputError::UnknownKey(_))); + } + + #[test] + fn type_text_forwards_unicode() { + let mut c = InputController::new(RecordingSink::default()); + c.type_text("héllo"); + assert_eq!(c.into_ops(), vec![SinkOp::TypeUnicode("héllo".to_string())]); + } + + #[test] + fn key_code_table_covers_common_names() { + assert_eq!(key_code_for("Return"), Some(36)); + assert_eq!(key_code_for("ESC"), Some(53)); + assert_eq!(key_code_for("up"), Some(126)); + assert_eq!(key_code_for("nope"), None); + } + + // Test-only helpers on the controller. + impl InputController { + fn into_ops(self) -> Vec { + self.sink.ops + } + + fn ops_ref(&self) -> &[SinkOp] { + &self.sink.ops + } + + fn press_for_test(&mut self, button: MouseButton) { + let at = self.cursor(); + self.press(at, button); + } + } +} + +#[cfg(all(test, target_os = "macos"))] +mod live_tests { + use super::{MouseButton, current_cursor_position, guarded_controller}; + use crate::computer::capture::capture_primary_display; + + /// Fires a real cursor move (no clicks/keys) and reads the position back to + /// prove the CGEvent input pipeline works end to end. 
Ignored by default; + /// run with `--ignored` on a macOS host with Accessibility granted. + #[test] + #[ignore = "moves the real cursor; needs macOS + Accessibility granted"] + fn cursor_move_lands_near_target() { + let frame = capture_primary_display().expect("capture (Screen Recording) should be granted"); + let display = frame.display; + let Ok(mut controller) = guarded_controller() else { + panic!("Accessibility must be granted for input injection"); + }; + + // Target the display center — a safe interior point, well away from edges. + let target_px = f64::from(display.width_px) / 2.0; + let target_py = f64::from(display.height_px) / 2.0; + controller + .move_to(&display, target_px, target_py) + .expect("move_to should succeed"); + + let expected = display + .to_logical_point(target_px, target_py) + .expect("center is in bounds"); + let pos = current_cursor_position(); + let dx = (pos.x - expected.x).abs(); + let dy = (pos.y - expected.y).abs(); + assert!( + dx <= 2.0 && dy <= 2.0, + "cursor landed at ({}, {}), expected ~({}, {})", + pos.x, + pos.y, + expected.x, + expected.y + ); + assert_eq!(controller.cursor(), expected); + // We only moved the cursor; nothing should be held. + assert!(!controller.has_held_buttons()); + let _ = MouseButton::Left; // keep the import meaningful for future click tests + } + + /// Durable output directory for G005 live-acceptance artifacts. Override + /// with `COMPUTER_USE_ACCEPTANCE_DIR`; defaults to + /// `/.gjc/ultragoal/artifacts/g005`. + fn acceptance_artifacts_dir() -> std::path::PathBuf { + if let Ok(dir) = std::env::var("COMPUTER_USE_ACCEPTANCE_DIR") { + return std::path::PathBuf::from(dir); + } + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../.gjc/ultragoal/artifacts/g005") + } + + /// G005 acceptance drill: drives all nine primitives through the gated + /// execute_input path against the focused frontmost app, then waits for a + /// human kill-switch press and proves input is blocked afterward. 
+ #[test] + #[ignore = "live G005: drives the focused app + needs a human hotkey press"] + fn all_nine_acceptance_drill() { + use std::{thread::sleep, time::Duration}; + + use crate::computer::{ + capture::capture_primary_display, + executor::{InputAction, MacDisplayContext, MacPermissionGate, execute_input}, + hotkey, + supervisor::Supervisor, + }; + + assert!(hotkey::start(), "kill-switch hotkey listener must be live"); + let frame = capture_primary_display().expect("Screen Recording granted"); // primitive 1: screenshot + let display = frame.display; + let perms = MacPermissionGate; + let dctx = MacDisplayContext; + let cancel = || false; + let cx = f64::from(display.width_px) * 0.5; + let cy = f64::from(display.height_px) * 0.42; + + // Persist the pre-input frame as durable live-proof (primitive 1). + let artifacts = acceptance_artifacts_dir(); + std::fs::create_dir_all(&artifacts).expect("create acceptance artifacts dir"); + std::fs::write(artifacts.join("g005-before.png"), &frame.png) + .expect("write before screenshot"); + + let act = |action: InputAction| { + // Stand in for the listener's periodic heartbeat so input_allowed stays fresh. 
+ Supervisor::global().heartbeat(); + let mut controller = guarded_controller().expect("Accessibility granted"); + execute_input( + &action, + Supervisor::global(), + &perms, + &dctx, + None, + &display, + &mut controller, + &cancel, + ) + .expect("gated action should succeed"); + sleep(Duration::from_millis(350)); + }; + + act(InputAction::Move { x: cx, y: cy }); // 2 move + act(InputAction::Click { x: cx, y: cy, button: MouseButton::Left }); // 3 click (focus body) + act(InputAction::Type { text: "COMPUTER_USE_E2E gajae ".to_string() }); // 4 type + act(InputAction::Keypress { keys: vec!["return".to_string()] }); // 5 keypress + act(InputAction::Type { text: "line two alpha beta gamma delta epsilon".to_string() }); + act(InputAction::DoubleClick { x: cx, y: cy, button: MouseButton::Left }); // 6 double_click + act(InputAction::Drag { + x: cx - 120.0, + y: cy, + to_x: cx + 120.0, + to_y: cy, + button: MouseButton::Left, + }); // 7 drag + act(InputAction::Scroll { x: cx, y: cy, scroll_x: 0.0, scroll_y: -120.0 }); // 8 scroll + act(InputAction::Wait { ms: 300 }); // 9 wait + + println!(">>> KILL-SWITCH DRILL: press Control+Option+Command+Escape now (within ~60s) <<<"); + for _ in 0..300 { + if Supervisor::global().is_suspended() { + break; + } + sleep(Duration::from_millis(200)); + } + assert!( + Supervisor::global().is_suspended(), + "kill-switch should latch after you press the hotkey" + ); + + // Prove input is blocked after the kill-switch, until a user-only reset. + let mut controller = guarded_controller().expect("Accessibility granted"); + let blocked = execute_input( + &InputAction::Move { x: cx, y: cy }, + Supervisor::global(), + &perms, + &dctx, + None, + &display, + &mut controller, + &cancel, + ); + assert!(blocked.is_err(), "input must be blocked while suspended"); + + // Capture + persist the post-kill-switch frame and a transcript so the + // G004 mandatory computer red-team suite has durable native proof on disk. 
+ let after = capture_primary_display().expect("Screen Recording granted"); + std::fs::write(artifacts.join("g005-after-killswitch.png"), &after.png) + .expect("write post-kill-switch screenshot"); + let manifest = serde_json::json!({ + "schemaVersion": 1, + "kind": "computer-use-acceptance", + "surface": "native", + "hotkey": "Control+Option+Command+Escape", + "display": { + "widthPx": display.width_px, + "heightPx": display.height_px, + "epoch": frame.display_epoch + }, + "primitives": [ + "screenshot", + "move", + "click", + "type", + "keypress", + "double_click", + "drag", + "scroll", + "wait" + ], + "killSwitch": { "latched": true, "blockedFurtherInput": true }, + "artifacts": { + "before": "g005-before.png", + "afterKillSwitch": "g005-after-killswitch.png" + } + }); + std::fs::write( + artifacts.join("g005-manifest.json"), + serde_json::to_vec_pretty(&manifest).expect("serialize manifest"), + ) + .expect("write acceptance manifest"); + println!("G005 artifacts written to {}", artifacts.display()); + Supervisor::global().reset(); + println!( + "G005 PASS: all nine primitives executed; kill-switch latched and blocked further input." + ); + } +} diff --git a/crates/pi-natives/src/computer/mod.rs b/crates/pi-natives/src/computer/mod.rs new file mode 100644 index 000000000..567a2daa9 --- /dev/null +++ b/crates/pi-natives/src/computer/mod.rs @@ -0,0 +1,108 @@ +//! Native computer-use primitives (macOS-only v1). +//! +//! # Overview +//! This module backs the model-facing `computer` tool: OS-native control of the +//! real macOS desktop via the `OpenAI` computer-use action set (`screenshot`, +//! `click`, `double_click`, `move`, `drag`, `scroll`, `type`, `keypress`, +//! `wait`). +//! +//! # Status +//! The framework-free coordinate contract ([`coords`]) is unit-testable without +//! a display or granted TCC permissions. The [`executor`], [`input`], and +//! 
[`supervisor`] modules build on every target; `capture`, `controller`, +//! `hotkey`, and `permissions` are compiled only on macOS. See +//! `docs/computer-use/` for the approved spec, the consensus plan, and the +//! architecture decision record. +//! +//! # Architecture +//! ```text +//! model -> packages/coding-agent (computer tool, exact OpenAI schema) +//! -> packages/natives (napi bindings) +//! -> pi-natives::computer (execute_action state machine + backend) +//! ``` + +#[cfg(test)] +mod bypass_guard; +#[cfg(target_os = "macos")] +pub mod capture; +#[cfg(target_os = "macos")] +pub mod controller; +pub mod coords; +pub mod executor; +#[cfg(target_os = "macos")] +pub mod hotkey; +pub mod input; +#[cfg(target_os = "macos")] +pub mod permissions; +pub mod supervisor; + +#[cfg(target_os = "macos")] +pub use capture::{CaptureError, CapturedFrame, capture_primary_display, current_display_epoch}; +#[cfg(target_os = "macos")] +pub use controller::ComputerController; +pub use coords::{CoordError, LogicalPoint, NormalizedDisplay}; +pub use input::{EventSink, InputController, InputError, MouseButton}; +use napi::bindgen_prelude::Uint8Array; +use napi_derive::napi; +#[cfg(target_os = "macos")] +pub use permissions::{PermissionError, PreflightStatus, TccPermission, preflight}; +pub use supervisor::{Supervisor, SupervisorStatus}; + +/// A captured primary-display screenshot returned to JS. +/// +/// `width_px`/`height_px` are the physical pixels that define the action +/// coordinate space (see the coordinate contract); the scale/origin map them to +/// macOS logical points. +#[napi(object)] +pub struct ComputerScreenshot { + /// PNG-encoded image bytes. + pub png: Uint8Array, + /// Screenshot width in physical pixels. + pub width_px: u32, + /// Screenshot height in physical pixels. + pub height_px: u32, + /// Physical-pixels-per-logical-point along X. + pub scale_x: f64, + /// Physical-pixels-per-logical-point along Y. + pub scale_y: f64, + /// Logical origin X of the display (points). 
+ pub origin_x: f64, + /// Logical origin Y of the display (points). + pub origin_y: f64, + /// Stable hash of the display geometry used for stale-display checks. + pub display_epoch: f64, + /// Process-local opaque capture id. + pub capture_id: u32, +} + +/// Capture the primary display for JS callers (macOS). +/// +/// Requires the Screen Recording permission. This is the read-only `screenshot` +/// primitive of the computer-use tool; input primitives land behind the same +/// surface once the Accessibility gate is satisfied in a granted `gjc` process. +/// +/// # Errors +/// Returns an error when capture fails (e.g. Screen Recording not granted). +#[napi(js_name = "computerScreenshot")] +pub fn computer_screenshot() -> napi::Result { + #[cfg(target_os = "macos")] + { + let frame = capture::capture_primary_display() + .map_err(|err| napi::Error::from_reason(format!("{err}")))?; + Ok(ComputerScreenshot { + png: Uint8Array::from(frame.png), + width_px: frame.display.width_px, + height_px: frame.display.height_px, + scale_x: frame.display.scale_x, + scale_y: frame.display.scale_y, + origin_x: frame.display.origin_x, + origin_y: frame.display.origin_y, + display_epoch: frame.display_epoch as f64, + capture_id: frame.capture_id, + }) + } + #[cfg(not(target_os = "macos"))] + { + Err(napi::Error::from_reason("computer screenshot capture is only supported on macOS")) + } +} diff --git a/crates/pi-natives/src/computer/permissions.rs b/crates/pi-natives/src/computer/permissions.rs new file mode 100644 index 000000000..6bb0ea7d4 --- /dev/null +++ b/crates/pi-natives/src/computer/permissions.rs @@ -0,0 +1,185 @@ +//! macOS TCC permission preflight for computer-use (macOS). +//! +//! # Overview +//! Two distinct TCC permissions gate the computer tool: +//! - **Screen Recording** — required for `screenshot` capture (see +//! [`super::capture`]). +//! - **Accessibility** — required for input injection (click/type/etc.). This +//! is a *separate* grant from Screen Recording. 
+//! +//! This module performs non-prompting preflight checks and can open the correct +//! System Settings pane so the user can grant a missing permission, then retry. +//! It never injects input and never blocks; callers gate side effects on +//! [`preflight`] and surface [`PermissionError`] when a required grant is +//! missing rather than acting on a stale assumption. + +use std::process::Command; + +#[link(name = "ApplicationServices", kind = "framework")] +unsafe extern "C" { + /// Returns whether the current process is a trusted Accessibility client + /// (no prompt). Equivalent to `AXIsProcessTrustedWithOptions(NULL)`. + fn AXIsProcessTrusted() -> bool; +} + +#[link(name = "CoreGraphics", kind = "framework")] +unsafe extern "C" { + /// Returns whether the current process already has Screen Recording access, + /// without prompting. + fn CGPreflightScreenCaptureAccess() -> bool; +} + +/// A TCC permission the computer tool depends on. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TccPermission { + /// Accessibility — required for input injection. + Accessibility, + /// Screen Recording — required for screen capture. + ScreenRecording, +} + +impl TccPermission { + /// The `x-apple.systempreferences:` URL for this permission's settings pane. + #[must_use] + pub const fn settings_url(self) -> &'static str { + match self { + Self::Accessibility => { + "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility" + }, + Self::ScreenRecording => { + "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture" + }, + } + } +} + +/// Current grant state for the permissions the computer tool needs. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PreflightStatus { + /// Whether Accessibility (input injection) is granted. + pub accessibility: bool, + /// Whether Screen Recording (capture) is granted. + pub screen_recording: bool, +} + +/// Error returned when a required permission is missing. 
Carries the offending +/// permission so the caller can open the right Settings pane and ask the user +/// to grant it, then retry. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PermissionError { + /// The missing permission. + pub missing: TccPermission, +} + +impl std::fmt::Display for PermissionError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let (name, what) = match self.missing { + TccPermission::Accessibility => ("Accessibility", "inject input"), + TccPermission::ScreenRecording => ("Screen Recording", "capture the screen"), + }; + write!( + f, + "COMPUTER_PERMISSION_REQUIRED: {name} permission is required to {what}. Grant it in \ + System Settings (opened for you), then retry." + ) + } +} + +impl std::error::Error for PermissionError {} + +/// Whether the process is a trusted Accessibility client (no prompt). +#[must_use] +pub fn accessibility_granted() -> bool { + // SAFETY: `AXIsProcessTrusted` takes no arguments and only reads the current + // process's TCC trust state. + unsafe { AXIsProcessTrusted() } +} + +/// Whether the process already has Screen Recording access (no prompt). +#[must_use] +pub fn screen_recording_granted() -> bool { + // SAFETY: `CGPreflightScreenCaptureAccess` takes no arguments and only reads + // the current process's capture-access state. + unsafe { CGPreflightScreenCaptureAccess() } +} + +/// Read the current grant state for both required permissions. +#[must_use] +pub fn preflight() -> PreflightStatus { + PreflightStatus { + accessibility: accessibility_granted(), + screen_recording: screen_recording_granted(), + } +} + +/// Open the System Settings pane for `permission` via `open(1)`. Best-effort; +/// returns whether the launch was spawned successfully. +pub fn open_settings(permission: TccPermission) -> bool { + Command::new("open") + .arg(permission.settings_url()) + .status() + .is_ok_and(|status| status.success()) +} + +/// Ensure Accessibility is granted for input injection. 
+/// +/// On failure, opens the Accessibility settings pane and returns +/// [`PermissionError`] so the caller can fail closed and prompt a +/// grant-then-retry — never proceeding to inject input. +/// +/// # Errors +/// Returns [`PermissionError`] when Accessibility is not granted. +pub fn require_accessibility_for_input() -> Result<(), PermissionError> { + if accessibility_granted() { + return Ok(()); + } + let _ = open_settings(TccPermission::Accessibility); + Err(PermissionError { missing: TccPermission::Accessibility }) +} + +/// Ensure Screen Recording is granted for capture. +/// +/// On failure, opens the Screen Recording settings pane and returns +/// [`PermissionError`]. +/// +/// # Errors +/// Returns [`PermissionError`] when Screen Recording is not granted. +pub fn require_screen_recording_for_capture() -> Result<(), PermissionError> { + if screen_recording_granted() { + return Ok(()); + } + let _ = open_settings(TccPermission::ScreenRecording); + Err(PermissionError { missing: TccPermission::ScreenRecording }) +} + +#[cfg(test)] +mod tests { + use super::{TccPermission, preflight}; + + #[test] + fn settings_urls_target_the_privacy_panes() { + assert!( + TccPermission::Accessibility + .settings_url() + .contains("Privacy_Accessibility") + ); + assert!( + TccPermission::ScreenRecording + .settings_url() + .contains("Privacy_ScreenCapture") + ); + } + + /// Reports the live TCC grant state. Ignored by default (result depends on + /// the host's granted permissions); run explicitly to learn whether input + /// injection (Accessibility) is currently possible. 
+ #[test] + #[ignore = "reports live TCC grant state; environment-dependent"] + fn report_live_preflight() { + let status = preflight(); + println!( + "TCC preflight: accessibility={} screen_recording={}", + status.accessibility, status.screen_recording + ); + } +} diff --git a/crates/pi-natives/src/computer/supervisor.rs b/crates/pi-natives/src/computer/supervisor.rs new file mode 100644 index 000000000..4f4482f8f --- /dev/null +++ b/crates/pi-natives/src/computer/supervisor.rs @@ -0,0 +1,204 @@ +//! Process-global kill-switch supervisor for computer-use. +//! +//! # Role +//! The supervisor is the safety authority for autonomous input: side-effecting +//! actions may fire only while [`Supervisor::input_allowed`] holds, and a stop +//! (global hotkey or TUI key) latches [`Supervisor::is_suspended`] until a +//! **user-only** [`Supervisor::reset`]. The model-facing surface can never +//! reset suspension. +//! +//! `input_allowed` is fail-closed: it requires the stop path to be live +//! (`hotkey_live`), a fresh heartbeat from that stop path, and a non-suspended +//! state. If the hotkey listener dies (heartbeat goes stale or liveness drops), +//! input is disabled automatically. +//! +//! This module is pure state (atomics + timestamps) so the safety logic is +//! unit-tested deterministically without OS event taps; the OS hotkey listener +//! (a `CFRunLoop` `CGEventTap`) drives +//! `set_hotkey_live`/`heartbeat`/`trigger_stop` and is verified separately. + +use std::{ + sync::{ + OnceLock, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, + time::{SystemTime, UNIX_EPOCH}, +}; + +/// Max age of the stop-path heartbeat before input is disabled (ms). +pub const HEARTBEAT_FRESH_MS: u64 = 2_000; + +/// Snapshot of supervisor state used for gating and status reporting. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SupervisorStatus { + /// Input is latched off until a user-only reset. 
+ pub suspended: bool, + /// The global stop path (hotkey/event-tap) reports itself live. + pub hotkey_live: bool, + /// The stop path's heartbeat is within [`HEARTBEAT_FRESH_MS`]. + pub heartbeat_fresh: bool, +} + +impl SupervisorStatus { + /// Whether side-effecting input may fire: live, fresh, and not suspended. + #[must_use] + pub const fn input_allowed(self) -> bool { + self.hotkey_live && self.heartbeat_fresh && !self.suspended + } +} + +/// Process-global kill-switch state. +pub struct Supervisor { + suspended: AtomicBool, + hotkey_live: AtomicBool, + last_heartbeat_ms: AtomicU64, +} + +/// Wall-clock milliseconds since the Unix epoch: 0 when the clock reads before +/// the epoch, saturating at `u64::MAX`. +fn now_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| u64::try_from(d.as_millis()).unwrap_or(u64::MAX)) +} + +impl Supervisor { + /// Construct a fresh supervisor: not suspended, stop path not yet live. + #[must_use] + pub const fn new() -> Self { + Self { + suspended: AtomicBool::new(false), + hotkey_live: AtomicBool::new(false), + last_heartbeat_ms: AtomicU64::new(0), + } + } + + /// The process-global supervisor singleton, created on first use. + pub fn global() -> &'static Self { + static GLOBAL: OnceLock<Supervisor> = OnceLock::new(); + GLOBAL.get_or_init(Self::new) + } + + /// Record that the stop path is live (or not) and refresh its heartbeat. + pub fn set_hotkey_live(&self, live: bool) { + self.hotkey_live.store(live, Ordering::SeqCst); + if live { + self.last_heartbeat_ms.store(now_ms(), Ordering::SeqCst); + } + } + + /// Heartbeat from the live stop path (call on a fixed interval). + pub fn heartbeat(&self) { + self.heartbeat_at(now_ms()); + } + + /// Heartbeat with an explicit timestamp (deterministic in tests). + pub fn heartbeat_at(&self, at_ms: u64) { + self.last_heartbeat_ms.store(at_ms, Ordering::SeqCst); + } + + /// Latch suspension: abort further input until a user-only [`reset`]. + /// Invoked by the global hotkey or TUI stop key. 
+ /// + /// [`reset`]: Supervisor::reset + pub fn trigger_stop(&self) { + self.suspended.store(true, Ordering::SeqCst); + } + + /// Clear suspension. **User-only** — never wire this to the model-facing + /// tool schema or generic tool dispatch. + pub fn reset(&self) { + self.suspended.store(false, Ordering::SeqCst); + } + + /// Whether input is currently latched off. + #[must_use] + pub fn is_suspended(&self) -> bool { + self.suspended.load(Ordering::SeqCst) + } + + /// Status as of `now_ms` (explicit for tests). + /// + /// The heartbeat counts as fresh only when it lies within + /// [`HEARTBEAT_FRESH_MS`] of `now_ms` in either direction. A heartbeat dated + /// further in the future than that (e.g. after the wall clock steps backward) + /// is reported stale, so the gate fails closed instead of staying open. + #[must_use] + pub fn status_at(&self, now_ms: u64) -> SupervisorStatus { + let last = self.last_heartbeat_ms.load(Ordering::SeqCst); + SupervisorStatus { + suspended: self.suspended.load(Ordering::SeqCst), + hotkey_live: self.hotkey_live.load(Ordering::SeqCst), + heartbeat_fresh: now_ms.abs_diff(last) <= HEARTBEAT_FRESH_MS, + } + } + + /// Status as of now. + #[must_use] + pub fn status(&self) -> SupervisorStatus { + self.status_at(now_ms()) + } + + /// Whether side-effecting input may fire right now. 
+ #[must_use] + pub fn input_allowed(&self) -> bool { + self.status().input_allowed() + } +} + +impl Default for Supervisor { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::{HEARTBEAT_FRESH_MS, Supervisor}; + + #[test] + fn fresh_supervisor_disallows_input_until_stop_path_is_live() { + let s = Supervisor::new(); + assert!(!s.status_at(1_000).input_allowed(), "not live yet"); + assert!(!s.is_suspended()); + } + + #[test] + fn live_and_fresh_allows_input() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + assert!(s.status_at(10_500).input_allowed()); + } + + #[test] + fn stale_heartbeat_disables_input() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + let stale = 10_000 + HEARTBEAT_FRESH_MS + 1; + assert!(!s.status_at(stale).input_allowed(), "stale heartbeat must fail closed"); + } + + #[test] + fn trigger_stop_latches_until_user_reset() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + assert!(s.status_at(10_100).input_allowed()); + + s.trigger_stop(); + assert!(s.is_suspended()); + // Even with a live, fresh stop path, suspension keeps input off. 
+ s.heartbeat_at(10_200); + assert!(!s.status_at(10_250).input_allowed()); + + s.reset(); + assert!(!s.is_suspended()); + s.heartbeat_at(10_300); + assert!(s.status_at(10_350).input_allowed()); + } + + #[test] + fn losing_hotkey_liveness_disables_input() { + let s = Supervisor::new(); + s.set_hotkey_live(true); + s.heartbeat_at(10_000); + assert!(s.status_at(10_100).input_allowed()); + s.set_hotkey_live(false); + assert!(!s.status_at(10_150).input_allowed(), "dead stop path must fail closed"); + } +} diff --git a/crates/pi-natives/src/lib.rs b/crates/pi-natives/src/lib.rs index e53446e2b..a2cf2e25d 100644 --- a/crates/pi-natives/src/lib.rs +++ b/crates/pi-natives/src/lib.rs @@ -25,6 +25,7 @@ pub mod appearance; pub mod ast; pub mod build_info; pub mod clipboard; +pub mod computer; pub mod crash; pub mod edit_fuzzy; pub mod fd; diff --git a/docs/computer-use/README.md b/docs/computer-use/README.md new file mode 100644 index 000000000..b0ae5feae --- /dev/null +++ b/docs/computer-use/README.md @@ -0,0 +1,88 @@ +# Native computer-use tool + +Status: **in progress (draft)** — coordinate contract, native `screenshot` +capture, TCC preflight, input orchestration, and the kill-switch supervisor +landed and verified; the supervisor-gated `execute_action` path and the napi/TS +`computer` tool surface are to follow. + +A new, model-agnostic `computer` tool that lets any model drive the user's real +macOS desktop via the OpenAI computer-use action set. Built fresh (the +open-source `openai/codex` repo has no GUI computer-use source to copy; only the +public action *schema* is mirrored). + +This feature was scoped through GJC's deep-interview (requirements) and ralplan +(Planner/Architect/Critic consensus) workflows. The full deep-interview spec and +the consensus plan + ADR are the authoritative source of truth; this document is +the committed summary and roadmap. + +## Locked decisions (ADR summary) + +- **Target:** the user's real macOS desktop, OS-native control. 
v1 is macOS-only + (Linux/Windows deferred behind the same tool schema). 
+- **Driver:** any model via a generic structured tool-call interface — no + provider-specific computer-use API. +- **Action set:** the exact OpenAI computer-use primitives — `screenshot`, + `click`, `double_click`, `move`, `drag`, `scroll`, `type`, `keypress`, `wait`. +- **Implementation:** built fresh in the Rust `pi-natives` crate (napi), + exposed through `packages/natives` to a new + `packages/coding-agent/src/tools/computer.ts`, kept deliberately lower-level + than the existing `browser` tool (coordinate/input primitives only, no web + semantics). +- **Coordinate contract:** a single normalized virtual display. The returned + screenshot's pixel dimensions *are* the action coordinate space; Rust owns the + transform to macOS logical points (Retina/HiDPI-safe) and display selection. +- **Permissions:** macOS TCC (Accessibility + Screen Recording) auto-preflighted; + on a missing grant, open the relevant Settings pane and return a clear + "grant then retry/relaunch" error. +- **Gating:** off by default; opt-in config flag (per session) plus a persistent + always-on option. +- **Safety:** no per-action approval (autonomous), **but** a daemon-enforced + global kill-switch outside model control (global hotkey OR TUI stop key) that + aborts queued actions, releases held keys/buttons, suspends further input, and + snapshots the last screen. Reset is user-only, never via the model-facing tool. +- **Architecture:** every primitive delegates to one central Rust + `execute_action` state machine (preflight, validation, cancellation, audit, + screenshot policy, release-all) so per-primitive methods cannot drift past the + safety contract. The in-process supervisor sits behind a `SupervisorClient` + boundary so an out-of-process daemon can replace it later without changing the + napi surface. 
+ +## Capture + coordinate contract (shipped) + +`crates/pi-natives/src/computer/coords.rs` implements the pure, framework-free +core: `NormalizedDisplay` maps a screenshot-space pixel `(x, y)` to a macOS +logical point via per-axis scale and the display's logical origin, rejecting +out-of-bounds and non-finite inputs. It is unit-tested (scale 1.0/2.0, +fractional and anisotropic scale, non-zero origins, edges, out-of-bounds, +invalid scale) and requires no display or granted permissions. + +`crates/pi-natives/src/computer/capture.rs` (macOS) implements the read-only +`screenshot` primitive: it captures the primary display via CoreGraphics into a +PNG and derives the `NormalizedDisplay` scale from captured physical pixels vs +logical bounds, surfacing a missing Screen Recording grant as +`CaptureError::CaptureFailed` (never a silent black frame). Verified live: a +real, non-uniform primary-display capture decodes as a PNG with matching +dimensions (`cargo test -p pi-natives --ignored captures_non_uniform_primary_display`). + +## Delivery roadmap + +Delivery ships a `screenshot`+`click`+`type` vertical slice first; the remaining +six primitives fast-follow; v1 acceptance = all nine primitives drive a real +macOS app end-to-end plus a kill-switch drill (per-primitive napi unit tests + +manual macOS E2E). 
+ +| Slice | Scope | Status | +|-------|-------|--------| +| Coordinate contract + planning docs | `coords` module + unit tests + this doc | **done (this PR)** | +| Native screen capture (`screenshot`) | `capture` module, primary display, PNG + scale | **done (this PR, verified live)** | +| TCC preflight (`permissions`) | Accessibility + Screen Recording checks, Settings openers, fail-closed guards | **done (this PR, verified live)** | +| napi screenshot binding (`computerScreenshot`) | napi → `packages/natives` → TS, verified live | **done (this PR)** | +| Native input orchestration (`input`) | `InputController` click/double_click/move/drag/scroll/type/keypress + release_all over an `EventSink` | **done (this PR)** — logic unit-tested; **live cursor-move injection verified** (Accessibility granted) | +| Central `execute_action` state machine | preflight + supervisor + cancellation + audit + release-all | planned | +| Kill-switch supervisor + global-hotkey event-tap | `supervisor` (fail-closed `input_allowed`, user-only reset) + `hotkey` CGEventTap on a CFRunLoop thread | **done (this PR)** — supervisor unit-tested; **synthetic-hotkey latch verified live** | +| Supervisor-gated `execute_action` + napi/TS `computer` tool | wire input through `input_allowed` + cancellation; `ComputerController` napi; `computer.ts` schema/gating/prompt/renderer | next | +| Manual macOS E2E acceptance | TextEdit all-nine + kill-switch drill | planned (requires macOS hardware + granted TCC + human operator) | + +The input orchestration and kill-switch supervisor rows above have landed. The +central `execute_action` state machine, the supervisor-gated napi/TS `computer` +tool surface, and manual end-to-end acceptance (which needs event injection into +a live desktop and a human-operated drill) are tracked as follow-up work rather +than landed in this draft. 
diff --git a/docs/tools/computer.md b/docs/tools/computer.md new file mode 100644 index 000000000..f25b5307c --- /dev/null +++ b/docs/tools/computer.md @@ -0,0 +1,71 @@ +# computer + +> Explicitly enabled macOS desktop screenshot and input control through the native supervisor-gated computer controller. + +## Source + +- Entry: `packages/coding-agent/src/tools/computer.ts` +- Model-facing prompt: `packages/coding-agent/src/prompts/tools/computer.md` +- Renderer: `packages/coding-agent/src/tools/computer/render.ts` +- Native controller: `@gajae-code/natives` `ComputerController` + +## Availability + +`computer` is first-class in the product catalog and documentation, but it is not a callable tool by default. + +Callable activation requires all of: + +1. macOS (`process.platform === "darwin"`), and +2. `computer.enabled` or `computer.alwaysOn` set to `true`. + +When disabled, every action including `screenshot` returns `COMPUTER_DISABLED`. Disabled catalog/listing paths do not construct `ComputerController`, start hotkeys, probe Screen Recording, probe Accessibility, capture screenshots, or expose the callable schema to `search_tool_bm25`. + +## Inputs + +The model action object uses an exact snake_case discriminated schema. CamelCase fields are rejected. + +### Shared fields + +| Field | Type | Required | Description | +| --- | --- | --- | --- | +| `action` | see actions below | Yes | Dispatch action. | +| `timeout` | `number` | No | Maximum action time in seconds. | +| `include_screenshot` | `boolean` | No | Request a bounded post-action screenshot when supported. 
| + +### Actions + +| Action | Required fields | Optional fields | +| --- | --- | --- | +| `screenshot` | none | shared | +| `click` | `x`, `y` | `button`, shared | +| `double_click` | `x`, `y` | `button`, shared | +| `move` | `x`, `y` | `button`, shared | +| `drag` | `x`, `y`, `to_x`, `to_y` | `button`, shared | +| `scroll` | `x`, `y`, `scroll_x`, `scroll_y` | shared | +| `type` | `text` | shared | +| `keypress` | `keys` | shared | +| `wait` | `ms` | shared | + +`button` is one of `left`, `right`, or `middle`. + +## Coordinate contract + +`x`, `y`, `to_x`, and `to_y` are screenshot pixels in the latest screenshot coordinate frame. They are not CSS pixels and not normalized fractions. The screenshot result records dimensions, scale, origin, display epoch, and capture id when supplied by native code. Coordinate actions must not clamp invalid coordinates; native code returns `COMPUTER_COORD_INVALID` or `COMPUTER_DISPLAY_STALE` before input when the coordinate/display contract cannot be satisfied. + +## Errors + +Stable computer error codes include: + +- `COMPUTER_DISABLED` +- `COMPUTER_SUSPENDED` +- `COMPUTER_SUPERVISOR_NOT_LIVE` +- `COMPUTER_PERMISSION_REQUIRED` +- `COMPUTER_DISPLAY_STALE` +- `COMPUTER_COORD_INVALID` +- `COMPUTER_CANCELLED` + +TS handles settings/platform exposure and UX mapping. Native `execute_action` remains the side-effect authority for supervisor state, permissions, display freshness, coordinate validation, cancellation, and release-all behavior. + +## Rendering + +The TUI renderer is bounded: it shows action, coordinates, scroll/key/wait summary, screenshot dimensions/byte count/capture id, supervisor status, and error code. It never renders raw screenshot base64. 
diff --git a/packages/coding-agent/src/config/settings-schema.ts b/packages/coding-agent/src/config/settings-schema.ts index 8cb493f24..afcfefb09 100644 --- a/packages/coding-agent/src/config/settings-schema.ts +++ b/packages/coding-agent/src/config/settings-schema.ts @@ -2160,6 +2160,66 @@ export const SETTINGS_SCHEMA = { }, }, + "computer.enabled": { + type: "boolean", + default: false, + ui: { + tab: "tools", + label: "Computer", + description: "Enable the macOS computer tool for this session. Off by default.", + }, + }, + + "computer.alwaysOn": { + type: "boolean", + default: false, + ui: { + tab: "tools", + label: "Computer Always On", + description: "Keep the macOS computer tool callable without per-session enablement.", + }, + }, + + "computer.autoScreenshot": { + type: "boolean", + default: false, + ui: { + tab: "tools", + label: "Computer Auto Screenshot", + description: "Automatically request bounded screenshots after computer actions when supported.", + }, + }, + + "computer.screenshotMaxBytes": { + type: "number", + default: 5_000_000, + ui: { + tab: "tools", + label: "Computer Screenshot Max Bytes", + description: "Maximum screenshot payload size for computer action results.", + }, + }, + + "computer.killSwitchHotkey": { + type: "string", + default: "Control+Option+Command+Escape", + ui: { + tab: "tools", + label: "Computer Kill Switch Hotkey", + description: "Native stop/suspend hotkey shown to users for computer-use sessions.", + }, + }, + + "computer.auditLog.enabled": { + type: "boolean", + default: true, + ui: { + tab: "tools", + label: "Computer Audit Log", + description: "Persist audit records for enabled computer-use actions.", + }, + }, + // Tool execution "tools.intentTracing": { type: "boolean", diff --git a/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts b/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts index c450113d1..df04e5014 100644 --- a/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts +++ 
b/packages/coding-agent/src/gjc-runtime/ultragoal-runtime.ts @@ -801,13 +801,128 @@ function evidenceKindMatches(kind: string, words: string[]): boolean { type SurfaceFamily = "web" | "cli" | "native" | "api-package" | "algorithm-math" | "unknown"; +type UltragoalChangeStatus = "added" | "modified" | "deleted" | "renamed" | "copied" | "unknown"; +type UltragoalChangeCategory = + | "code" + | "generated-binding" + | "tool" + | "settings-registry" + | "prompt-doc-behavior" + | "docs-static" + | "other"; +interface UltragoalChangeSetPath extends JsonObject { + path: string; + status: UltragoalChangeStatus; + oldPath?: string; + category?: UltragoalChangeCategory; +} +interface UltragoalChangeSet extends JsonObject { + source: "checkpoint-git" | "review-pr" | "review-branch" | "review-worktree" | "review-spec"; + baseRef?: string; + headRef?: string; + mergeBase?: string; + paths: UltragoalChangeSetPath[]; + rawDiffStat?: string; + trusted: true; +} + +const COMPUTER_SURFACE_TOKENS = new Set(["computer", "computer-use", "desktop-input", "native-input"]); +const MANDATORY_COMPUTER_CASE_IDS = [ + "kill-switch-bypass", + "suspended-enforcement", + "permission-revoked", + "display-stale", + "out-of-bounds-drift", + "runaway-loop-halt", + "blast-radius", +] as const; + +function normalizeRepoPath(value: string): string { // Canonicalizes a repo-relative path: every backslash separator becomes "/" and one leading "./" is removed. + return value.replaceAll("\\", "/").replace(/^\.\//, ""); // "\\" is a single backslash; the previous "\\\\" literal only matched two consecutive backslashes. +} + +function categorizeComputerChangePath(value: string): UltragoalChangeCategory { + const normalized = normalizeRepoPath(value); + if (normalized.startsWith("crates/pi-natives/src/computer/")) return "code"; + if (/^packages\/natives\/native\/index\.(?:d\.ts|js)$/.test(normalized)) return "generated-binding"; + if ( + normalized === "packages/coding-agent/src/tools/computer.ts" || + normalized.startsWith("packages/coding-agent/src/tools/computer/") + ) + return "tool"; + if ( + normalized === "packages/coding-agent/src/tools/index.ts" || + normalized === 
"packages/coding-agent/src/tools/renderers.ts" || + normalized === "packages/coding-agent/src/config/settings-schema.ts" + ) + return "settings-registry"; + if ( + normalized === "packages/coding-agent/src/prompts/tools/computer.md" || + normalized === "packages/coding-agent/src/defaults/gjc/skills/ultragoal/SKILL.md" || + normalized === "packages/coding-agent/src/prompts/agents/executor.md" + ) + return "prompt-doc-behavior"; + if (normalized === "docs/tools/computer.md" || normalized === "docs/computer-use/README.md") return "docs-static"; + return "other"; +} + +function isComputerChangePath(row: UltragoalChangeSetPath): boolean { + return ( + categorizeComputerChangePath(row.path) !== "other" || + (row.oldPath ? categorizeComputerChangePath(row.oldPath) !== "other" : false) + ); +} + +function isDocsOnlyStaticComputerChangeSet(changeSet: UltragoalChangeSet | undefined): boolean { + if (!changeSet || changeSet.paths.length === 0) return false; + return changeSet.paths.every(row => { + const category = row.category ?? categorizeComputerChangePath(row.path); + const oldCategory = row.oldPath ? categorizeComputerChangePath(row.oldPath) : category; + return category === "docs-static" && oldCategory === "docs-static"; + }); +} + +function trustedChangeSetRequiresComputerSuite(changeSet: UltragoalChangeSet | undefined): boolean { + if (!changeSet || !changeSet.trusted) return false; + if (isDocsOnlyStaticComputerChangeSet(changeSet)) return false; + return changeSet.paths.some(isComputerChangePath); +} + +function executorQaDeclaresComputerTouching(executorQa: JsonObject): boolean { + if (executorQa.computerTouching === true) return true; + const surfaces = Array.isArray(executorQa.surfaces) ? executorQa.surfaces : []; + if (surfaces.some(value => typeof value === "string" && COMPUTER_SURFACE_TOKENS.has(normalizeSurfaceToken(value)))) + return true; + const surfaceRows = Array.isArray(executorQa.surfaceEvidence) ? 
executorQa.surfaceEvidence : []; + return surfaceRows.some(row => { + const object = qualityGateObject(row); + const surface = object ? nonEmptyString(object.surface) : null; + return surface ? COMPUTER_SURFACE_TOKENS.has(normalizeSurfaceToken(surface)) : false; + }); +} + +function requiresComputerRedTeamSuite(executorQa: JsonObject, changeSet: UltragoalChangeSet | undefined): boolean { + if (trustedChangeSetRequiresComputerSuite(changeSet)) return true; + const declaredPaths = Array.isArray(executorQa.changedPaths) ? executorQa.changedPaths : []; + return declaredPaths.some(value => typeof value === "string" && categorizeComputerChangePath(value) !== "other"); +} + +function normalizeAdversarialCaseId(value: string): string { + return normalizeSurfaceToken(value).replace(/\s+/g, "-"); +} + export function normalizeSurfaceToken(value: string): string { return value.toLowerCase().replaceAll("_", "-").trim(); } export function surfaceFamily(value: string): SurfaceFamily { const normalized = normalizeSurfaceToken(value); - if (["native", "desktop", "tui"].some(word => normalized.includes(word))) return "native"; + if ( + ["computer", "computer-use", "desktop-input", "native-input", "native", "desktop", "tui"].some(word => + normalized.includes(word), + ) + ) + return "native"; if (["gui", "web", "browser", "ui", "visual"].some(word => normalized.includes(word))) return "web"; if (["cli", "terminal", "command"].some(word => normalized.includes(word))) return "cli"; if (["api", "package", "library", "sdk"].some(word => normalized.includes(word))) return "api-package"; @@ -1836,12 +1951,92 @@ function validateAdversarialCases( return idMap; } +async function validateMandatoryComputerAdversarialCases( + cwd: string, + contractCoverage: JsonObject[], + adversarialCases: Map, + artifactRefs: Map, +): Promise { + const linkedCaseIds = new Set(); + for (const [index, row] of contractCoverage.entries()) { + const ids = optionalStringLinks(row, "adversarialCaseRefs", 
`executorQa.contractCoverage[${index}]`); + for (const id of ids ?? []) linkedCaseIds.add(normalizeAdversarialCaseId(id)); + } + for (const caseId of MANDATORY_COMPUTER_CASE_IDS) { + const row = adversarialCases.get(caseId); + if (!row) + throw new Error( + `COMPUTER_REDTEAM_CASE_MISSING: qualityGate executorQa.adversarialCases must include ${caseId}`, + ); + if (optionalStatusField(row, `executorQa.adversarialCases.${caseId}`) === NOT_APPLICABLE_STATUS) { + throw new Error( + `COMPUTER_REDTEAM_CASE_NOT_APPLICABLE: mandatory computer adversarial case ${caseId} must not be not_applicable`, + ); + } + if (!linkedCaseIds.has(caseId)) { + throw new Error( + `COMPUTER_REDTEAM_CASE_UNLINKED: mandatory computer adversarial case ${caseId} must be linked from contractCoverage.adversarialCaseRefs`, + ); + } + const artifactIds = requireStringLinks(row.artifactRefs, `executorQa.adversarialCases.${caseId}.artifactRefs`); + let hasValidLiveNativeProof = false; + let sawInlineOnly = false; + let sawReceiptOnly = false; + let sawMetadataOnly = false; + for (const artifactId of artifactIds) { + const artifact = artifactRefs.get(artifactId); + if (!artifact) + throw new Error( + `qualityGate executorQa.adversarialCases.${caseId}.artifactRefs references unknown id ${artifactId}`, + ); + const fieldName = `executorQa.artifactRefs.${artifactId}`; + if (artifact.inlineEvidence !== undefined && !nonEmptyString(artifact.path)) sawInlineOnly = true; + if ( + (artifact.verifiedReceipt !== undefined || artifact.receipt !== undefined) && + !nonEmptyString(artifact.path) + ) + sawReceiptOnly = true; + if ( + !nonEmptyString(artifact.path) && + artifact.inlineEvidence === undefined && + artifact.verifiedReceipt === undefined && + artifact.receipt === undefined + ) + sawMetadataOnly = true; + try { + await validateArtifactProof(cwd, artifact, fieldName, { surfaceFamily: "native", live: true }); + if (await validateStructuralArtifact(cwd, artifact, fieldName, { surfaceFamily: "native", live: 
true })) + hasValidLiveNativeProof = true; + } catch { + // Preserve the explicit computer red-team error taxonomy below. + } + } + if (!hasValidLiveNativeProof) { + if (sawInlineOnly) + throw new Error( + `COMPUTER_REDTEAM_INLINE_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`, + ); + if (sawReceiptOnly) + throw new Error( + `COMPUTER_REDTEAM_RECEIPT_ONLY: mandatory computer adversarial case ${caseId} requires live structural native proof`, + ); + if (sawMetadataOnly) + throw new Error( + `COMPUTER_REDTEAM_ARTIFACT_METADATA_ONLY: mandatory computer adversarial case ${caseId} requires durable live structural native proof`, + ); + throw new Error( + `COMPUTER_REDTEAM_ARTIFACT_MISSING: mandatory computer adversarial case ${caseId} requires at least one valid live structural native proof artifact`, + ); + } + } +} + function validateContractCoverage( executorQa: JsonObject, surfaceEvidence: Map, adversarialCases: Map, artifactRefs: Map, -): void { +): JsonObject[] { const rows = requireObjectArray(executorQa.contractCoverage, "executorQa.contractCoverage"); buildRowIdMap(rows, "executorQa.contractCoverage"); let hasSuccessfulContractCoverage = false; @@ -1892,32 +2087,47 @@ function validateContractCoverage( "qualityGate executorQa.contractCoverage must include at least one row with status covered, passed, or verified", ); } + return rows; } async function validateExecutorQaRedTeamEvidenceInternal( cwd: string, executorQa: JsonObject, - _options: { mode?: "checkpoint" | "review" } = {}, + options: { mode?: "checkpoint" | "review"; changeSet?: UltragoalChangeSet } = {}, ): Promise { const artifactRefs = await validateArtifactRefs(cwd, executorQa); const surfaceEvidence = await validateSurfaceEvidence(cwd, executorQa, artifactRefs); const adversarialCases = validateAdversarialCases(executorQa, artifactRefs); - validateContractCoverage(executorQa, surfaceEvidence, adversarialCases, artifactRefs); + const contractCoverage = 
validateContractCoverage(executorQa, surfaceEvidence, adversarialCases, artifactRefs); + if (requiresComputerRedTeamSuite(executorQa, options.changeSet)) { + await validateMandatoryComputerAdversarialCases(cwd, contractCoverage, adversarialCases, artifactRefs); + } } -async function validateExecutorQaRedTeamEvidence(cwd: string, executorQa: JsonObject): Promise { - await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa, { mode: "checkpoint" }); +async function validateExecutorQaRedTeamEvidence( + cwd: string, + executorQa: JsonObject, + options: { changeSet?: UltragoalChangeSet } = {}, +): Promise { + await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa, { + mode: "checkpoint", + changeSet: options.changeSet, + }); } export async function validateExecutorQaRedTeamEvidenceForReview( cwd: string, executorQa: Record, - options: { mode?: "review" } = {}, + options: { mode?: "review"; changeSet?: UltragoalChangeSet } = {}, ): Promise { await validateExecutorQaRedTeamEvidenceInternal(cwd, executorQa as JsonObject, options); } -async function validateCompletionQualityGate(cwd: string, gate: JsonObject): Promise { +async function validateCompletionQualityGate( + cwd: string, + gate: JsonObject, + options: { changeSet?: UltragoalChangeSet } = {}, +): Promise { const codeReview = qualityGateObject(gate.codeReview); if (codeReview) { throw new Error( @@ -1962,7 +2172,7 @@ async function validateCompletionQualityGate(cwd: string, gate: JsonObject): Pro } requireNonEmptyString(executorQa.evidence, "executorQa.evidence"); requireEmptyBlockers(executorQa.blockers, "executorQa.blockers"); - await validateExecutorQaRedTeamEvidence(cwd, executorQa); + await validateExecutorQaRedTeamEvidence(cwd, executorQa, { changeSet: options.changeSet }); if (iteration.status !== PASSED_STATUS || iteration.fullRerun !== true) { throw new Error("qualityGate iteration must be passed with fullRerun true"); } @@ -1973,7 +2183,11 @@ async function validateCompletionQualityGate(cwd: 
string, gate: JsonObject): Pro requireEmptyBlockers(iteration.blockers, "iteration.blockers"); } -async function readRequiredCompletionQualityGate(cwd: string, value: string | undefined): Promise { +async function readRequiredCompletionQualityGate( + cwd: string, + value: string | undefined, + options: { changeSet?: UltragoalChangeSet } = {}, +): Promise { if (!value?.trim()) { throw new Error( "complete checkpoints require --quality-gate-json with architectReview, executorQa, and iteration evidence", @@ -1982,7 +2196,7 @@ async function readRequiredCompletionQualityGate(cwd: string, value: string | un const gate = await readStructuredValue(cwd, value); const gateObject = qualityGateObject(gate); if (!gateObject) throw new Error("qualityGate must be a JSON object"); - await validateCompletionQualityGate(cwd, gateObject); + await validateCompletionQualityGate(cwd, gateObject, { changeSet: options.changeSet }); return gate; } @@ -2085,9 +2299,10 @@ export async function checkpointUltragoalGoal(input: { // instead of silently dropping it. return plan; } + const changeSet = input.status === "complete" ? await computeCheckpointChangeSet(input.cwd) : undefined; const qualityGateJson = input.status === "complete" - ? await readRequiredCompletionQualityGate(input.cwd, input.qualityGateJson) + ? await readRequiredCompletionQualityGate(input.cwd, input.qualityGateJson, { changeSet }) : input.qualityGateJson ? 
await readStructuredValue(input.cwd, input.qualityGateJson) : undefined; @@ -2686,20 +2901,140 @@ async function resolveGitBase(cwd: string, branch?: string): Promise { } const mergeBase = await spawnText(["git", "merge-base", "HEAD", "origin/main"], { cwd, timeoutMs: 3000 }); if (mergeBase.ok && mergeBase.stdout.trim()) return mergeBase.stdout.trim(); - return "HEAD"; + return "HEAD~1"; +} + +function parseGitNameStatus(output: string): UltragoalChangeSetPath[] { // Parses `git diff --name-status` style rows (status code, then one or two paths) into categorized change-set paths. + const rows: UltragoalChangeSetPath[] = []; + for (const line of output.split("\n")) { + const trimmed = line.trim(); + if (!trimmed) continue; + const parts = trimmed.includes("\t") ? trimmed.split("\t") : trimmed.split(/\s+/); // --name-status rows are tab-separated, so tab splitting keeps paths with spaces intact; space-separated rows (the `git status --short` fallback) still use the whitespace split. + const statusCode = parts[0] ?? ""; + let status: UltragoalChangeStatus = "unknown"; + if (statusCode.startsWith("A")) status = "added"; + else if (statusCode.startsWith("M")) status = "modified"; + else if (statusCode.startsWith("D")) status = "deleted"; + else if (statusCode.startsWith("R")) status = "renamed"; + else if (statusCode.startsWith("C")) status = "copied"; + const pathValue = status === "renamed" || status === "copied" ? parts[2] : parts[1]; // renames/copies list the old path first and the new path second + if (!pathValue) continue; + const oldPath = status === "renamed" || status === "copied" ? parts[1] : undefined; + rows.push({ + path: normalizeRepoPath(pathValue), + oldPath: oldPath ? normalizeRepoPath(oldPath) : undefined, + status, + category: categorizeComputerChangePath(pathValue), + }); + } + return rows; +} + +function mergeChangeSetPaths(groups: UltragoalChangeSetPath[][]): UltragoalChangeSetPath[] { + const byKey = new Map(); + for (const row of groups.flat()) byKey.set(`${row.oldPath ?? 
""}\u0000${row.path}`, row); + return [...byKey.values()]; +} + +async function computeCheckpointChangeSet(cwd: string): Promise { + const inGit = await spawnText(["git", "rev-parse", "--is-inside-work-tree"], { cwd, timeoutMs: 3000 }); + if (!inGit.ok || inGit.stdout.trim() !== "true") return undefined; + if (!(await Bun.file(path.join(cwd, ".git")).exists())) return undefined; + const baseRef = await resolveGitBase(cwd); + const base = baseRef; + const mergeBase = await spawnText(["git", "merge-base", "HEAD", baseRef], { cwd, timeoutMs: 3000 }); + const [committed, unstaged, staged, stat] = await Promise.all([ + spawnText(["git", "diff", "--name-status", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--name-status"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--cached", "--name-status"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--stat", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + ]); + if (!committed.ok && !unstaged.ok && !staged.ok) return undefined; + return { + source: "checkpoint-git", + baseRef, + mergeBase: mergeBase.ok && mergeBase.stdout.trim() ? mergeBase.stdout.trim() : undefined, + headRef: "HEAD", + paths: mergeChangeSetPaths([ + parseGitNameStatus(committed.stdout), + parseGitNameStatus(unstaged.stdout), + parseGitNameStatus(staged.stdout), + ]), + rawDiffStat: stat.stdout, + trusted: true, + }; +} + +function parseUnifiedDiffPaths(diff: string): UltragoalChangeSetPath[] { + const paths: UltragoalChangeSetPath[] = []; + for (const line of diff.split("\n")) { + if (!line.startsWith("diff --git ")) continue; + const match = /^diff --git a\/(.+?) b\/(.+)$/.exec(line); + if (!match) continue; + const oldPath = normalizeRepoPath(match[1]!); + const newPath = normalizeRepoPath(match[2]!); + paths.push({ + path: newPath, + oldPath: oldPath === newPath ? undefined : oldPath, + status: oldPath === newPath ? 
"modified" : "renamed", + category: categorizeComputerChangePath(newPath), + }); + } + return paths; +} + +function changeSetFromReviewSource(source: JsonObject): UltragoalChangeSet | undefined { + const kind = nonEmptyString(source.kind); + if (kind === "spec") return { source: "review-spec", paths: [], trusted: true }; + if (kind === "pr" && typeof source.diff === "string") + return { + source: "review-pr", + paths: parseUnifiedDiffPaths(source.diff), + rawDiffStat: source.diff, + trusted: true, + }; + const local = qualityGateObject(source.local); + if (kind === "pr" && local) return changeSetFromReviewSource(local); + if (kind === "worktree") + return { + source: "review-worktree", + paths: parseGitNameStatus(String(source.nameStatus ?? source.status ?? "")), + rawDiffStat: String(source.diffStat ?? ""), + trusted: true, + }; + if (kind === "branch" || kind === "pr-fallback") + return { + source: "review-branch", + baseRef: nonEmptyString(source.base) ?? undefined, + headRef: "HEAD", + paths: parseGitNameStatus(String(source.nameStatus ?? "")), + rawDiffStat: String(source.diffStat ?? 
""), + trusted: true, + }; + return undefined; } async function localDiffSource(cwd: string, sourceKind: string, branch?: string): Promise { if (sourceKind === "worktree") { - const [status, diff] = await Promise.all([ + const [status, diff, unstaged, staged] = await Promise.all([ spawnText(["git", "status", "--short"], { cwd, timeoutMs: 5000 }), spawnText(["git", "diff", "--stat"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--name-status"], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--cached", "--name-status"], { cwd, timeoutMs: 5000 }), ]); - return { kind: "worktree", status: status.stdout, diffStat: diff.stdout }; + return { + kind: "worktree", + status: status.stdout, + diffStat: diff.stdout, + nameStatus: `${unstaged.stdout}\n${staged.stdout}`, + }; } const base = await resolveGitBase(cwd, branch); - const diff = await spawnText(["git", "diff", "--stat", `${base}...HEAD`], { cwd, timeoutMs: 5000 }); - return { kind: sourceKind, base, branch, diffStat: diff.stdout }; + const [diff, nameStatus] = await Promise.all([ + spawnText(["git", "diff", "--stat", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + spawnText(["git", "diff", "--name-status", `${base}...HEAD`], { cwd, timeoutMs: 5000 }), + ]); + return { kind: sourceKind, base, branch, diffStat: diff.stdout, nameStatus: nameStatus.stdout }; } async function resolveReviewSource( @@ -2813,13 +3148,14 @@ export async function runUltragoalReview(cwd: string, args: readonly string[]): const mode = parseReviewMode(flagValue(args, "--mode")); const specPath = flagValue(args, "--spec"); const { contractStrength, source } = await resolveReviewSource(cwd, args, specPath); + const changeSet = changeSetFromReviewSource(source); const executorQa = await readOptionalExecutorQa( cwd, flagValue(args, "--executor-qa-json") ?? 
flagValue(args, "--executor-qa"), ); const findings: UltragoalReviewFinding[] = []; try { - await validateExecutorQaRedTeamEvidenceForReview(cwd, executorQa, { mode: "review" }); + await validateExecutorQaRedTeamEvidenceForReview(cwd, executorQa, { mode: "review", changeSet }); } catch (error) { findings.push(findingFromError(error)); } diff --git a/packages/coding-agent/src/prompts/tools/computer.md b/packages/coding-agent/src/prompts/tools/computer.md new file mode 100644 index 000000000..f8c3dbc4f --- /dev/null +++ b/packages/coding-agent/src/prompts/tools/computer.md @@ -0,0 +1,32 @@ +# computer + +Use `computer` only when the session has explicitly enabled macOS computer-use. It controls the real desktop and is off by default. + +## Safety contract + +- Disabled means disabled: when `computer.enabled` and `computer.alwaysOn` are both false, every action including `screenshot` fails with `COMPUTER_DISABLED` and captures nothing. +- The tool is macOS-only in v1. +- Native execution remains supervisor-gated. If the stop/suspend supervisor is unavailable, stale, suspended, permissioned off, display-stale, or cancelled, the action fails closed with a `COMPUTER_*` code. +- Respect the user's stop/suspend request immediately. Do not loop desktop actions after a stop/suspend/error. + +## Coordinate contract + +Coordinates are screenshot pixels, not CSS pixels and not normalized fractions. Use the latest successful `screenshot` dimensions and origin/scale metadata as the coordinate frame. Do not guess coordinates outside the screenshot bounds. + +## Actions + +The model action object uses exactly these snake_case actions and fields: + +- `screenshot` — capture the enabled desktop. +- `click` — `x`, `y`, optional `button` (`left`, `right`, `middle`). +- `double_click` — `x`, `y`, optional `button`. +- `move` — `x`, `y`, optional `button`. +- `drag` — `x`, `y`, `to_x`, `to_y`, optional `button`. +- `scroll` — `x`, `y`, `scroll_x`, `scroll_y`. +- `type` — `text`. 
+- `keypress` — `keys` string array. +- `wait` — `ms`. + +Shared optional fields: `timeout` seconds and `include_screenshot` for a bounded post-action screenshot when supported. + +Do not use camelCase fields such as `doubleClick`, `toX`, `scrollX`, or `includeScreenshot` in the model action object. diff --git a/packages/coding-agent/src/tools/computer.ts b/packages/coding-agent/src/tools/computer.ts new file mode 100644 index 000000000..10344f0b5 --- /dev/null +++ b/packages/coding-agent/src/tools/computer.ts @@ -0,0 +1,403 @@ +import type { AgentTool, AgentToolContext, AgentToolResult, AgentToolUpdateCallback } from "@gajae-code/agent-core"; +import { prompt } from "@gajae-code/utils"; +import * as z from "zod/v4"; +import computerDescription from "../prompts/tools/computer.md" with { type: "text" }; +import type { ToolSession } from "./index"; +import type { OutputMeta } from "./output-meta"; +import { ToolAbortError, ToolError, throwIfAborted } from "./tool-errors"; +import { toolResult } from "./tool-result"; +import { clampTimeout } from "./tool-timeouts"; + +const buttonSchema = z.enum(["left", "right", "middle"]); +const shared = { + timeout: z.number().positive().optional().describe("Maximum time in seconds for this action."), + include_screenshot: z.boolean().optional().describe("Capture a bounded post-action screenshot when supported."), +}; + +const screenshotSchema = z.object({ action: z.literal("screenshot"), ...shared }).strict(); +const clickSchema = z + .object({ action: z.literal("click"), x: z.number(), y: z.number(), button: buttonSchema.optional(), ...shared }) + .strict(); +const doubleClickSchema = z + .object({ + action: z.literal("double_click"), + x: z.number(), + y: z.number(), + button: buttonSchema.optional(), + ...shared, + }) + .strict(); +const moveSchema = z + .object({ action: z.literal("move"), x: z.number(), y: z.number(), button: buttonSchema.optional(), ...shared }) + .strict(); +const dragSchema = z + .object({ + action: 
z.literal("drag"), + x: z.number(), + y: z.number(), + to_x: z.number(), + to_y: z.number(), + button: buttonSchema.optional(), + ...shared, + }) + .strict(); +const scrollSchema = z + .object({ + action: z.literal("scroll"), + x: z.number(), + y: z.number(), + scroll_x: z.number(), + scroll_y: z.number(), + ...shared, + }) + .strict(); +const typeSchema = z.object({ action: z.literal("type"), text: z.string(), ...shared }).strict(); +const keypressSchema = z + .object({ action: z.literal("keypress"), keys: z.array(z.string()).min(1), ...shared }) + .strict(); +const waitSchema = z.object({ action: z.literal("wait"), ms: z.number().int().nonnegative(), ...shared }).strict(); + +export const computerSchema = z.discriminatedUnion("action", [ + screenshotSchema, + clickSchema, + doubleClickSchema, + moveSchema, + dragSchema, + scrollSchema, + typeSchema, + keypressSchema, + waitSchema, +]); + +export type ComputerParams = z.infer; +export type ComputerActionName = ComputerParams["action"]; + +export interface ComputerScreenshotDetails { + widthPx: number; + heightPx: number; + scaleX?: number; + scaleY?: number; + originX?: number; + originY?: number; + displayEpoch?: string; + captureId?: string; + pngBytes?: number; +} + +export interface ComputerToolDetails { + action: ComputerActionName; + status: "success" | "disabled" | "error"; + code?: string; + message?: string; + x?: number; + y?: number; + toX?: number; + toY?: number; + scrollX?: number; + scrollY?: number; + button?: string; + keys?: string[]; + ms?: number; + screenshot?: ComputerScreenshotDetails; + supervisor?: string; + meta?: OutputMeta; +} + +type NativeController = { + screenshot?: (payload?: unknown, options?: { signal?: AbortSignal }) => Promise | NativeScreenshot; + click?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + doubleClick?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + move?: (payload: unknown, options?: { signal?: 
AbortSignal }) => Promise | unknown; + drag?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + scroll?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + type?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + keypress?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; + wait?: (payload: unknown, options?: { signal?: AbortSignal }) => Promise | unknown; +}; + +type NativeScreenshot = { + png?: Uint8Array | Buffer | ArrayBuffer | string; + widthPx?: number; + heightPx?: number; + scaleX?: number; + scaleY?: number; + originX?: number; + originY?: number; + displayEpoch?: string; + captureId?: string; +}; + +export type ComputerControllerFactory = () => NativeController; + +export const COMPUTER_DISABLED_CODE = "COMPUTER_DISABLED"; + +const NATIVE_ERROR_CODES = new Set([ + "COMPUTER_SUSPENDED", + "COMPUTER_SUPERVISOR_NOT_LIVE", + "COMPUTER_PERMISSION_REQUIRED", + "COMPUTER_DISPLAY_STALE", + "COMPUTER_COORD_INVALID", + "COMPUTER_CANCELLED", +]); + +function createNativeComputerController(): NativeController { + const natives = require("@gajae-code/natives") as { ComputerController?: new () => NativeController }; + if (!natives.ComputerController) { + throw new ToolError("ComputerController is unavailable in @gajae-code/natives.", { + code: "COMPUTER_UNAVAILABLE", + }); + } + return new natives.ComputerController(); +} + +let controllerFactory: ComputerControllerFactory = createNativeComputerController; +let platformOverrideForTests: NodeJS.Platform | undefined; + +export function setComputerControllerFactoryForTests(factory: ComputerControllerFactory | undefined): void { + controllerFactory = factory ?? 
createNativeComputerController; +} + +export function setComputerPlatformForTests(platform: NodeJS.Platform | undefined): void { + platformOverrideForTests = platform; +} + +function currentComputerPlatform(): NodeJS.Platform { + return platformOverrideForTests ?? process.platform; +} + +export function isComputerSupportedPlatform(platform: NodeJS.Platform = currentComputerPlatform()): boolean { + return platform === "darwin"; +} + +/** + * Whether the computer capability is loaded/advertised at all on this platform. + * macOS is callable; Linux is listable (support planned); Windows is fully absent. Defaults to the same test-overridable platform as the sibling checks. + */ +export function isComputerLoadablePlatform(platform: NodeJS.Platform = currentComputerPlatform()): boolean { + return platform !== "win32"; +} + +export function isComputerEnabled(session: Pick): boolean { + return Boolean(session.settings.get("computer.enabled") || session.settings.get("computer.alwaysOn")); +} + +export function isComputerCallable( + session: Pick, + platform: NodeJS.Platform = currentComputerPlatform(), +): boolean { + return isComputerSupportedPlatform(platform) && isComputerEnabled(session); +} + +export class ComputerTool implements AgentTool { + readonly name = "computer"; + readonly label = "Computer"; + readonly loadMode = "discoverable"; + readonly summary = + "Control the explicitly enabled macOS desktop with screenshot, pointer, keyboard, scroll, and wait actions"; + readonly parameters = computerSchema; + readonly strict = true; + #description?: string; + + constructor(private readonly session: ToolSession) {} + + static createIf(session: ToolSession): ComputerTool | null { + return isComputerCallable(session) ? 
new ComputerTool(session) : null; + } + + get description(): string { + this.#description ??= prompt.render(computerDescription, {}); + return this.#description; + } + + async execute( + _toolCallId: string, + params: ComputerParams, + signal?: AbortSignal, + _onUpdate?: AgentToolUpdateCallback, + _ctx?: AgentToolContext, + ): Promise> { + const details = detailsFromParams(params); + if (!isComputerCallable(this.session)) { + details.status = "disabled"; + details.code = COMPUTER_DISABLED_CODE; + details.message = + "The computer tool is disabled. Enable computer.enabled or computer.alwaysOn on macOS to use it."; + return { ...toolResult(details).text(`${COMPUTER_DISABLED_CODE}: ${details.message}`).done(), isError: true }; + } + + try { + throwIfAborted(signal); + const timeoutSeconds = clampTimeout("computer", params.timeout); + const timeoutSignal = timeoutSeconds > 0 ? AbortSignal.timeout(timeoutSeconds * 1000) : undefined; + const combinedSignal = + signal && timeoutSignal ? AbortSignal.any([signal, timeoutSignal]) : (signal ?? timeoutSignal); + const result = await dispatchComputerAction(controllerFactory(), params, combinedSignal); + const screenshot = normalizeScreenshot(result); + if (screenshot) details.screenshot = screenshot; + details.status = "success"; + details.message = describeComputerSuccess(details); + return toolResult(details).text(details.message).done(); + } catch (error) { + if (error instanceof ToolAbortError) throw error; + const mapped = mapComputerError(error); + details.status = mapped.code === COMPUTER_DISABLED_CODE ? 
"disabled" : "error"; + details.code = mapped.code; + details.message = mapped.message; + return { ...toolResult(details).text(`${mapped.code}: ${mapped.message}`).done(), isError: true }; + } + } +} + +/** + * Routes a validated action to the matching controller method, translating the snake_case model fields into the camelCase payload keys. + * Throws ToolError with code COMPUTER_UNAVAILABLE when the controller does not provide that method. + */ +async function dispatchComputerAction( + controller: NativeController, + params: ComputerParams, + signal?: AbortSignal, +): Promise { + const options = { signal }; + // Fail closed: without this check the optional calls below would return undefined and the caller would report success for an action that never ran. + const method = params.action === "double_click" ? "doubleClick" : params.action; + if (typeof controller[method] !== "function") { + throw new ToolError(`ComputerController does not implement ${method}.`, { code: "COMPUTER_UNAVAILABLE" }); + } + switch (params.action) { + case "screenshot": + return controller.screenshot?.( + { timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + case "click": + return controller.click?.( + { + x: params.x, + y: params.y, + button: params.button ?? "left", + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "double_click": + return controller.doubleClick?.( + { + x: params.x, + y: params.y, + button: params.button ?? "left", + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "move": + return controller.move?.( + { + x: params.x, + y: params.y, + button: params.button, + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "drag": + return controller.drag?.( + { + x: params.x, + y: params.y, + toX: params.to_x, + toY: params.to_y, + button: params.button ?? 
"left", + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "scroll": + return controller.scroll?.( + { + x: params.x, + y: params.y, + scrollX: params.scroll_x, + scrollY: params.scroll_y, + timeoutMs: secondsToMs(params.timeout), + includeScreenshot: params.include_screenshot, + }, + options, + ); + case "type": + return controller.type?.( + { text: params.text, timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + case "keypress": + return controller.keypress?.( + { keys: params.keys, timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + case "wait": + return controller.wait?.( + { ms: params.ms, timeoutMs: secondsToMs(params.timeout), includeScreenshot: params.include_screenshot }, + options, + ); + } +} + +function detailsFromParams(params: ComputerParams): ComputerToolDetails { + const details: ComputerToolDetails = { action: params.action, status: "success" }; + if ("x" in params) details.x = params.x; + if ("y" in params) details.y = params.y; + if ("to_x" in params) details.toX = params.to_x; + if ("to_y" in params) details.toY = params.to_y; + if ("scroll_x" in params) details.scrollX = params.scroll_x; + if ("scroll_y" in params) details.scrollY = params.scroll_y; + if ("button" in params) details.button = params.button; + if ("keys" in params) details.keys = params.keys; + if ("ms" in params) details.ms = params.ms; + return details; +} + +function secondsToMs(seconds: number | undefined): number | undefined { + return typeof seconds === "number" ? seconds * 1000 : undefined; +} + +function normalizeScreenshot(value: unknown): ComputerScreenshotDetails | undefined { + const candidate = + value && typeof value === "object" && "screenshot" in value + ? 
(value as { screenshot?: unknown }).screenshot + : value; + if (!candidate || typeof candidate !== "object") return undefined; + const shot = candidate as NativeScreenshot; + if (typeof shot.widthPx !== "number" || typeof shot.heightPx !== "number") return undefined; + return { + widthPx: shot.widthPx, + heightPx: shot.heightPx, + scaleX: shot.scaleX, + scaleY: shot.scaleY, + originX: shot.originX, + originY: shot.originY, + displayEpoch: shot.displayEpoch, + captureId: shot.captureId, + pngBytes: getPngByteLength(shot.png), + }; +} + +function getPngByteLength(png: NativeScreenshot["png"]): number | undefined { + if (png === undefined) return undefined; + if (typeof png === "string") return Buffer.byteLength(png, "base64"); + if (png instanceof ArrayBuffer) return png.byteLength; + return png.byteLength; +} + +function mapComputerError(error: unknown): { code: string; message: string } { + if (error instanceof Error && error.name === "AbortError") { + return { code: "COMPUTER_CANCELLED", message: "Computer action was cancelled." }; + } + const maybe = error as { code?: unknown; message?: unknown }; + const rawCode = typeof maybe?.code === "string" ? maybe.code : undefined; + const code = + rawCode && (NATIVE_ERROR_CODES.has(rawCode) || rawCode.startsWith("COMPUTER_")) ? rawCode : "COMPUTER_ERROR"; + const message = + typeof maybe?.message === "string" && maybe.message.length > 0 ? 
maybe.message : "Computer action failed."; + return { code, message }; +} + +function describeComputerSuccess(details: ComputerToolDetails): string { + if (details.screenshot) { + return `Computer ${details.action} completed (${details.screenshot.widthPx}x${details.screenshot.heightPx}).`; + } + return `Computer ${details.action} completed.`; +} diff --git a/packages/coding-agent/src/tools/computer/render.ts b/packages/coding-agent/src/tools/computer/render.ts new file mode 100644 index 000000000..592d6c1a3 --- /dev/null +++ b/packages/coding-agent/src/tools/computer/render.ts @@ -0,0 +1,68 @@ +import type { Component } from "@gajae-code/tui"; +import { Text } from "@gajae-code/tui"; +import type { RenderResultOptions } from "../../extensibility/custom-tools/types"; +import type { Theme } from "../../modes/theme/theme"; +import type { ComputerToolDetails } from "../computer"; +import { formatBadge, formatErrorMessage } from "../render-utils"; + +function asRecord(value: unknown): Record { + return value && typeof value === "object" ? (value as Record) : {}; +} + +function summarizeArgs(args: unknown): string { + const input = asRecord(args); + const action = typeof input.action === "string" ? input.action : "computer"; + const parts = [action]; + if (typeof input.x === "number" && typeof input.y === "number") parts.push(`@ ${input.x},${input.y}`); + if (typeof input.to_x === "number" && typeof input.to_y === "number") parts.push(`→ ${input.to_x},${input.to_y}`); + if (typeof input.scroll_x === "number" || typeof input.scroll_y === "number") { + parts.push(`scroll ${input.scroll_x ?? 0},${input.scroll_y ?? 0}`); + } + if (Array.isArray(input.keys)) parts.push(`keys ${input.keys.join("+")}`); + if (typeof input.ms === "number") parts.push(`${input.ms}ms`); + return parts.join(" "); +} + +export function summarizeComputerDetails( + details: ComputerToolDetails | undefined, + isError: boolean, + theme: Theme, +): string { + if (!details) return isError ? 
"Computer action failed" : "Computer action completed"; + const parts: string[] = [details.action]; + if (details.x !== undefined && details.y !== undefined) parts.push(`@ ${details.x},${details.y}`); + if (details.toX !== undefined && details.toY !== undefined) parts.push(`→ ${details.toX},${details.toY}`); + if (details.scrollX !== undefined || details.scrollY !== undefined) + parts.push(`scroll ${details.scrollX ?? 0},${details.scrollY ?? 0}`); + if (details.screenshot) { + const shot = details.screenshot; + parts.push(`screenshot ${shot.widthPx}x${shot.heightPx}`); + if (shot.pngBytes !== undefined) parts.push(`${shot.pngBytes} bytes`); + if (shot.captureId) parts.push(`capture ${shot.captureId}`); + } + if (details.supervisor) parts.push(`supervisor ${details.supervisor}`); + if (details.code) parts.push(theme.fg(isError ? "error" : "muted", details.code)); + return parts.join(" "); +} + +export const computerToolRenderer = { + renderCall(args: unknown, _options: RenderResultOptions, theme: Theme): Component { + return new Text(`${formatBadge("computer", "accent", theme)} ${summarizeArgs(args)}`); + }, + renderResult( + result: { content: Array<{ type: string; text?: string }>; details?: unknown; isError?: boolean }, + _options: RenderResultOptions, + theme: Theme, + ): Component { + if (result.isError) { + const details = result.details as ComputerToolDetails | undefined; + return new Text( + formatErrorMessage(details?.message ?? 
result.content.find(c => c.type === "text")?.text, theme), + ); + } + return new Text( + `${formatBadge("computer", "success", theme)} ${summarizeComputerDetails(result.details as ComputerToolDetails | undefined, false, theme)}`, + ); + }, + mergeCallAndResult: true, +}; diff --git a/packages/coding-agent/src/tools/index.ts b/packages/coding-agent/src/tools/index.ts index 0156e1acd..c9281fa6d 100644 --- a/packages/coding-agent/src/tools/index.ts +++ b/packages/coding-agent/src/tools/index.ts @@ -36,6 +36,7 @@ import { BashTool } from "./bash"; import { BrowserTool } from "./browser"; import { CalculatorTool } from "./calculator"; import { type CheckpointState, CheckpointTool, RewindTool } from "./checkpoint"; +import { ComputerTool, isComputerCallable, isComputerLoadablePlatform } from "./computer"; import { CronCreateTool, CronDeleteTool, CronListTool } from "./cron"; import { DebugTool } from "./debug"; import { EvalTool } from "./eval"; @@ -73,6 +74,7 @@ export * from "./bash"; export * from "./browser"; export * from "./calculator"; export * from "./checkpoint"; +export * from "./computer"; export * from "./cron"; export * from "./debug"; export * from "./eval"; @@ -312,6 +314,29 @@ export function computeEssentialBuiltinNames(settings: Settings): string[] { * Hindsight memory helpers are intentionally excluded: memory is a private backend * integration, not a public gajae-code tool surface. */ +export interface BuiltinCapabilityCatalogEntry { + name: string; + label: string; + summary: string; + docsPath: string; + callableBuiltin: boolean; + defaultEnabled: boolean; +} + +export const BUILTIN_CAPABILITY_CATALOG: readonly BuiltinCapabilityCatalogEntry[] = isComputerLoadablePlatform() + ? 
[ + { + name: "computer", + label: "Computer", + summary: + "Explicitly enabled macOS desktop screenshot and input control; off by default and supervisor-gated.", + docsPath: "docs/tools/computer.md", + callableBuiltin: false, + defaultEnabled: false, + }, + ] + : []; + export const BUILTIN_TOOLS: Record = { read: s => new ReadTool(s), bash: s => new BashTool(s), @@ -330,6 +355,7 @@ export const BUILTIN_TOOLS: Record = { lsp: LspTool.createIf, inspect_image: s => new InspectImageTool(s), browser: s => new BrowserTool(s), + ...(isComputerLoadablePlatform() ? { computer: ComputerTool.createIf } : {}), checkpoint: CheckpointTool.createIf, rewind: RewindTool.createIf, task: s => TaskTool.create(s), @@ -504,6 +530,7 @@ export async function createTools(session: ToolSession, toolNames?: string[]): P if (name === "calc") return session.settings.get("calc.enabled"); if (name === "skill") return session.settings.get("skill.enabled"); if (name === "browser") return session.settings.get("browser.enabled"); + if (name === "computer") return isComputerCallable(session); if (name === "checkpoint" || name === "rewind") return session.settings.get("checkpoint.enabled"); if (name === "irc") { if (!session.settings.get("irc.enabled")) return false; diff --git a/packages/coding-agent/src/tools/renderers.ts b/packages/coding-agent/src/tools/renderers.ts index d1d803f44..c23f965af 100644 --- a/packages/coding-agent/src/tools/renderers.ts +++ b/packages/coding-agent/src/tools/renderers.ts @@ -17,6 +17,7 @@ import { astGrepToolRenderer } from "./ast-grep"; import { bashToolRenderer } from "./bash"; import { browserToolRenderer } from "./browser/render"; import { calculatorToolRenderer } from "./calculator"; +import { computerToolRenderer } from "./computer/render"; import { debugToolRenderer } from "./debug"; import { evalToolRenderer } from "./eval"; import { findToolRenderer } from "./find"; @@ -52,6 +53,7 @@ export const toolRenderers: Record = { ast_edit: astEditToolRenderer as 
ToolRenderer, bash: bashToolRenderer as ToolRenderer, browser: browserToolRenderer as ToolRenderer, + computer: computerToolRenderer as ToolRenderer, recipe: recipeToolRenderer as ToolRenderer, debug: debugToolRenderer as ToolRenderer, eval: evalToolRenderer as ToolRenderer, diff --git a/packages/coding-agent/src/tools/tool-timeouts.ts b/packages/coding-agent/src/tools/tool-timeouts.ts index cbd6ddc54..0f91ccea6 100644 --- a/packages/coding-agent/src/tools/tool-timeouts.ts +++ b/packages/coding-agent/src/tools/tool-timeouts.ts @@ -11,6 +11,7 @@ export const TOOL_TIMEOUTS = { bash: { default: 300, min: 1, max: 3600 }, eval: { default: 30, min: 1, max: 600 }, browser: { default: 30, min: 1, max: 300 }, + computer: { default: 30, min: 1, max: 300 }, ssh: { default: 60, min: 1, max: 3600 }, fetch: { default: 20, min: 1, max: 45 }, lsp: { default: 20, min: 5, max: 60 }, diff --git a/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts b/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts new file mode 100644 index 000000000..ba95ef0a8 --- /dev/null +++ b/packages/coding-agent/test/gjc-runtime/computer-red-team-fixtures.test.ts @@ -0,0 +1,335 @@ +import { afterEach, describe, expect, it } from "bun:test"; +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; +import { deflateSync } from "node:zlib"; + +import { + createUltragoalPlan, + runNativeUltragoalCommand, + startNextUltragoalGoal, +} from "@gajae-code/coding-agent/gjc-runtime/ultragoal-runtime"; + +const tempRoots: string[] = []; + +async function tempDir(): Promise { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "gjc-computer-red-team-")); + tempRoots.push(root); + return root; +} + +afterEach(async () => { + await Promise.all(tempRoots.splice(0).map(dir => fs.rm(dir, { recursive: true, force: true }))); +}); + +async function runGit(cwd: string, args: string[]): Promise { + const proc = Bun.spawn(["git", 
...args], { cwd, stdout: "pipe", stderr: "pipe" }); + const [stdout, stderr, exitCode] = await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + proc.exited, + ]); + if (exitCode !== 0) throw new Error(`git ${args.join(" ")} failed: ${stdout}${stderr}`); +} + +async function initRepo(root: string): Promise { + await runGit(root, ["init"]); + await runGit(root, ["config", "user.email", "test@example.com"]); + await runGit(root, ["config", "user.name", "Test User"]); + await fs.writeFile(path.join(root, "README.md"), "base\n"); + await runGit(root, ["add", "README.md"]); + await runGit(root, ["commit", "-m", "base"]); + await runGit(root, ["branch", "-M", "main"]); +} + +const PNG_SIGNATURE = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); +const PNG_CRC_TABLE = new Uint32Array(256).map((_, index) => { + let crc = index; + for (let bit = 0; bit < 8; bit++) crc = crc & 1 ? 0xedb88320 ^ (crc >>> 1) : crc >>> 1; + return crc >>> 0; +}); + +function pngCrc32(bytes: Buffer): number { + let crc = 0xffffffff; + for (const byte of bytes) crc = PNG_CRC_TABLE[(crc ^ byte) & 0xff]! 
^ (crc >>> 8); + return (crc ^ 0xffffffff) >>> 0; +} + +function pngChunk(type: string, data = Buffer.alloc(0)): Buffer { + const typeBytes = Buffer.from(type, "ascii"); + const length = Buffer.alloc(4); + length.writeUInt32BE(data.length, 0); + const crc = Buffer.alloc(4); + crc.writeUInt32BE(pngCrc32(Buffer.concat([typeBytes, data])), 0); + return Buffer.concat([length, typeBytes, data, crc]); +} + +function syntheticPng(): Buffer { + const width = 320; + const height = 180; + const ihdr = Buffer.alloc(13); + ihdr.writeUInt32BE(width, 0); + ihdr.writeUInt32BE(height, 4); + ihdr[8] = 8; + ihdr[9] = 2; + const raw = Buffer.alloc((width * 3 + 1) * height); + for (let y = 0; y < height; y++) { + const row = y * (width * 3 + 1); + raw[row] = 0; + for (let x = 0; x < width; x++) { + const offset = row + 1 + x * 3; + raw[offset] = x % 256; + raw[offset + 1] = y % 256; + raw[offset + 2] = (x + y) % 256; + } + } + return Buffer.concat([PNG_SIGNATURE, pngChunk("IHDR", ihdr), pngChunk("IDAT", deflateSync(raw)), pngChunk("IEND")]); +} + +let activeObjective = ""; +async function seedPlan(root: string): Promise { + const created = await createUltragoalPlan({ + cwd: root, + brief: "@goal computer gate fixture", + }); + await runGit(root, ["add", ".gjc/ultragoal/goals.json", ".gjc/ultragoal/ledger.jsonl"]); + await runGit(root, ["commit", "-m", "plan"]); + activeObjective = created.gjcObjective; + await startNextUltragoalGoal({ cwd: root }); +} + +function goalSnapshot(): string { + return JSON.stringify({ + goal: { + threadId: "test-thread", + objective: activeObjective, + status: "active", + createdAt: Date.now(), + updatedAt: Date.now(), + }, + }); +} + +function artifact(kind = "native screenshot"): Record { + return { id: "surface-proof", kind, description: "live structural native proof", path: "artifacts/native.png" }; +} + +const CASES = [ + "kill-switch-bypass", + "suspended-enforcement", + "permission-revoked", + "display-stale", + "out-of-bounds-drift", + 
"runaway-loop-halt", + "blast-radius", +]; + +function executorQa( + overrides: { + cases?: Record[]; + artifacts?: Record[]; + computerTouching?: boolean; + surface?: string; + } = {}, +): Record { + const cases = + overrides.cases ?? + CASES.map(id => ({ + id, + status: "passed", + contractRef: "computer-safety", + scenario: `${id} adversarial scenario exercises the computer safety boundary`, + expectedBehavior: "fail closed before unsafe desktop input can continue", + verdict: "passed", + artifactRefs: ["case-proof"], + })); + return { + status: "passed", + e2eStatus: "passed", + redTeamStatus: "passed", + evidence: "executor QA covered the requested contract with durable proof artifacts", + e2eCommands: ["bun test fixture"], + redTeamCommands: ["bun test fixture"], + changedPaths: overrides.computerTouching === true ? ["crates/pi-natives/src/computer/executor.rs"] : undefined, + computerTouching: overrides.computerTouching, + artifactRefs: overrides.artifacts ?? [ + artifact("native screenshot"), + { ...artifact("native screenshot"), id: "case-proof" }, + ], + surfaceEvidence: [ + { + id: "surface-native", + contractRef: "computer-safety", + surface: overrides.surface ?? 
"native", + status: "passed", + invocation: "native fixture invocation", + verdict: "passed", + artifactRefs: ["surface-proof"], + }, + ], + adversarialCases: cases, + contractCoverage: [ + { + id: "coverage", + contractRef: "computer-safety", + status: "covered", + obligation: "all mandatory computer red-team cases are covered", + surfaceEvidenceRefs: ["surface-native"], + adversarialCaseRefs: cases.map(row => String(row.id)), + }, + ], + blockers: [], + }; +} + +function qualityGate(qa: Record): string { + return JSON.stringify({ + architectReview: { + architectureStatus: "CLEAR", + productStatus: "CLEAR", + codeStatus: "CLEAR", + recommendation: "APPROVE", + commands: ["review"], + evidence: "architect review passed with no blockers", + blockers: [], + }, + executorQa: qa, + iteration: { + status: "passed", + fullRerun: true, + rerunCommands: ["bun test fixture"], + evidence: "targeted fixture rerun passed", + blockers: [], + }, + }); +} + +async function writeQaArtifacts(root: string): Promise { + await fs.mkdir(path.join(root, "artifacts"), { recursive: true }); + await fs.writeFile(path.join(root, "artifacts/native.png"), syntheticPng()); +} + +async function checkpoint(root: string, qa: Record): Promise { + const result = await runNativeUltragoalCommand( + [ + "checkpoint", + "--goal-id", + "G001", + "--status", + "complete", + "--evidence", + "fixture complete", + "--gjc-goal-json", + goalSnapshot(), + "--quality-gate-json", + qualityGate(qa), + ], + root, + ); + return (result.stderr ?? "") + (result.stdout ?? 
""); +} + +async function seedComputerChange(root: string, file = "crates/pi-natives/src/computer/executor.rs"): Promise { + await fs.mkdir(path.dirname(path.join(root, file)), { recursive: true }); + await fs.writeFile(path.join(root, file), "// computer change\n"); + await runGit(root, ["add", file]); +} + +describe("computer red-team fixture matrix", () => { + it("preserves non-computer validation when unchanged", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + expect(await checkpoint(root, executorQa())).toContain("Checkpointed G001 as complete"); + }); + + it("fails computer code change missing a mandatory case", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root); + const message = await checkpoint( + root, + executorQa({ + computerTouching: true, + cases: (executorQa().adversarialCases as Record[]).filter( + row => row.id !== "blast-radius", + ), + }), + ).catch(error => String(error)); + expect(message).toContain("COMPUTER_REDTEAM_CASE_MISSING"); + }); + + it("fails not_applicable on a mandatory case", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root); + const cases = CASES.map(id => ({ + id, + status: id === "blast-radius" ? 
"not_applicable" : "passed", + contractRef: "computer-safety", + scenario: "scenario text", + expectedBehavior: "expected behavior", + verdict: "passed", + artifactRefs: ["case-proof"], + })); + const message = await checkpoint(root, executorQa({ cases })).catch(error => String(error)); + expect(message).toContain("not_applicable"); + }); + + it("fails mandatory case with inline-only metadata artifact", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await seedComputerChange(root); + await writeQaArtifacts(root); + const message = await checkpoint( + root, + executorQa({ + computerTouching: true, + artifacts: [ + artifact("native screenshot"), + { + id: "case-proof", + kind: "native metadata", + description: "inline only", + inlineEvidence: "inline proof is not durable live structural evidence", + }, + ], + }), + ).catch(error => String(error)); + expect(message).toContain("COMPUTER_REDTEAM_INLINE_ONLY"); + }); + + it("passes full valid computer gate", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root); + expect(await checkpoint(root, executorQa({ computerTouching: true }))).toContain("Checkpointed G001 as complete"); + }); + + it("does not trigger from declaration-only without trusted computer change", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + const qa = executorQa({ computerTouching: false, surface: "native" }); + expect(await checkpoint(root, qa)).toContain("Checkpointed G001 as complete"); + }); + + it("allows non-operational docs-only computer tiering", async () => { + const root = await tempDir(); + await initRepo(root); + await seedPlan(root); + await writeQaArtifacts(root); + await seedComputerChange(root, "docs/computer-use/README.md"); + const qa = executorQa({ computerTouching: false, surface: "native" }); + 
expect(await checkpoint(root, qa)).toContain("Checkpointed G001 as complete"); + }); +}); diff --git a/packages/coding-agent/test/tool-discovery/initial-tools.test.ts b/packages/coding-agent/test/tool-discovery/initial-tools.test.ts index 307a42f60..3c31a1a44 100644 --- a/packages/coding-agent/test/tool-discovery/initial-tools.test.ts +++ b/packages/coding-agent/test/tool-discovery/initial-tools.test.ts @@ -5,7 +5,9 @@ import { AgentRegistry, MAIN_AGENT_ID } from "../../src/registry/agent-registry" import type { ToolSession } from "../../src/tools/index"; import { AskTool, + BUILTIN_CAPABILITY_CATALOG, BUILTIN_TOOLS, + ComputerTool, computeEssentialBuiltinNames, createTools, DEFAULT_ESSENTIAL_TOOL_NAMES, @@ -90,6 +92,7 @@ async function getToolMetadata(): Promise [tool.name, { loadMode: tool.loadMode, summary: tool.summary }])); for (const tool of [ new AskTool({ ...toolSession, hasUI: true }), + new ComputerTool(toolSession), new SshTool(toolSession, [], new Map(), ""), new JobTool(toolSession), new RecipeTool(toolSession, []), @@ -97,6 +100,13 @@ async function getToolMetadata(): Promise entry.name === "computer"); + if (computerCapability) { + metadata.set("computer", { loadMode: "discoverable", summary: computerCapability.summary }); + } return metadata; } describe("BUILTIN_TOOLS public factory map", () => { diff --git a/packages/coding-agent/test/tools/computer.test.ts b/packages/coding-agent/test/tools/computer.test.ts new file mode 100644 index 000000000..28fdcbf9c --- /dev/null +++ b/packages/coding-agent/test/tools/computer.test.ts @@ -0,0 +1,197 @@ +import { afterEach, describe, expect, it } from "bun:test"; +import { Settings } from "@gajae-code/coding-agent/config/settings"; +import { + BUILTIN_CAPABILITY_CATALOG, + ComputerTool, + computerSchema, + createTools, + isComputerCallable, + isComputerLoadablePlatform, + setComputerControllerFactoryForTests, + setComputerPlatformForTests, + type ToolSession, +} from "@gajae-code/coding-agent/tools"; +import { 
summarizeComputerDetails } from "@gajae-code/coding-agent/tools/computer/render"; +import { toolRenderers } from "@gajae-code/coding-agent/tools/renderers"; + +function createSession(settings = Settings.isolated()): ToolSession { + return { + cwd: "/tmp/test", + hasUI: false, + getSessionFile: () => null, + getSessionSpawns: () => "*", + settings, + }; +} + +function textOf(result: { content: Array<{ type: string; text?: string }> }): string { + return result.content.map(c => c.text ?? "").join("\n"); +} + +describe("computer tool schema", () => { + const validCases = [ + { action: "screenshot" }, + { action: "click", x: 1, y: 2, button: "left" }, + { action: "double_click", x: 1, y: 2, button: "right" }, + { action: "move", x: 1, y: 2, button: "middle" }, + { action: "drag", x: 1, y: 2, to_x: 3, to_y: 4 }, + { action: "scroll", x: 1, y: 2, scroll_x: 0, scroll_y: -10 }, + { action: "type", text: "hello" }, + { action: "keypress", keys: ["Meta", "K"] }, + { action: "wait", ms: 250 }, + ]; + + it("accepts exactly the nine OpenAI snake_case actions", () => { + expect(validCases.map(value => computerSchema.parse(value).action)).toEqual([ + "screenshot", + "click", + "double_click", + "move", + "drag", + "scroll", + "type", + "keypress", + "wait", + ]); + }); + + it("rejects camelCase actions and fields", () => { + expect(() => computerSchema.parse({ action: "doubleClick", x: 1, y: 2 })).toThrow(); + expect(() => computerSchema.parse({ action: "drag", x: 1, y: 2, toX: 3, toY: 4 })).toThrow(); + expect(() => computerSchema.parse({ action: "scroll", x: 1, y: 2, scrollX: 0, scrollY: 1 })).toThrow(); + expect(() => computerSchema.parse({ action: "screenshot", includeScreenshot: true })).toThrow(); + }); +}); + +describe("computer tool gating", () => { + afterEach(() => { + setComputerControllerFactoryForTests(undefined); + setComputerPlatformForTests(undefined); + }); + + it("is metadata-only by default and not callable/discoverable", async () => { + const session = 
createSession(Settings.isolated({ "tools.discoveryMode": "all" })); + const tools = await createTools(session); + const names = tools.map(t => t.name); + expect(names).not.toContain("computer"); + const catalogEntry = BUILTIN_CAPABILITY_CATALOG.find(entry => entry.name === "computer"); + if (isComputerLoadablePlatform()) { + expect(catalogEntry).toMatchObject({ callableBuiltin: false, defaultEnabled: false }); + } else { + expect(catalogEntry).toBeUndefined(); + } + const discoverable = tools.filter(t => t.loadMode === "discoverable").map(t => t.name); + expect(discoverable).not.toContain("computer"); + }); + + it("is callable with per-session enable or alwaysOn on macOS", async () => { + setComputerPlatformForTests("darwin"); + const enabledNames = (await createTools(createSession(Settings.isolated({ "computer.enabled": true })))).map( + t => t.name, + ); + const alwaysOnNames = (await createTools(createSession(Settings.isolated({ "computer.alwaysOn": true })))).map( + t => t.name, + ); + expect(enabledNames).toContain("computer"); + expect(alwaysOnNames).toContain("computer"); + }); + + it("is absent on non-macOS even when settings enable it", () => { + expect(isComputerCallable(createSession(Settings.isolated({ "computer.enabled": true })), "linux")).toBe(false); + }); + + it("is loadable on macOS and Linux but not loaded at all on Windows", () => { + expect(isComputerLoadablePlatform("darwin")).toBe(true); + expect(isComputerLoadablePlatform("linux")).toBe(true); + expect(isComputerLoadablePlatform("win32")).toBe(false); + }); + + it("returns COMPUTER_DISABLED without constructing native controller when directly invoked while disabled", async () => { + let constructed = false; + setComputerControllerFactoryForTests(() => { + constructed = true; + return {}; + }); + const tool = new ComputerTool(createSession()); + const result = await tool.execute("call", { action: "screenshot" }); + expect(result.isError).toBe(true); + 
expect(result.details?.code).toBe("COMPUTER_DISABLED"); + expect(textOf(result)).toContain("COMPUTER_DISABLED"); + expect(constructed).toBe(false); + }); +}); + +describe("computer tool dispatch", () => { + afterEach(() => { + setComputerControllerFactoryForTests(undefined); + setComputerPlatformForTests(undefined); + }); + + it("maps snake_case model actions to native controller methods and forwards AbortSignal", async () => { + setComputerPlatformForTests("darwin"); + const calls: Array<{ method: string; payload: unknown; signal?: AbortSignal }> = []; + setComputerControllerFactoryForTests(() => ({ + screenshot: (payload, options) => { + calls.push({ method: "screenshot", payload, signal: options?.signal }); + return { widthPx: 20, heightPx: 10, png: new Uint8Array([1, 2, 3]), captureId: "cap-1" }; + }, + doubleClick: (payload, options) => calls.push({ method: "doubleClick", payload, signal: options?.signal }), + drag: (payload, options) => calls.push({ method: "drag", payload, signal: options?.signal }), + scroll: (payload, options) => calls.push({ method: "scroll", payload, signal: options?.signal }), + })); + const tool = new ComputerTool(createSession(Settings.isolated({ "computer.enabled": true }))); + const controller = new AbortController(); + const shot = await tool.execute("shot", { action: "screenshot", timeout: 2 }, controller.signal); + await tool.execute("dbl", { action: "double_click", x: 1, y: 2, button: "right" }, controller.signal); + await tool.execute("drag", { action: "drag", x: 1, y: 2, to_x: 3, to_y: 4 }, controller.signal); + await tool.execute("scroll", { action: "scroll", x: 1, y: 2, scroll_x: 5, scroll_y: -6 }, controller.signal); + + expect(shot.details?.screenshot).toMatchObject({ widthPx: 20, heightPx: 10, pngBytes: 3, captureId: "cap-1" }); + expect(calls.map(call => call.method)).toEqual(["screenshot", "doubleClick", "drag", "scroll"]); + expect(calls[1].payload).toMatchObject({ x: 1, y: 2, button: "right" }); + 
expect(calls[2].payload).toMatchObject({ x: 1, y: 2, toX: 3, toY: 4, button: "left" }); + expect(calls[3].payload).toMatchObject({ x: 1, y: 2, scrollX: 5, scrollY: -6 }); + expect(calls.every(call => call.signal instanceof AbortSignal)).toBe(true); + }); + + it("maps native COMPUTER_* errors into bounded tool errors", async () => { + setComputerPlatformForTests("darwin"); + setComputerControllerFactoryForTests(() => ({ + click: () => { + const error = new Error("supervisor is not live") as Error & { code: string }; + error.code = "COMPUTER_SUPERVISOR_NOT_LIVE"; + throw error; + }, + })); + const tool = new ComputerTool(createSession(Settings.isolated({ "computer.enabled": true }))); + const result = await tool.execute("click", { action: "click", x: 1, y: 2 }); + expect(result.isError).toBe(true); + expect(result.details?.code).toBe("COMPUTER_SUPERVISOR_NOT_LIVE"); + expect(textOf(result)).toContain("supervisor is not live"); + }); +}); + +describe("computer renderer", () => { + it("renders bounded output without raw screenshot data", () => { + const renderer = toolRenderers.computer; + expect(renderer).toBeDefined(); + const fakeTheme = { + fg: (_name: string, text: string) => text, + format: { bracketLeft: "[", bracketRight: "]" }, + styledSymbol: () => "!", + sep: { dot: " · " }, + } as never; + const output = summarizeComputerDetails( + { + action: "screenshot", + status: "success", + screenshot: { widthPx: 640, heightPx: 480, pngBytes: 1234, captureId: "cap-1" }, + }, + false, + fakeTheme, + ); + expect(output).toContain("640x480"); + expect(output).toContain("1234 bytes"); + expect(output).not.toContain("iVBOR"); + }); +}); diff --git a/packages/natives/native/index.d.ts b/packages/natives/native/index.d.ts index 9733d3c43..b89f03912 100644 --- a/packages/natives/native/index.d.ts +++ b/packages/natives/native/index.d.ts @@ -1,5 +1,18 @@ /* auto-generated by NAPI-RS */ /* eslint-disable */ +export declare class ComputerController { + constructor() + 
screenshot(): ComputerScreenshot + click(expectedEpoch: number | undefined | null, x: number, y: number, button?: string | undefined | null): void + doubleClick(expectedEpoch: number | undefined | null, x: number, y: number, button?: string | undefined | null): void + move(expectedEpoch: number | undefined | null, x: number, y: number): void + drag(expectedEpoch: number | undefined | null, x: number, y: number, toX: number, toY: number, button?: string | undefined | null): void + scroll(expectedEpoch: number | undefined | null, x: number, y: number, scrollX: number, scrollY: number): void + type(expectedEpoch: number | undefined | null, text: string): void + keypress(expectedEpoch: number | undefined | null, keys: Array): void + wait(expectedEpoch: number | undefined | null, ms: number): void +} + /** * Long-lived macOS appearance observer. * @@ -357,6 +370,46 @@ export interface ClipboardImage { mimeType: string } +/** + * Capture the primary display for JS callers (macOS). + * + * Requires the Screen Recording permission. This is the read-only `screenshot` + * primitive of the computer-use tool; input primitives land behind the same + * surface once the Accessibility gate is satisfied in a granted `gjc` process. + * + * # Errors + * Returns an error when capture fails (e.g. Screen Recording not granted). + */ +export declare function computerScreenshot(): ComputerScreenshot + +/** + * A captured primary-display screenshot returned to JS. + * + * `width_px`/`height_px` are the physical pixels that define the action + * coordinate space (see the coordinate contract); the scale/origin map them to + * macOS logical points. + */ +export interface ComputerScreenshot { + /** PNG-encoded image bytes. */ + png: Uint8Array + /** Screenshot width in physical pixels. */ + widthPx: number + /** Screenshot height in physical pixels. */ + heightPx: number + /** Physical-pixels-per-logical-point along X. */ + scaleX: number + /** Physical-pixels-per-logical-point along Y. 
*/ + scaleY: number + /** Logical origin X of the display (points). */ + originX: number + /** Logical origin Y of the display (points). */ + originY: number + /** Stable hash of the display geometry used for stale-display checks. */ + displayEpoch: number + /** Process-local opaque capture id. */ + captureId: number +} + /** A context line (before or after a match). */ export interface ContextLine { /** 1-indexed line number in the source file. */ diff --git a/packages/natives/native/index.js b/packages/natives/native/index.js index 7bba4aa90..702311cfc 100644 --- a/packages/natives/native/index.js +++ b/packages/natives/native/index.js @@ -17,6 +17,7 @@ const nativeBindings = loadNative(); nativeBindings.initNativeCrashDiagnostics?.(); // --- generated native exports (do not edit) --- // classes +export const ComputerController = nativeBindings.ComputerController; export const MacAppearanceObserver = nativeBindings.MacAppearanceObserver; export const MacOSPowerAssertion = nativeBindings.MacOSPowerAssertion; export const Process = nativeBindings.Process; @@ -28,6 +29,7 @@ export const __piNativesV0_5_2 = nativeBindings.__piNativesV0_5_2; export const applyBashFixups = nativeBindings.applyBashFixups; export const astEdit = nativeBindings.astEdit; export const astGrep = nativeBindings.astGrep; +export const computerScreenshot = nativeBindings.computerScreenshot; export const copyToClipboard = nativeBindings.copyToClipboard; export const countTokens = nativeBindings.countTokens; export const detectMacOSAppearance = nativeBindings.detectMacOSAppearance; diff --git a/packages/natives/test/computer.test.ts b/packages/natives/test/computer.test.ts new file mode 100644 index 000000000..ce3a52291 --- /dev/null +++ b/packages/natives/test/computer.test.ts @@ -0,0 +1,72 @@ +import { describe, expect, it } from "bun:test"; + +const isMacOS = process.platform === "darwin"; + +type NativeComputerModule = { + ComputerController: new () => Record; + computerScreenshot: () => { + 
widthPx: number; + heightPx: number; + scaleX: number; + scaleY: number; + png: Uint8Array; + displayEpoch: number; + captureId: number; + }; +}; + +async function loadNativeComputerModule(): Promise { + return (await import("../native/index.js")) as unknown as NativeComputerModule; +} + +describe.if(isMacOS)("ComputerController napi binding", () => { + it("exists with expected methods", async () => { + const { ComputerController } = await loadNativeComputerModule(); + const controller = new ComputerController(); + expect(controller).toBeInstanceOf(ComputerController); + for (const method of [ + "screenshot", + "click", + "doubleClick", + "move", + "drag", + "scroll", + "type", + "keypress", + "wait", + ]) { + expect(typeof controller[method]).toBe("function"); + } + }); +}); + +// The native `computerScreenshot` binding is macOS-only and captures the real +// primary display, so it requires the Screen Recording permission. Gate on +// platform and skip gracefully when capture is unavailable in the environment. +describe.if(isMacOS)("computer screenshot napi binding", () => { + it("returns a decodable PNG whose dimensions match the descriptor", async () => { + const { computerScreenshot } = await loadNativeComputerModule(); + let shot: ReturnType; + try { + shot = computerScreenshot(); + } catch (err) { + // Screen Recording not granted to this process — surfaced, not silent. + console.warn(`skipping: computerScreenshot unavailable (${String(err)})`); + return; + } + + expect(shot.widthPx).toBeGreaterThan(0); + expect(shot.heightPx).toBeGreaterThan(0); + expect(shot.scaleX).toBeGreaterThan(0); + expect(shot.scaleY).toBeGreaterThan(0); + expect(shot.png.byteLength).toBeGreaterThan(0); + expect(shot.displayEpoch).toBeGreaterThan(0); + expect(shot.captureId).toBeGreaterThan(0); + + // PNG magic number: 89 50 4E 47 0D 0A 1A 0A. 
+ const sig = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]; + for (let i = 0; i < sig.length; i++) { + expect(shot.png[i]).toBe(sig[i]); + } + }); +});