From c285b24746ad229c7132379dc45538d1d2fa3d94 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 20:00:09 +0000 Subject: [PATCH 01/10] feat: implement working clean/smudge round-trip for a Rust subset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turn the filter skeleton into a real, git-invoked AST round-trip: - printer: parse Rust with Tree-sitter and re-emit canonical source by walking the tree. Fail-closed — syntax errors and unsupported node kinds error rather than silently corrupting code. - pktline: implement Git's long-running filter pkt-line codec. - filters: speak the real `filter-process` protocol; `clean` canonicalizes `*.rs`, `smudge` is identity, non-Rust passes through. - setup: `git-ast setup` registers the filter + .gitattributes in a repo. - examples/demo.sh: end-to-end proof that reformatting produces no diff while a real change shows a clean one. Scope stays honest: one language, a documented subset, fail-closed. Diff/merge drivers remain placeholders pending stable node identity, which this does not address. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- Cargo.lock | 90 +++++++++++++ Cargo.toml | 9 +- README.md | 37 ++++-- examples/demo.sh | 66 ++++++++++ src/filters.rs | 238 +++++++++++++++++++++++++++------- src/lib.rs | 17 ++- src/main.rs | 18 ++- src/pktline.rs | 165 ++++++++++++++++++++++++ src/printer.rs | 323 +++++++++++++++++++++++++++++++++++++++++++++++ src/setup.rs | 63 +++++++++ 10 files changed, 950 insertions(+), 76 deletions(-) create mode 100755 examples/demo.sh create mode 100644 src/pktline.rs create mode 100644 src/printer.rs create mode 100644 src/setup.rs diff --git a/Cargo.lock b/Cargo.lock index b26d987..4c536f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,96 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "cc" +version = "1.2.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e228eec9be7c17ccb640b59b36a5cd805ea2a564a4c5e162c2f659fea30d3b96" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + [[package]] name = "git-ast" version = "0.1.0" +dependencies = [ + "tree-sitter", + "tree-sitter-rust", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "tree-sitter" 
+version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df7cc499ceadd4dcdf7ec6d4cbc34ece92c3fa07821e287aedecd4416c516dca" +dependencies = [ + "cc", + "regex", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "277690f420bf90741dea984f3da038ace46c4fe6047cba57a66822226cde1c93" +dependencies = [ + "cc", + "tree-sitter", +] diff --git a/Cargo.toml b/Cargo.toml index 526c82d..9a918e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,10 @@ description = "Language-aware Git: AST-based diffs and merges instead of line-ba license = "MIT" repository = "https://github.com/bounded-systems/git-ast" -# No dependencies yet. The design uses Tree-sitter for parsing and libgit2 -# (git2) for plumbing; those are added when the corresponding logic lands, -# rather than declared while unused. +# Tree-sitter parses source into a concrete syntax tree; the `printer` module +# walks that tree and re-emits canonical source. No libgit2 needed: the +# clean/smudge filter speaks Git's pkt-line `filter-process` protocol over +# stdin/stdout, implemented in `pktline`. [dependencies] +tree-sitter = "0.22" +tree-sitter-rust = "0.21" diff --git a/README.md b/README.md index 61f2cf6..90be2b8 100644 --- a/README.md +++ b/README.md @@ -48,18 +48,31 @@ For a full documentation overview, see [Documentation Index](./docs/README.md). ## Project Status -**Design stage — not yet a working tool.** This repository is primarily a design -worked out in [`docs/`](./docs/README.md), plus a small Rust skeleton that -compiles and exposes the subcommand surface (`git-ast filter-process | -diff-driver | merge-driver`). The filter, diff, and merge logic are -**placeholders**: parsing, serialization, and pretty-printing are not implemented -yet. 
The hardest open problem — stable AST node identity across versions, which -structural diff/merge and refactor-aware history depend on — is described in -[`docs/planning/scope.md`](./docs/planning/scope.md) and explicitly out of scope -for the initial MVP. - -If you are evaluating this repo: the value here is the architecture and the -problem framing, not a runnable extension. +**Working clean/smudge round-trip for a Rust subset.** The core pipeline is +implemented and runs through real Git: + +- `git-ast setup` registers the filter in a repository. +- On `git add`, the `clean` filter parses Rust with Tree-sitter and stores its + **canonical** form; on `git checkout`, `smudge` returns it. Reformatting + therefore never reaches history — two differently-formatted inputs that parse + to the same tree produce byte-identical blobs. +- It speaks Git's real `filter-process` pkt-line protocol, so `git add` / + `git checkout` / `git diff` all work end to end. See + [`examples/demo.sh`](./examples/demo.sh). + +Honest boundaries: + +- **One language, a subset of it.** The pretty-printer covers the constructs in + the example (functions, params, blocks, `let`, binary/call/macro expressions, + literals, comments). It is **fail-closed**: syntax errors reject the commit, + and any unsupported construct returns an error rather than corrupting code. + Widening coverage is additive — one more arm per node kind. +- **Diff and merge drivers are still placeholders.** Making those *structural* + depends on the hardest open problem — **stable AST node identity across + versions** — which this does **not** solve. Canonical formatting removes + formatting churn from history; it does not yet track a node through a move or + rename. That problem is described in + [`docs/planning/scope.md`](./docs/planning/scope.md) and remains out of scope. 
## License diff --git a/examples/demo.sh b/examples/demo.sh new file mode 100755 index 0000000..3dd3196 --- /dev/null +++ b/examples/demo.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# End-to-end demo of the git-ast clean/smudge round-trip. +# +# Builds the binary, creates a throwaway git repo with the filter installed, +# and shows three things: +# 1. messy Rust is stored canonically (clean), +# 2. a pure reformat produces no diff (formatting never enters history), +# 3. a real logic change still shows a clean, minimal diff. +# +# Usage: examples/demo.sh +set -euo pipefail + +repo_root="$(cd "$(dirname "$0")/.." && pwd)" +cd "$repo_root" + +echo "==> building git-ast (release)" +cargo build --release --quiet +bin="$repo_root/target/release/git-ast" + +work="$(mktemp -d)" +trap 'rm -rf "$work"' EXIT +cd "$work" +git init -q +"$bin" setup >/dev/null + +echo +echo "==> writing deliberately messy Rust" +cat > calc.rs <<'EOF' +fn add(a:i32,b:i32)->i32{ +// Simple addition + a+b} +fn main(){let x=5;let y =10; + let sum=add(x,y);println!("Sum: {}",sum);} +EOF +cat calc.rs + +echo +echo "==> git add, then show the STORED blob (clean filter output)" +git add calc.rs +git cat-file -p :calc.rs +git commit -qm "add calc.rs" + +echo +echo "==> reformat the file wildly, then check git diff" +cat > calc.rs <<'EOF' +fn add(a: i32, b: i32) -> i32 { // Simple addition + a+b } + + +fn main() { + let x = 5; + let y = 10; + let sum = add( x , y ); + println!( "Sum: {}" , sum ); +} +EOF +if git diff --quiet; then + echo "(no diff — formatting churn never entered history)" +else + echo "UNEXPECTED: reformatting produced a diff"; git diff; exit 1 +fi + +echo +echo "==> make a real change (a + b -> a - b) and show the diff" +sed -i 's/a+b/a-b/' calc.rs +git --no-pager diff diff --git a/src/filters.rs b/src/filters.rs index 5dc7a7a..a280d13 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,68 +1,210 @@ -//! Clean/smudge filter. +//! 
Clean/smudge filter over Git's long-running `filter-process` protocol. //! -//! `clean` turns source text into a serialized tree on `git add`; `smudge` turns -//! it back into source on checkout. Both are placeholders: `clean` prefixes the -//! content with a marker and `smudge` strips it. A real implementation would -//! parse with Tree-sitter and pretty-print deterministically. - -use crate::Error; -use std::io::Read; - -/// Marker that the placeholder clean/smudge round-trip uses to stand in for a -/// serialized tree. -const SERIALIZED_MARKER: &[u8] = b"SERIALIZED:"; - -/// Run the long-running filter process. -/// -/// Placeholder: reads stdin to EOF and reports the byte count instead of -/// speaking Git's pkt-line filter protocol. +//! `clean` (on `git add`) parses Rust source and stores its canonical form, so +//! reformatting never reaches history. `smudge` (on `git checkout`) is identity: +//! the stored bytes are already canonical source. Only `*.rs` paths are +//! transformed; anything else passes through untouched. +//! +//! The conversation is the standard one documented in +//! `Documentation/gitattributes.txt`: +//! +//! 1. Handshake — exchange `git-filter-client`/`git-filter-server` + `version=2`. +//! 2. Capabilities — we advertise `clean` and `smudge`. +//! 3. Per blob — read `command`/`pathname` metadata then content, reply with a +//! status line and the transformed content. + +use crate::pktline::{self, Packet}; +use crate::{printer, Error}; +use std::collections::HashMap; +use std::io::{self, Read, Write}; +use std::path::Path; + +/// Run the long-running filter process against stdin/stdout. 
pub fn run_long_running_filter() -> Result<(), Error> { - eprintln!("[filter] long-running filter process (placeholder)"); - let mut buffer = Vec::new(); - std::io::stdin().read_to_end(&mut buffer)?; - eprintln!("[filter] read {} bytes (no-op)", buffer.len()); + let stdin = io::stdin(); + let stdout = io::stdout(); + let mut input = stdin.lock(); + let mut output = stdout.lock(); + converse(&mut input, &mut output) +} + +/// Drive the whole protocol over arbitrary streams (so it is testable without a +/// real Git process). +pub fn converse(input: &mut impl Read, output: &mut impl Write) -> Result<(), Error> { + handshake(input, output)?; + capabilities(input, output)?; + + // Process blobs until Git closes the pipe. + while process_one(input, output)? {} Ok(()) } -/// `clean`: source text -> serialized tree. -/// -/// Placeholder: prefixes the content with [`SERIALIZED_MARKER`]. -#[allow(dead_code)] -fn perform_clean(input: &[u8], pathname: &str) -> Result, Error> { - eprintln!("[filter] clean {pathname}"); - let mut out = SERIALIZED_MARKER.to_vec(); - out.extend_from_slice(input); - Ok(out) +fn handshake(input: &mut impl Read, output: &mut impl Write) -> Result<(), Error> { + let intro = pktline::read_until_flush(input)? + .ok_or_else(|| protocol("client closed during handshake"))?; + let intro = String::from_utf8_lossy(&intro); + if !intro.contains("git-filter-client") { + return Err(protocol("missing git-filter-client welcome")); + } + if !intro.contains("version=2") { + return Err(protocol("client did not offer version=2")); + } + pktline::write_text_packet(output, "git-filter-server")?; + pktline::write_text_packet(output, "version=2")?; + pktline::write_flush(output)?; + output.flush()?; + Ok(()) +} + +fn capabilities(input: &mut impl Read, output: &mut impl Write) -> Result<(), Error> { + // Read (and ignore the specifics of) the client's advertised capabilities. + pktline::read_until_flush(input)? 
+ .ok_or_else(|| protocol("client closed during capabilities"))?; + pktline::write_text_packet(output, "capability=clean")?; + pktline::write_text_packet(output, "capability=smudge")?; + pktline::write_flush(output)?; + output.flush()?; + Ok(()) } -/// `smudge`: serialized tree -> source text. -/// -/// Placeholder: strips [`SERIALIZED_MARKER`] if present, otherwise passes through. -#[allow(dead_code)] -fn perform_smudge(input: &[u8], pathname: &str) -> Result, Error> { - eprintln!("[filter] smudge {pathname}"); - match input.strip_prefix(SERIALIZED_MARKER) { - Some(rest) => Ok(rest.to_vec()), - None => Ok(input.to_vec()), +/// Handle a single blob request. Returns `Ok(false)` when the client has closed +/// the stream (no more work), `Ok(true)` after a blob was processed. +fn process_one(input: &mut impl Read, output: &mut impl Write) -> Result { + // Metadata section: key=value lines terminated by a flush. EOF here is the + // normal shutdown signal. + let meta = match read_meta(input)? { + Some(meta) => meta, + None => return Ok(false), + }; + let command = meta.get("command").map(String::as_str).unwrap_or_default(); + let pathname = meta.get("pathname").cloned().unwrap_or_default(); + + // Content section. + let content = + pktline::read_until_flush(input)?.ok_or_else(|| protocol("client closed mid-content"))?; + + match transform(command, &pathname, &content) { + Ok(out) => { + pktline::write_text_packet(output, "status=success")?; + pktline::write_flush(output)?; + pktline::write_content(output, &out)?; + // Trailing empty status list: leaves status=success in effect. + pktline::write_flush(output)?; + } + Err(e) => { + // Report the blob as failed; Git aborts the add/checkout for it. + eprintln!("git-ast: {pathname}: {e}"); + pktline::write_text_packet(output, "status=error")?; + pktline::write_flush(output)?; + } } + output.flush()?; + Ok(true) +} + +/// Apply the requested transform. `clean` canonicalizes Rust; `smudge` is +/// identity. 
Non-Rust paths pass through unchanged in both directions. +fn transform(command: &str, pathname: &str, content: &[u8]) -> Result, Error> { + let is_rust = Path::new(pathname).extension().is_some_and(|e| e == "rs"); + match command { + "clean" if is_rust => printer::canonicalize(content), + "smudge" | "clean" => Ok(content.to_vec()), + other => Err(Error::Driver(format!("unknown filter command `{other}`"))), + } +} + +/// Read the metadata section into key/value pairs. Returns `None` at clean EOF. +fn read_meta(input: &mut impl Read) -> Result>, Error> { + let mut map = HashMap::new(); + let mut saw_any = false; + loop { + match pktline::read_packet(input)? { + None if !saw_any => return Ok(None), + None => return Err(protocol("EOF in metadata section")), + Some(Packet::Flush) => return Ok(Some(map)), + Some(Packet::Data(d)) => { + saw_any = true; + let line = String::from_utf8_lossy(&d); + let line = line.trim_end_matches('\n'); + if let Some((k, v)) = line.split_once('=') { + map.insert(k.to_string(), v.to_string()); + } + } + } + } +} + +fn protocol(msg: &str) -> Error { + Error::Driver(format!("filter protocol: {msg}")) } #[cfg(test)] mod tests { use super::*; + use crate::pktline::write_text_packet; + + /// Build a client-side request stream: handshake, capabilities, then one + /// blob with the given command/path/content. 
+ fn client_stream(command: &str, pathname: &str, content: &[u8]) -> Vec { + let mut w = Vec::new(); + write_text_packet(&mut w, "git-filter-client").unwrap(); + write_text_packet(&mut w, "version=2").unwrap(); + pktline::write_flush(&mut w).unwrap(); + write_text_packet(&mut w, "capability=clean").unwrap(); + write_text_packet(&mut w, "capability=smudge").unwrap(); + pktline::write_flush(&mut w).unwrap(); + write_text_packet(&mut w, &format!("command={command}")).unwrap(); + write_text_packet(&mut w, &format!("pathname={pathname}")).unwrap(); + pktline::write_flush(&mut w).unwrap(); + pktline::write_content(&mut w, content).unwrap(); + w + } + + /// Pull the content of the (single) blob response back out of the server's + /// reply stream, skipping the handshake/capability/status sections. + fn response_content(reply: &[u8]) -> Vec { + let mut r = reply; + pktline::read_until_flush(&mut r).unwrap(); // server handshake + pktline::read_until_flush(&mut r).unwrap(); // server capabilities + pktline::read_until_flush(&mut r).unwrap(); // status list + pktline::read_until_flush(&mut r).unwrap().unwrap() // content + } + + #[test] + fn clean_canonicalizes_rust() { + let req = client_stream("clean", "a.rs", b"fn f()->i32{1+2}"); + let mut out = Vec::new(); + converse(&mut &req[..], &mut out).unwrap(); + assert_eq!(response_content(&out), b"fn f() -> i32 {\n 1 + 2\n}\n"); + } + + #[test] + fn smudge_is_identity() { + let canonical = b"fn f() -> i32 {\n 1 + 2\n}\n"; + let req = client_stream("smudge", "a.rs", canonical); + let mut out = Vec::new(); + converse(&mut &req[..], &mut out).unwrap(); + assert_eq!(response_content(&out), canonical); + } #[test] - fn clean_then_smudge_round_trips() { - let src = b"fn main() {}\n"; - let cleaned = perform_clean(src, "a.rs").unwrap(); - assert!(cleaned.starts_with(SERIALIZED_MARKER)); - let smudged = perform_smudge(&cleaned, "a.rs").unwrap(); - assert_eq!(smudged, src); + fn non_rust_passes_through_clean() { + let req = 
client_stream("clean", "notes.txt", b" unchanged "); + let mut out = Vec::new(); + converse(&mut &req[..], &mut out).unwrap(); + assert_eq!(response_content(&out), b" unchanged "); } #[test] - fn smudge_passes_through_unmarked_content() { - let raw = b"plain text"; - assert_eq!(perform_smudge(raw, "a.rs").unwrap(), raw); + fn clean_reports_error_on_unparseable_rust() { + let req = client_stream("clean", "bad.rs", b"fn main( {"); + let mut out = Vec::new(); + converse(&mut &req[..], &mut out).unwrap(); + // Status section should carry status=error and no content follows. + let mut r = &out[..]; + pktline::read_until_flush(&mut r).unwrap(); // handshake + pktline::read_until_flush(&mut r).unwrap(); // capabilities + let status = pktline::read_until_flush(&mut r).unwrap().unwrap(); + assert_eq!(String::from_utf8_lossy(&status).trim_end(), "status=error"); } } diff --git a/src/lib.rs b/src/lib.rs index e4d8393..ecd4a10 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,12 +7,14 @@ //! //! ## Status //! -//! **Design-stage skeleton.** This crate currently exposes the *shape* of the -//! integration — a subcommand entry point plus placeholder filter/diff/merge -//! logic — so the wiring compiles and can be exercised end to end. The actual -//! parsing, serialization and pretty-printing are not implemented yet; the -//! relevant functions are clearly marked as placeholders. See the `docs/` -//! directory for the design and roadmap. +//! **Working clean/smudge round-trip for a Rust subset.** The `clean` filter +//! parses Rust with Tree-sitter and re-emits canonical source ([`printer`]), +//! driven over Git's real `filter-process` pkt-line protocol ([`pktline`], +//! [`filters`]) — so `git add`/`git checkout` normalize formatting end to end. +//! The printer covers a documented subset and is fail-closed outside it. The +//! diff and merge drivers ([`drivers`]) remain placeholders: making those +//! structural depends on stable node identity, which is out of scope (see +//! 
`docs/planning/scope.md`). //! //! ## Integration points //! @@ -27,6 +29,9 @@ pub mod config; pub mod drivers; pub mod filters; +pub mod pktline; +pub mod printer; +pub mod setup; use std::fmt; diff --git a/src/main.rs b/src/main.rs index a6a4a12..a01657a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,12 @@ //! `git-ast` command-line entry point. //! -//! Dispatches the Git integration subcommands. The subcommands themselves are -//! placeholders — see the crate docs and `docs/` for the design. +//! Dispatches the Git integration subcommands. `setup` and `filter-process` +//! implement the working clean/smudge round-trip; `diff-driver`/`merge-driver` +//! remain placeholders (they await stable node identity — see `docs/`). use std::process::ExitCode; -use git_ast::{drivers, filters}; +use git_ast::{drivers, filters, setup}; fn main() -> ExitCode { let args: Vec = std::env::args().skip(1).collect(); @@ -15,6 +16,7 @@ fn main() -> ExitCode { }; let result = match cmd.as_str() { + "setup" => setup::run().map(|()| 0u8), "filter-process" => filters::run_long_running_filter().map(|()| 0u8), "diff-driver" => drivers::run_diff_driver(rest).map(|()| 0u8), "merge-driver" => drivers::run_merge_driver(rest).map(|()| 0u8), @@ -44,19 +46,21 @@ fn main() -> ExitCode { fn print_help() { eprintln!( - "git-ast — language-aware Git (design-stage skeleton)\n\ + "git-ast — language-aware Git\n\ \n\ USAGE:\n \ git-ast \n\ \n\ SUBCOMMANDS:\n \ - filter-process Clean/smudge long-running filter (placeholder)\n \ + setup Enable the *.rs clean/smudge filter in this repo\n \ + filter-process Clean/smudge long-running filter (canonicalizes Rust)\n \ diff-driver Git diff driver (placeholder)\n \ merge-driver Git merge driver (placeholder)\n \ --version, -V Print version\n \ --help, -h Print this help\n\ \n\ - NOTE: subcommands are placeholders; parsing/printing are not implemented yet.\n\ - See docs/ for the design." 
+ The clean/smudge round-trip works for a documented Rust subset and is\n\ + fail-closed outside it. Structural diff/merge await stable node identity;\n\ + see docs/ for the design and scope." ); } diff --git a/src/pktline.rs b/src/pktline.rs new file mode 100644 index 0000000..3f0c33d --- /dev/null +++ b/src/pktline.rs @@ -0,0 +1,165 @@ +//! Minimal pkt-line codec for Git's long-running filter protocol. +//! +//! Git frames the `filter-process` conversation in *pkt-lines*: a 4-byte +//! lowercase-hex length prefix (counting the 4 prefix bytes themselves) +//! followed by that many bytes of payload. The special length `0000` is a +//! *flush* packet, used as a delimiter. See `gitprotocol-common` and +//! `Documentation/gitattributes.txt` ("Long Running Filter Process"). +//! +//! This module is transport-only: it reads and writes frames and knows nothing +//! about clean/smudge. [`crate::filters`] drives the conversation. + +use std::io::{self, Read, Write}; + +/// Largest payload Git accepts in a single pkt-line (65520 total minus the +/// 4-byte length prefix). +pub const MAX_PAYLOAD: usize = 65516; + +/// One frame read from the wire. +#[derive(Debug, PartialEq, Eq)] +pub enum Packet { + /// A data packet and its payload (newline included, if any). + Data(Vec), + /// A flush packet (`0000`) — the delimiter between sections. + Flush, +} + +/// Read a single pkt-line frame. +/// +/// Returns `Ok(None)` only at clean end-of-stream (Git closed the pipe), which +/// the caller treats as "shut down". +pub fn read_packet(reader: &mut impl Read) -> io::Result> { + let mut len_buf = [0u8; 4]; + if !read_exact_or_eof(reader, &mut len_buf)? 
{ + return Ok(None); + } + let len = parse_hex4(&len_buf)?; + if len == 0 { + return Ok(Some(Packet::Flush)); + } + if len < 4 { + return Err(invalid(format!("pkt-line length {len} < 4"))); + } + let mut payload = vec![0u8; len - 4]; + reader.read_exact(&mut payload)?; + Ok(Some(Packet::Data(payload))) +} + +/// Read a section of data packets up to the next flush, concatenating payloads. +/// +/// Returns `Ok(None)` at end-of-stream before any packet. +pub fn read_until_flush(reader: &mut impl Read) -> io::Result>> { + let mut buf = Vec::new(); + let mut saw_any = false; + loop { + match read_packet(reader)? { + None if !saw_any => return Ok(None), + None => return Err(invalid("unexpected EOF before flush")), + Some(Packet::Flush) => return Ok(Some(buf)), + Some(Packet::Data(d)) => { + saw_any = true; + buf.extend_from_slice(&d); + } + } + } +} + +/// Write a single data packet. `payload` must be <= [`MAX_PAYLOAD`]. +pub fn write_packet(writer: &mut impl Write, payload: &[u8]) -> io::Result<()> { + debug_assert!(payload.len() <= MAX_PAYLOAD); + let len = payload.len() + 4; + write!(writer, "{len:04x}")?; + writer.write_all(payload)?; + Ok(()) +} + +/// Write a `key=value` text packet (a trailing newline is appended, as Git +/// expects for metadata lines). +pub fn write_text_packet(writer: &mut impl Write, line: &str) -> io::Result<()> { + let mut payload = line.as_bytes().to_vec(); + payload.push(b'\n'); + write_packet(writer, &payload) +} + +/// Write a flush packet (`0000`). +pub fn write_flush(writer: &mut impl Write) -> io::Result<()> { + writer.write_all(b"0000") +} + +/// Write a payload of arbitrary size as a sequence of data packets, chunked to +/// [`MAX_PAYLOAD`], followed by a flush. 
+pub fn write_content(writer: &mut impl Write, content: &[u8]) -> io::Result<()> { + for chunk in content.chunks(MAX_PAYLOAD).filter(|c| !c.is_empty()) { + write_packet(writer, chunk)?; + } + write_flush(writer) +} + +fn parse_hex4(buf: &[u8; 4]) -> io::Result { + let s = std::str::from_utf8(buf).map_err(|_| invalid("non-ascii pkt-line length"))?; + usize::from_str_radix(s, 16).map_err(|_| invalid(format!("bad pkt-line length {s:?}"))) +} + +/// Read exactly `buf.len()` bytes, or report clean EOF if nothing was read. +fn read_exact_or_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result { + let mut filled = 0; + while filled < buf.len() { + match reader.read(&mut buf[filled..])? { + 0 if filled == 0 => return Ok(false), + 0 => return Err(invalid("EOF in middle of pkt-line length")), + n => filled += n, + } + } + Ok(true) +} + +fn invalid(msg: impl Into) -> io::Error { + io::Error::new(io::ErrorKind::InvalidData, msg.into()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn roundtrips_a_text_packet() { + let mut buf = Vec::new(); + write_text_packet(&mut buf, "version=2").unwrap(); + assert_eq!(buf, b"000eversion=2\n"); + let pkt = read_packet(&mut &buf[..]).unwrap().unwrap(); + assert_eq!(pkt, Packet::Data(b"version=2\n".to_vec())); + } + + #[test] + fn flush_roundtrips() { + let mut buf = Vec::new(); + write_flush(&mut buf).unwrap(); + assert_eq!(buf, b"0000"); + assert_eq!(read_packet(&mut &buf[..]).unwrap().unwrap(), Packet::Flush); + } + + #[test] + fn reads_a_section_until_flush() { + let mut wire = Vec::new(); + write_packet(&mut wire, b"hello ").unwrap(); + write_packet(&mut wire, b"world").unwrap(); + write_flush(&mut wire).unwrap(); + let got = read_until_flush(&mut &wire[..]).unwrap().unwrap(); + assert_eq!(got, b"hello world"); + } + + #[test] + fn chunks_large_content() { + let big = vec![b'x'; MAX_PAYLOAD * 2 + 7]; + let mut wire = Vec::new(); + write_content(&mut wire, &big).unwrap(); + let got = read_until_flush(&mut 
&wire[..]).unwrap().unwrap(); + assert_eq!(got, big); + } + + #[test] + fn eof_before_any_packet_is_none() { + let empty: &[u8] = b""; + assert!(read_packet(&mut &empty[..]).unwrap().is_none()); + } +} diff --git a/src/printer.rs b/src/printer.rs new file mode 100644 index 0000000..feb38de --- /dev/null +++ b/src/printer.rs @@ -0,0 +1,323 @@ +//! AST-native canonical printer. +//! +//! [`canonicalize`] parses Rust source with Tree-sitter and re-emits it in a +//! single canonical style by walking the parse tree. This is what the `clean` +//! filter stores, so reformatting never reaches history: two differently +//! formatted inputs that parse to the same tree produce byte-identical output. +//! +//! ## Scope +//! +//! This printer covers a documented *subset* of Rust — enough to round-trip the +//! kinds of items in `examples/rust_simple_addition/` (functions, parameters, +//! blocks, `let` bindings, binary/call/macro expressions, literals, and line and +//! block comments). It is deliberately **fail-closed**: a syntax error or any +//! node kind the printer does not understand returns an [`Error`] rather than +//! guessing, so the filter can never silently corrupt code it cannot represent. +//! Widening the subset is additive — each new node kind is one more arm in +//! [`Printer::expr`] / [`Printer::stmt`]. + +use crate::Error; +use tree_sitter::{Node, Parser}; + +const INDENT: &str = " "; + +/// Parse `source` as Rust and return its canonical form. +/// +/// Returns [`Error::Parsing`] if the source does not parse cleanly, and +/// [`Error::Generation`] if it parses but contains a construct outside the +/// supported subset. 
+pub fn canonicalize(source: &[u8]) -> Result, Error> { + let mut parser = Parser::new(); + parser + .set_language(&tree_sitter_rust::language()) + .map_err(|e| Error::Parsing(format!("loading Rust grammar: {e}")))?; + let tree = parser + .parse(source, None) + .ok_or_else(|| Error::Parsing("parser returned no tree".to_string()))?; + let root = tree.root_node(); + if root.has_error() { + return Err(Error::Parsing( + "source has syntax errors; fix them or bypass the filter".to_string(), + )); + } + + let mut printer = Printer { + src: source, + out: String::new(), + }; + printer.source_file(root)?; + Ok(printer.out.into_bytes()) +} + +struct Printer<'a> { + src: &'a [u8], + out: String, +} + +impl<'a> Printer<'a> { + /// Raw source text of a node. + fn text(&self, node: Node) -> Result<&'a str, Error> { + node.utf8_text(self.src) + .map_err(|e| Error::Generation(format!("non-utf8 token: {e}"))) + } + + /// A required named field, or a fail-closed error naming what was missing. + fn field<'n>(&self, node: Node<'n>, name: &str) -> Result, Error> { + node.child_by_field_name(name).ok_or_else(|| { + Error::Generation(format!("`{}` node is missing field `{name}`", node.kind())) + }) + } + + fn unsupported(&self, node: Node, context: &str) -> Error { + Error::Generation(format!( + "unsupported {context}: `{}` (offset {})", + node.kind(), + node.start_byte() + )) + } + + /// Top level: emit each item, one blank line between items. + fn source_file(&mut self, root: Node) -> Result<(), Error> { + let mut cursor = root.walk(); + for (i, item) in root.named_children(&mut cursor).enumerate() { + if i > 0 { + self.out.push('\n'); + } + self.item(item, 0)?; + } + Ok(()) + } + + /// An item is anything that can appear at the top level or inside a block as + /// a statement-like line. Returns canonical text terminated by a newline. 
+ fn item(&mut self, node: Node, depth: usize) -> Result<(), Error> { + match node.kind() { + "function_item" => self.function(node, depth), + "line_comment" | "block_comment" => { + self.indent(depth); + self.out.push_str(self.text(node)?); + self.out.push('\n'); + Ok(()) + } + _ => Err(self.unsupported(node, "top-level item")), + } + } + + fn function(&mut self, node: Node, depth: usize) -> Result<(), Error> { + let name = self.field(node, "name")?; + let params = self.field(node, "parameters")?; + let body = self.field(node, "body")?; + + self.indent(depth); + self.out.push_str("fn "); + self.out.push_str(self.text(name)?); + self.parameters(params)?; + if let Some(ret) = node.child_by_field_name("return_type") { + self.out.push_str(" -> "); + self.out.push_str(&self.expr(ret)?); + } + self.out.push(' '); + self.block(body, depth) + } + + fn parameters(&mut self, node: Node) -> Result<(), Error> { + self.out.push('('); + let mut cursor = node.walk(); + for (i, param) in node.named_children(&mut cursor).enumerate() { + if param.kind() != "parameter" { + return Err(self.unsupported(param, "parameter")); + } + if i > 0 { + self.out.push_str(", "); + } + let pattern = self.field(param, "pattern")?; + let ty = self.field(param, "type")?; + self.out.push_str(self.text(pattern)?); + self.out.push_str(": "); + self.out.push_str(&self.expr(ty)?); + } + self.out.push(')'); + Ok(()) + } + + /// A `{ ... }` block. Emits `{`, each inner statement on its own indented + /// line, then the closing `}` at the block's own depth. + fn block(&mut self, node: Node, depth: usize) -> Result<(), Error> { + self.out.push_str("{\n"); + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + self.stmt(child, depth + 1)?; + } + self.indent(depth); + self.out.push_str("}\n"); + Ok(()) + } + + /// A statement inside a block. 
Statement nodes carry their own terminator; + /// a bare expression is treated as a trailing (implicit-return) expression + /// and emitted without a semicolon. + fn stmt(&mut self, node: Node, depth: usize) -> Result<(), Error> { + match node.kind() { + "line_comment" | "block_comment" => { + self.indent(depth); + self.out.push_str(self.text(node)?); + self.out.push('\n'); + } + "let_declaration" => { + let pattern = self.field(node, "pattern")?; + self.indent(depth); + self.out.push_str("let "); + self.out.push_str(self.text(pattern)?); + if let Some(value) = node.child_by_field_name("value") { + self.out.push_str(" = "); + self.out.push_str(&self.expr(value)?); + } + self.out.push_str(";\n"); + } + "expression_statement" => { + let inner = node + .named_child(0) + .ok_or_else(|| self.unsupported(node, "empty expression statement"))?; + self.indent(depth); + self.out.push_str(&self.expr(inner)?); + self.out.push_str(";\n"); + } + // Anything else that is a valid expression is a trailing expression. + _ => { + let rendered = self.expr(node)?; + self.indent(depth); + self.out.push_str(&rendered); + self.out.push('\n'); + } + } + Ok(()) + } + + /// Render an expression (or type) to canonical text. No leading indent. + fn expr(&self, node: Node) -> Result { + match node.kind() { + "identifier" | "integer_literal" | "float_literal" | "primitive_type" + | "string_literal" | "char_literal" | "boolean_literal" | "field_identifier" + | "type_identifier" | "self" => Ok(self.text(node)?.to_string()), + "binary_expression" => { + let left = self.field(node, "left")?; + let op = self.field(node, "operator")?; + let right = self.field(node, "right")?; + Ok(format!( + "{} {} {}", + self.expr(left)?, + self.text(op)?, + self.expr(right)? 
+ )) + } + "call_expression" => { + let func = self.field(node, "function")?; + let args = self.field(node, "arguments")?; + Ok(format!("{}{}", self.expr(func)?, self.arguments(args)?)) + } + "arguments" => self.arguments(node), + "parenthesized_expression" => { + let inner = node + .named_child(0) + .ok_or_else(|| self.unsupported(node, "empty parentheses"))?; + Ok(format!("({})", self.expr(inner)?)) + } + "macro_invocation" => { + let name = self.field(node, "macro").or_else(|_| { + node.named_child(0) + .ok_or_else(|| self.unsupported(node, "macro without name")) + })?; + let tokens = node + .named_children(&mut node.walk()) + .find(|c| c.kind() == "token_tree") + .ok_or_else(|| self.unsupported(node, "macro without token tree"))?; + Ok(format!("{}!{}", self.text(name)?, self.token_tree(tokens)?)) + } + _ => Err(self.unsupported(node, "expression")), + } + } + + fn arguments(&self, node: Node) -> Result { + let mut out = String::from("("); + let mut cursor = node.walk(); + for (i, arg) in node.named_children(&mut cursor).enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&self.expr(arg)?); + } + out.push(')'); + Ok(out) + } + + /// Canonicalize a macro `token_tree`. Token trees are unstructured, so we + /// reprint conservatively: brackets verbatim, `, ` after commas, and every + /// other token rendered with no inserted spacing. This is exact for the + /// common `name!(expr, expr)` shape and stays fail-closed via [`Self::expr`] + /// for any named token it does not recognize. + fn token_tree(&self, node: Node) -> Result { + let mut out = String::new(); + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + match self.text(child)? 
{ + "(" | "[" | "{" | ")" | "]" | "}" => out.push_str(self.text(child)?), + "," => out.push_str(", "), + _ if child.is_named() => out.push_str(&self.expr(child)?), + other => out.push_str(other), + } + } + Ok(out) + } + + fn indent(&mut self, depth: usize) { + for _ in 0..depth { + self.out.push_str(INDENT); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const CANONICAL: &str = "fn add(a: i32, b: i32) -> i32 {\n \ + // Simple addition\n a + b\n}\n\n\ + fn main() {\n \ + let x = 5;\n let y = 10;\n let sum = add(x, y);\n \ + println!(\"Sum: {}\", sum);\n}\n"; + + #[test] + fn canonicalizes_the_example() { + let messy = b"fn add(a:i32,b:i32)->i32{\n// Simple addition\n a+b\n}\nfn main(){let x=5;\nlet y =10;let sum= add(x,y);println!(\"Sum: {}\",sum);}"; + let out = canonicalize(messy).unwrap(); + assert_eq!(String::from_utf8(out).unwrap(), CANONICAL); + } + + #[test] + fn is_idempotent() { + // Canonical input must come back byte-for-byte unchanged. + let once = canonicalize(CANONICAL.as_bytes()).unwrap(); + assert_eq!(once, CANONICAL.as_bytes()); + } + + #[test] + fn reformatting_produces_identical_bytes() { + // The property the whole project rests on: formatting differences vanish. + let a = canonicalize(b"fn f()->i32{1+2}").unwrap(); + let b = canonicalize(b"fn f( ) -> i32 {\n\n 1 + 2\n}\n").unwrap(); + assert_eq!(a, b); + } + + #[test] + fn rejects_syntax_errors() { + let err = canonicalize(b"fn main( { ").unwrap_err(); + assert!(matches!(err, Error::Parsing(_))); + } + + #[test] + fn fails_closed_on_unsupported_constructs() { + // `struct` is outside the documented subset: error, never silent loss. + let err = canonicalize(b"struct S { x: i32 }\n").unwrap_err(); + assert!(matches!(err, Error::Generation(_))); + } +} diff --git a/src/setup.rs b/src/setup.rs new file mode 100644 index 0000000..2fa8f01 --- /dev/null +++ b/src/setup.rs @@ -0,0 +1,63 @@ +//! One-command installation of the git-ast filter into a repository. +//! +//! 
`git-ast setup` registers the long-running filter in the current repo's git +//! config and ensures `*.rs` is routed through it in `.gitattributes`, so a user +//! can enable the canonical-formatting round-trip without memorizing the config +//! incantation. It is idempotent: re-running it changes nothing. + +use crate::Error; +use std::path::Path; +use std::process::Command; + +const ATTR_LINE: &str = "*.rs filter=git-ast"; + +/// Configure the current repository to use git-ast for `*.rs` files. +pub fn run() -> Result<(), Error> { + // The filter invokes this same binary; use its absolute path so the config + // keeps working regardless of the caller's PATH. + let exe = std::env::current_exe() + .map_err(|e| Error::Config(format!("cannot locate the git-ast binary: {e}")))?; + let exe = exe.display(); + + git_config("filter.git-ast.process", &format!("{exe} filter-process"))?; + // `required=true` makes Git fail loudly if the filter is missing rather than + // silently storing unfiltered bytes. + git_config("filter.git-ast.required", "true")?; + + ensure_attribute()?; + + eprintln!("git-ast: configured filter for *.rs in this repository."); + eprintln!("git-ast: re-add existing Rust files to canonicalize them: git add --renormalize ."); + Ok(()) +} + +fn git_config(key: &str, value: &str) -> Result<(), Error> { + let status = Command::new("git") + .args(["config", key, value]) + .status() + .map_err(|e| Error::Config(format!("running git config: {e}")))?; + if !status.success() { + return Err(Error::Config(format!( + "git config {key} failed (are you inside a git repository?)" + ))); + } + Ok(()) +} + +/// Append the `*.rs filter=git-ast` line to `.gitattributes` unless it is +/// already present. 
+fn ensure_attribute() -> Result<(), Error> { + let path = Path::new(".gitattributes"); + let existing = std::fs::read_to_string(path).unwrap_or_default(); + if existing.lines().any(|l| l.trim() == ATTR_LINE) { + return Ok(()); + } + let mut updated = existing; + if !updated.is_empty() && !updated.ends_with('\n') { + updated.push('\n'); + } + updated.push_str(ATTR_LINE); + updated.push('\n'); + std::fs::write(path, updated)?; + Ok(()) +} From 5dc02d9f52a67c7a4de61f73057453119e2132f2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 20:05:24 +0000 Subject: [PATCH 02/10] docs: explain stable node identity (computed vs stored, notes as transport) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a README section framing the hard, deferred problem precisely: node identity is heuristic not exact, computed by tree-matching rather than stored, helped by content-addressed subtree hashing, and git notes are a transport for attribution across rewrites — not the identity mechanism. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/README.md b/README.md index 90be2b8..811a848 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,40 @@ Honest boundaries: rename. That problem is described in [`docs/planning/scope.md`](./docs/planning/scope.md) and remains out of scope. +## On stable node identity (the hard part) + +"Node identity across versions" means being able to say *this* function in +commit N is the same entity as *that* one in commit N+1 — through a move, a +rename, an extract-method — so attribution follows the node, not its line +position. It is what canonical formatting alone does **not** buy you, and it is +the floor under reliable per-line attribution. 
A few things worth stating +plainly, because they are easy to get wrong: + +- **It is heuristic, not exact.** "Is this the same function after a rename + *and* a body rewrite?" has no ground truth — it is a judgment. You can get it + very good (a pure move, or a rename with an unchanged body, is near-certain); + you cannot get it provably correct. + +- **Identity is *computed*, not *stored*.** Embedding durable IDs in nodes fails + the moment a plain text editor touches the file (the IDs aren't there to + preserve). Because git-ast stores canonical *text*, identity must be derived + by matching tree N against tree N+1 (GumTree-family algorithms) at the time + you ask — not carried in the blob. + +- **Content-addressed subtree hashing is the lever.** Hash every subtree; an + unchanged-but-moved node has the *same hash* in both commits and matches for + free, with zero heuristics. Fuzzy matching is then needed only for the + subtrees that actually changed — shrinking the uncertain surface to just the + genuinely-edited nodes. + +- **`git notes` are a transport, not the mechanism.** Computing identity needs + no notes. Notes only matter for *persisting* attribution and carrying it + across history rewrites — and they do **not** survive rewrites for free: they + are keyed to commit SHAs, `rebase`/`amend`/cherry-pick copying is per-commit + and not merge-aware, and **squash collapses several commits' notes + ambiguously**. Making attribution "move and merge through every rewrite" is + the hard engineering, not a property notes hand you. + ## License This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 
From 0b5ade57d115cd490e5f76b34a60784cf6f5a093 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 01:12:57 +0000 Subject: [PATCH 03/10] docs: link to frond, the JS/TS round-trip sibling frond exercises the same parse -> regenerate -> compare primitive for JavaScript/TypeScript (SWC on Deno) that git-ast does for Rust (Tree-sitter). Cross-link them: frond validates round-trip fidelity, the prerequisite git-ast's canonical printer depends on. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 811a848..b812801 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,16 @@ plainly, because they are easy to get wrong: ambiguously**. Making attribution "move and merge through every rewrite" is the hard engineering, not a property notes hand you. +## Related projects + +- **[frond](https://github.com/bounded-systems/frond)** — the JS/TS counterpart. + It exercises the same core primitive (parse source to an AST, regenerate it, + and compare for fidelity) in the JavaScript/TypeScript ecosystem using **SWC** + on **Deno**, where git-ast uses **Tree-sitter** on Rust. frond focuses on the + round-trip *fidelity* check — proving a printer can reproduce source faithfully + — which is exactly the prerequisite git-ast's canonical printer depends on, so + the two projects validate the same idea across two toolchains. + ## License This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. From 9f1fc667038d7abb8480ba3635370ee1af52b52c Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 11:52:11 +0000 Subject: [PATCH 04/10] docs: add Unison as prior art for content-addressed node identity Unison makes identity = hash of the normalized AST a language primitive, giving rename/move stability for free. 
Note the two honest caveats: it does not dissolve identity through an edit (namespace history records the succession), and it is greenfield where git-ast must retrofit the same property onto git + mainstream languages. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index b812801..59c0ed7 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,31 @@ plainly, because they are easy to get wrong: ambiguously**. Making attribution "move and merge through every rewrite" is the hard engineering, not a property notes hand you. +### Prior art: Unison + +[Unison](https://www.unison-lang.org/) is the existence proof that this model +works — it makes **identity = the hash of the normalized AST** a language-level +primitive. Definitions are content-addressed (a Merkle DAG of code, dependencies +referenced by hash; bound variables normalized so alpha-equivalent terms hash +the same), and **names are separate metadata** mapping `name → hash`. The payoff +is exactly the node-identity wishlist, for free: a **rename** is an O(1) +repoint that never touches the hash, and a **move** isn't an event at all, so +attribution pinned to a hash survives both with zero heuristics and zero notes. + +Two honest caveats keep this from being a finished answer for git-ast: + +- **It doesn't dissolve identity *through an edit*.** Changing a body yields a + *new* hash — a new entity by construction. Unison records the succession in + the namespace history (`foo: hash₁ → hash₂`); "the same thing, changed" lives + in the name binding's history, not in a structural claim. That is a clean + answer, but the namespace is doing the work, not tree-matching. +- **Unison is greenfield; git-ast is a retrofit.** Unison gets all of this by + being a new language with a custom content-addressed codebase (not text files, + not git). 
git-ast must import the same property into mainstream languages that + are name- and position-based, stored in git, which is line/blob-addressed. + Unison never had to solve that retrofit — and the retrofit *is* the open + problem here. + ## Related projects - **[frond](https://github.com/bounded-systems/frond)** — the JS/TS counterpart. From 79c371e6b7b74bdd28b74487e917e17236e88c50 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 13:51:28 +0000 Subject: [PATCH 05/10] docs: add the model-store + projection architecture (Dolt for the model) Capture the "how" that makes node identity tractable: split into a content-addressed model store (the AST, identity recorded) and a text projection store (canonical source, what humans/git/CI see), kept in lockstep by the bidirectional transform. Use Dolt for the model store: AST as keyed rows, prolly-tree cell-level merge, and per-node attribution via dolt blame as a primitive. Honest boundaries: Dolt removes the plumbing not the semantics (you still define the keys = node identity), two heterogeneous stores carry a lockstep invariant, and conflicts move to cell-level rather than vanishing. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/README.md b/README.md index 59c0ed7..b666371 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,54 @@ Two honest caveats keep this from being a finished answer for git-ast: Unison never had to solve that retrofit — and the retrofit *is* the open problem here. +### Making it possible: a model store + a projection + +Unison is the *why*; this is a plausible *how* without inventing a new language. +Split the system into two versioned stores: + +- **The model store** holds the **content-addressed AST** — the source of truth, + where node identity lives durably and is *recorded* rather than recomputed. 
+- **The projection store** holds the **canonical text** — what humans edit and + what GitHub, CI, and ordinary `git` see. It is a *derived view* of the model. + +They stay in lockstep via the bidirectional transform: a text edit is parsed and +folded back into the model as an identity-preserving mutation; a model change is +re-projected to new canonical text. This is **projectional editing** (cf. MPS, +Hazel) married to dual version control — and git-ast's existing `clean`/`smudge` +round-trip is the seed of that transform. The example dir already anticipates +the split: `04_stored_blob` (the tree) and `05_generated_source` (the +projection) are exactly these two artifacts, promoted to two histories. + +**Use [Dolt](https://www.dolthub.com/) for the model store, not a second git.** +The AST is structured data, and the model store's real requirements *are* Dolt's +native features: + +- Model the AST as tables (`nodes(id, kind, …)`, `edges(parent, child, field, + ordinal)`, attribution columns). A node is a **row keyed by stable id** — that + key *is* its identity. +- Dolt's storage is a prolly tree (a Merkle search tree), so you keep + content-addressing and structural sharing **and** get efficient three-way + merge at **cell** granularity. Structural AST merge becomes a native database + merge instead of an algorithm you write. +- `dolt blame` / `dolt history` operate on a **row**, so **per-node attribution + is a built-in query** — the per-line-attribution goal, at node granularity, as + a primitive rather than something reconstructed. + +The honest boundaries, so this is an architecture and not a buzzword: + +- **Dolt removes the plumbing, not the semantics.** It makes identity cheap to + store, version, merge, and blame — but *you still choose the keys*, i.e. define + when two nodes are "the same node" (content hash vs. assigned id). That choice + is the original hard problem; Dolt does not make it for you. 
+- **Two heterogeneous stores** (Dolt model + git projection) means a lockstep + invariant between systems with different merge semantics, and the text→AST + reconcile heuristic still lives at that boundary — though now it matches an + edit against a *known prior tree with known ids*, which is far more tractable + than blind tree-diff. +- **Cell-level conflicts, not zero conflicts.** Two edits to the same node still + conflict; Dolt just gives a node/cell conflict instead of a line one — strictly + better, not magic. + ## Related projects - **[frond](https://github.com/bounded-systems/frond)** — the JS/TS counterpart. From 59a91b44bef413166bf96da628db74c70252d2bf Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 14:23:14 +0000 Subject: [PATCH 06/10] docs: model node identity as a vector with three ways to establish it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reframe the node-identity section around prior art and a defensible position: - Identity is a vector, not a scalar (cf. Kythe VName). Split the dimensions that aren't atomic: content shallow vs deep (Merkle), name lexeme vs binding, definition vs use/call (export surface as contract), location. Note dimensions differ in epistemic cost, and that content equivalence over-merges clones (equivalence != persistence). - Three families for establishing correspondence: by construction (CRDT TreeId / Unison hash), by operation (RefactoringMiner, CodeShovel), by snapshot matching (GumTree, the fallback). - Thesis: record the edit, don't reconstruct it — agent-authored code can emit edit provenance, making identity durable by construction. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/README.md b/README.md index b666371..8197dfc 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,84 @@ plainly, because they are easy to get wrong: ambiguously**. Making attribution "move and merge through every rewrite" is the hard engineering, not a property notes hand you. +### Identity is a vector, not a scalar + +"Is this the same node?" is underdetermined because identity is not one +property but several **independent dimensions** that come apart under different +edits. This is not new: Google [Kythe](https://kythe.io/docs/kythe-storage.html) +models every semantic node as a `VName` — a 5-tuple `(signature, corpus, root, +path, language)` — and states outright that "a node is a d-dimensional vector, +each dimension a scalar fact." Treat node identity the same way: a tuple, +resolved per purpose, not a single key. Stated precisely, splitting each +dimension that is not atomic: + +- **Content — shallow vs deep.** *Shallow* content is the node's own normalized + structure with identifiers alpha-renamed and dependencies abstracted; *deep* + (Merkle) content folds dependency identities into the hash. They have + *opposite* stability under change propagation: edit the body of a callee `g` and `f`'s + shallow content is unchanged while its deep content changes (the Unison + behaviour — deps by hash; merely renaming `g` changes neither). Collapsing the two is a category error — shallow is + stable but coarse, deep is precise but ripples on any transitive edit. +- **Name — lexeme vs binding.** The surface string (`parseConfig`) versus the + resolved declaration a use points to (a compiler `DefId`). A rename changes + the lexeme; the binding persists, and shadowing gives same-lexeme / + different-binding.
Most rename-robustness comes from binding identity, not the + string — which is why Kythe keys on `signature`, not the name. +- **Definition vs use/call.** A definition and its call sites are *separate* + dimensions: the def can be stable while callers churn, or the reverse. + "Track the def" and "track who references the def" are different identities + with different lifetimes ([SCIP/LSIF](https://github.com/sourcegraph/scip) + monikers separate them). This is also why the **export surface** is special: + at an API boundary use-identity becomes a *contract* — semver and + breaking-change detection key on it — so the otherwise-weak use axis becomes + the durable one. +- **Location.** Path, offset, sibling order. Weakest (breaks on every move), + most available (it is what text and git already have). Mostly a tiebreaker. + +Two things the single word "identity" hides: + +- **Dimensions differ in epistemic cost, not just in what they track.** Content + and Location are computable from text alone; Name(binding) and use/call need a + resolver or whole-program analysis. In a no-build, partial-file, or + multi-language context, half the vector does not exist — *availability*, not + preference, decides which dimensions you can use. +- **Equivalence is not persistence.** Content identity is *many-to-one*: two + distinct helpers with identical bodies share a content hash. Key blame on + content and you *fuse clones* into one false lineage. Content gives + equivalence classes; "the same entity over time" is a different relation + (correspondence between two versions, then persistence across N). + +### Three ways to establish it — and only one scales + +Given the vector, how do you decide two nodes correspond across versions? Three +families, not equal: + +1. **By construction** — assign a durable id at birth and have the editor carry + it. 
Kleppmann's [replicated-tree CRDT](https://martin.kleppmann.com/2021/10/07/crdt-tree-move-operation.html) + gives each node a `TreeId` that survives arbitrary concurrent moves (formally + verified); Unison's content hash is a static-language variant. Identity is + *recorded, never inferred*. +2. **By operation** — recognize the *edit*. + [RefactoringMiner](https://users.encs.concordia.ca/~nikolaos/publications/TSE_2020.pdf) + detects 100+ refactoring types at ~99.9% precision by applying AST + replacements until statements match (extract/inline resolved via call-site + context); [CodeShovel](https://www.ncbradley.com/publication/codeshovel/) + builds method histories through rename/move/signature changes — and tellingly + *struggles only when a body changes substantially during a move*, the exact + point where snapshot similarity runs out and only recorded provenance helps. +3. **By snapshot matching** — compare the vector across two anonymous versions + (GumTree-family). This is the **fallback** for when you failed to capture the + first two. + +The thesis: **record the edit, don't reconstruct it.** Snapshot matching is the +degraded mode; the identity vector above is what you fall back to when identity +was not recorded at write time. For an **agent-authored** codebase this flips +the problem — the agent *is* the editor, so it can emit the operation (rename, +extract, move) as first-class provenance, making identity durable by +construction. The strongest per-line attribution is not better post-hoc +matching; it is capturing edit-intent at the moment of the edit so matching is +never needed. 
+ ### Prior art: Unison [Unison](https://www.unison-lang.org/) is the existence proof that this model From 2591388a24122a0e0570bc34b804d47317d9780d Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 14:43:27 +0000 Subject: [PATCH 07/10] test: guard the determinism contract; docs: provenance pipeline - printer: add convergence, idempotence, and purity property tests, plus a "Determinism contract" doc section (convergent, idempotent, no ambient nondeterminism, fail-closed; canonical form versioned by grammar+printer). - README: surface the determinism/idempotence guarantee in status, and add a "provenance pipeline" section grounding each identity form (content shallow/deep, name lexeme/binding, def-vs-use, location, operation, authorship) in a have-today vs what's-needed table with prior art. Thesis: capture provenance as early as possible (the agent sits at stage 1). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 50 ++++++++++++++++++++++++++++++++++++- src/printer.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8197dfc..02eeccd 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,10 @@ implemented and runs through real Git: - On `git add`, the `clean` filter parses Rust with Tree-sitter and stores its **canonical** form; on `git checkout`, `smudge` returns it. Reformatting therefore never reaches history — two differently-formatted inputs that parse - to the same tree produce byte-identical blobs. + to the same tree produce byte-identical blobs. Canonicalization is + **deterministic and idempotent** (guarded by property tests; see the + "Determinism contract" in [`src/printer.rs`](./src/printer.rs)), with the + canonical form versioned by the `(grammar, printer)` pair. 
- It speaks Git's real `filter-process` pkt-line protocol, so `git add` / `git checkout` / `git diff` all work end to end. See [`examples/demo.sh`](./examples/demo.sh). @@ -259,6 +262,51 @@ The honest boundaries, so this is an architecture and not a buzzword: conflict; Dolt just gives a node/cell conflict instead of a line one — strictly better, not magic. +### A provenance pipeline (grounding each form of identity) + +Tie the pieces into one dataflow, edit → history, and the "record, don't +reconstruct" thesis becomes concrete. Each stage *captures* a form of identity; +the value of the project is moving capture as early (left) as possible, because +everything you fail to capture you must reconstruct heuristically later. + +1. **Capture** — the edit's *intent*. An LSP `rename`, an IDE refactor action, + or an **agent's own edit** is a *typed operation* (rename / extract / move), + not an anonymous text delta. This is where operation-identity is born; today + git-ast captures none of it (it sees only the result at `git add`). +2. **Canonicalize** — parse to a deterministic tree and emit canonical bytes. + This yields *shallow content* identity and a reproducible structure. **git-ast + does this today.** +3. **Resolve & identify** — run a name resolver (`DefId`-style) and build the + reference graph; assign stable node ids (content hash à la Unison, or a + CRDT-style `TreeId`). This populates the rest of the identity vector: + *binding*, *deep/Merkle content*, *def-vs-use*. +4. **Attribute** — record per-node provenance keyed to the id: author, time, and + **who/what** produced it (human vs. which agent/model), ideally signed. +5. **Project & preserve** — render canonical text back out (**git-ast does this**) + and carry identity + attribution through rebase / squash / cherry-pick / merge. + +Grounding the identity forms in *what we have vs. 
what else there is*: + +| Identity form | Pipeline stage | Have today | What's needed | Prior art | +|---|---|---|---|---| +| Content (shallow) | 2 Canonicalize | ✅ deterministic canonical bytes | expose subtree hashes | Merkle, Unison | +| Content (deep/Merkle) | 3 Resolve | — | dependency-resolved hash | Unison | +| Name — lexeme | 2 Canonicalize | ✅ present in the text | — | — | +| Name — binding | 3 Resolve | — | a resolver (`DefId`) | Kythe `signature`, LSP | +| Location | 2 Canonicalize | ✅ path / offset | — | git | +| Def vs use/call | 3 Resolve | — | reference graph | [SCIP/LSIF](https://github.com/sourcegraph/scip), Kythe | +| Operation / provenance | 1 Capture | — | editor / agent / LSP op log | RefactoringMiner, [CRDT](https://martin.kleppmann.com/2021/10/07/crdt-tree-move-operation.html), [in-toto](https://in-toto.io/) | +| Authorship (who/what) | 4 Attribute | ✅ commit author at file/line, now reformatting-proof | per-node, human-vs-agent, signed | git blame, [W3C PROV](https://www.w3.org/TR/prov-overview/), [SLSA](https://slsa.dev/)/Sigstore | + +Read the table by its columns: **git-ast today owns stages 2 and 5** (the +deterministic canonicalize/project round-trip) and, as a side effect, makes +existing line-level blame survive reformatting. The unbuilt, higher-value work +is stages **1, 3, 4** — capturing the operation, resolving the full identity +vector, and attaching signed per-node authorship. Reframed for an agent-authored +codebase: the agent sits at stage 1, so it can *emit* provenance instead of +leaving it to be recovered — which is exactly the floor reliable per-line +attribution needs. + ## Related projects - **[frond](https://github.com/bounded-systems/frond)** — the JS/TS counterpart. diff --git a/src/printer.rs b/src/printer.rs index feb38de..06f5913 100644 --- a/src/printer.rs +++ b/src/printer.rs @@ -15,6 +15,28 @@ //! guessing, so the filter can never silently corrupt code it cannot represent. //! 
Widening the subset is additive — each new node kind is one more arm in //! [`Printer::expr`] / [`Printer::stmt`]. +//! +//! ## Determinism contract +//! +//! [`canonicalize`] is a pure function of the parse tree, and the whole design +//! depends on it: +//! +//! - **Convergent** — any two formattings of the same program produce identical +//! bytes (so reformatting never reaches history). NOTE(review): patterns and comments are copied verbatim via `Printer::text`, so spacing inside them (e.g. `(a,b)` vs `(a, b)` in a `let` pattern) is not normalized and such inputs do not converge. +//! - **Idempotent** — `canonicalize(canonicalize(x)) == canonicalize(x)`, which +//! is what lets `smudge` be the identity and avoids edit/checkout/add churn. +//! - **No ambient nondeterminism** — no clock, locale, randomness, or float; the +//! one `HashMap` in the filter is protocol metadata read by key, never +//! iterated into output. Output is always `\n`-terminated UTF-8. +//! - **Fail-closed, not partial** — syntax errors are rejected up front, so +//! Tree-sitter's error recovery never yields a nondeterministic partial parse. NOTE(review): this guards syntax errors only; `Printer::stmt` prints just the `pattern` and `value` fields of a `let_declaration`, so anything else the grammar attaches to that node (a type annotation, `mut`, a `let`-`else` block) appears to be dropped silently rather than rejected — TODO confirm against the pinned grammar. +//! +//! The canonical form is *defined by* the pair `(tree-sitter-rust grammar +//! version, this printer)`. Cross-machine reproducibility therefore reduces to +//! pinning that pair (what `Cargo.lock` does). Upgrading either is a deliberate +//! one-time re-canonicalization, not silent per-user drift — the same discipline +//! teams apply to pinning a formatter version. These properties are guarded by +//! the `convergence_*`, `idempotent_*`, and `pure_repeated_calls_*` tests below. use crate::Error; use tree_sitter::{Node, Parser}; @@ -320,4 +342,50 @@ mod tests { let err = canonicalize(b"struct S { x: i32 }\n").unwrap_err(); assert!(matches!(err, Error::Generation(_))); } + + // --- Determinism contract (see the module-level "Determinism" docs) --- + + #[test] + fn convergence_many_formattings_one_program() { + // Every formatting of the same program must canonicalize to identical + // bytes — the property that keeps reformatting out of history.
+ let variants: &[&[u8]] = &[ + b"fn add(a:i32,b:i32)->i32{a+b}", + b"fn add( a : i32 , b : i32 ) -> i32 { a + b }", + b"fn add(a: i32,b: i32)->i32{\n\n a + b\n\n}\n", + b"fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n", + ]; + let canon = canonicalize(variants[0]).unwrap(); + for v in variants { + assert_eq!(canonicalize(v).unwrap(), canon, "variant diverged: {v:?}"); + } + } + + #[test] + fn idempotent_on_varied_inputs() { + // canonicalize is a fixed point on its own output: clean(clean(x)) == + // clean(x). This is what makes `smudge` safe as identity and prevents + // edit/checkout/add churn. + let inputs: &[&[u8]] = &[ + b"fn f()->i32{1+2}", + b"fn g(a: i32) -> i32 { let x = a; x }", + b"fn main(){let s=add(x,y);println!(\"{}\",s);}", + b"// leading comment\nfn h() -> i32 { 42 }", + ]; + for input in inputs { + let once = canonicalize(input).unwrap(); + let twice = canonicalize(&once).unwrap(); + assert_eq!(once, twice, "not idempotent for: {input:?}"); + } + } + + #[test] + fn pure_repeated_calls_are_byte_identical() { + // No clock/locale/randomness/hash-ordering leaks into the output. + let input = b"fn main(){let x=5;let y=10;let s=add(x,y);println!(\"{}\",s);}"; + let first = canonicalize(input).unwrap(); + for _ in 0..16 { + assert_eq!(canonicalize(input).unwrap(), first); + } + } } From 9b861268c7dd574f02192eb82883cdb0057a35f9 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 14:51:30 +0000 Subject: [PATCH 08/10] test: add executable Gherkin specs for the README's claims A cucumber-rs suite (tests/features/claims.feature) drives real git with the built binary as the clean/smudge filter, verifying end to end: reformatting shows no diff, a real change does, different formattings store byte-identical blobs, checkout round-trips to canonical source, syntax errors are rejected (fail-closed), and non-Rust files pass through unchanged. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- Cargo.lock | 1047 ++++++++++++++++++++++++++++++++- Cargo.toml | 11 + tests/claims.rs | 145 +++++ tests/features/claims.feature | 69 +++ 4 files changed, 1270 insertions(+), 2 deletions(-) create mode 100644 tests/claims.rs create mode 100644 tests/features/claims.feature diff --git a/Cargo.lock b/Cargo.lock index 4c536f0..5769f5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,84 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2a4385e2e34eb35d6b3efe798b9eb88096925d87726c0798709bf56d9ed84af3" + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "bstr" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cee35f73844aa3014bb606320a6c1f010249dbdf43342fe54b5a4f6a8ed4b79" +dependencies = [ + "memchr", + "serde_core", +] + +[[package]] +name = "bytecount" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" + [[package]] name = "cc" version = "1.2.65" @@ -21,26 +99,597 @@ dependencies = [ "shlex", ] +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", + "terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.59.0", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "cucumber" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cd12917efc3a8b069a4975ef3cb2f2d835d42d04b3814d90838488f9dd9bf69" +dependencies = [ + "anyhow", + "clap", + "console", + "cucumber-codegen", + "cucumber-expressions", + "derive_more", + "drain_filter_polyfill", + "either", + "futures", + "gherkin", + "globwalk", + "humantime", + "inventory", + "itertools", + "lazy-regex", + "linked-hash-map", + "once_cell", + "pin-project", + "regex", + "sealed", + "smart-default", +] + +[[package]] +name = "cucumber-codegen" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9e19cd9e8e7cfd79fbf844eb6a7334117973c01f6bad35571262b00891e60f1c" +dependencies = [ + "cucumber-expressions", + "inflections", + "itertools", + "proc-macro2", + "quote", + "regex", + "syn", + "synthez", +] + +[[package]] +name = "cucumber-expressions" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d794fed319eea24246fb5f57632f7ae38d61195817b7eb659455aa5bdd7c1810" +dependencies = [ + "derive_more", + "either", + "nom", + "nom_locate", + "regex", + "regex-syntax 0.7.5", +] + +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "drain_filter_polyfill" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "669a445ee724c5c69b1b06fe0b63e70a1c84bc9bb7d9696cd4f4e3ec45050408" + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" +dependencies = [ + "cfg-if", + "libc", + "r-efi", +] + +[[package]] +name = "gherkin" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20b79820c0df536d1f3a089a2fa958f61cb96ce9e0f3f8f507f5a31179567755" +dependencies = [ + "heck 0.4.1", + "peg", + "quote", + "serde", + "serde_json", + "syn", + "textwrap", + "thiserror", + "typed-builder", +] + [[package]] name = "git-ast" version = "0.1.0" dependencies = [ + "cucumber", + "tempfile", + "tokio", "tree-sitter", "tree-sitter-rust", ] +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax 0.8.11", +] + +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags", + "ignore", + "walkdir", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + +[[package]] +name = "ignore" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b915661dd01db3f05050265b2477bcc6527b3792388e2749b41623cc592be67d" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "inflections" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a257582fdcde896fd96463bf2d40eefea0580021c0712a0e2b028b60b47a837a" + +[[package]] +name = "inventory" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +dependencies = [ + "rustversion", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "lazy-regex" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bae91019476d3ec7147de9aa291cadb6d870abf2f3015d2da73a90325ac1496" +dependencies = [ + "lazy-regex-proc_macros", + 
"once_cell", + "regex", +] + +[[package]] +name = "lazy-regex-proc_macros" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4de9c1e1439d8b7b3061b2d209809f447ca33241733d9a3c01eabf2dc8d94358" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn", +] + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" + [[package]] name = "memchr" version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom_locate" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3" +dependencies = [ + "bytecount", + "memchr", + 
"nom", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "peg" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f76678828272f177ac33b7e2ac2e3e73cc6c1cd1e3e387928aa69562fa51367" +dependencies = [ + "peg-macros", + "peg-runtime", +] + +[[package]] +name = "peg-macros" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "636d60acf97633e48d266d7415a9355d4389cea327a193f87df395d88cd2b14d" +dependencies = [ + "peg-runtime", + "proc-macro2", + "quote", +] + +[[package]] +name = "peg-runtime" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9555b1514d2d99d78150d3c799d4c357a3e2c2a8062cd108e93a06d9057629c5" + +[[package]] +name = "pin-project" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "regex" version = "1.12.4" @@ -50,7 +699,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.11", ] [[package]] @@ -61,21 +710,258 @@ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.11", ] +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + [[package]] name = "regex-syntax" version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "sealed" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a8caec23b7800fb97971a1c6ae365b6239aaeddfb934d6265f8505e795699d" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "shlex" version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smart-default" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eb01866308440fc64d6c44d9e86c5cc17adfe33c4d6eed55da9145044d0ffc1" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smawk" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8e2fb0f499abb4d162f2bedad68f5ef91a1682b5a03596ddb67efd37768d100" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synthez" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3d2c2202510a1e186e63e596d9318c91a8cbe85cd1a56a7be0c333e5f59ec8d" +dependencies = [ + "syn", + "synthez-codegen", + "synthez-core", +] + +[[package]] +name = "synthez-codegen" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f724aa6d44b7162f3158a57bccd871a77b39a4aef737e01bcdff41f4772c7746" +dependencies = [ + "syn", + "synthez-core", +] + +[[package]] +name = "synthez-core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bfa6ec52465e2425fd43ce5bbbe0f0b623964f7c63feb6b10980e816c654ea" +dependencies = [ + "proc-macro2", + "quote", + "sealed", + "syn", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "terminal_size" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" +dependencies = [ + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "textwrap" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" +dependencies = [ + "smawk", + "unicode-linebreak", + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tree-sitter" version = "0.22.6" @@ -95,3 +981,160 @@ dependencies = [ "cc", "tree-sitter", ] + +[[package]] +name = "typed-builder" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe83c85a85875e8c4cb9ce4a890f05b23d38cd0d47647db7895d3d2a79566d2" +dependencies = [ + "typed-builder-macro", +] + +[[package]] +name = "typed-builder-macro" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29a3151c41d0b13e3d011f98adc24434560ef06673a155a6c7f66b9879eecce2" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-linebreak" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 9a918e1..eacddce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,14 @@ repository = "https://github.com/bounded-systems/git-ast" [dependencies] tree-sitter = "0.22" tree-sitter-rust = "0.21" + +# The Gherkin suite (tests/features/*.feature) executes the README's claims +# against real `git`, driving the built binary as the clean/smudge filter. +[dev-dependencies] +cucumber = "0.21" +tokio = { version = "1", features = ["macros", "rt-multi-thread"] } +tempfile = "3" + +[[test]] +name = "claims" +harness = false diff --git a/tests/claims.rs b/tests/claims.rs new file mode 100644 index 0000000..6c492f7 --- /dev/null +++ b/tests/claims.rs @@ -0,0 +1,145 @@ +//! Executable specification of git-ast's behavioural claims. +//! +//! Each scenario in `tests/features/claims.feature` drives **real git** with the +//! built `git-ast` binary installed as the clean/smudge filter, so the README's +//! claims (reformatting is invisible, determinism, fail-closed, passthrough, +//! round-trip) are verified end to end rather than asserted in prose. +//! +//! Run with `cargo test --test claims`. 
+ +use std::path::Path; +use std::process::Command; + +use cucumber::gherkin::Step; +use cucumber::{given, then, when, World}; +use tempfile::TempDir; + +#[derive(Debug, Default, World)] +struct AstWorld { + repo: Option<TempDir>, + last_add_code: i32, +} + +impl AstWorld { + fn dir(&self) -> &Path { + self.repo + .as_ref() + .expect("repository not initialized") + .path() + } + + /// Run a git command in the repo; return (exit code, stdout). + fn git(&self, args: &[&str]) -> (i32, String) { + let out = Command::new("git") + .args(args) + .current_dir(self.dir()) + .output() + .expect("failed to run git"); + ( + out.status.code().unwrap_or(-1), + String::from_utf8_lossy(&out.stdout).into_owned(), + ) + } + + fn write(&self, name: &str, content: &str) { + std::fs::write(self.dir().join(name), content).expect("failed to write file"); + } + + fn stored_blob(&self, name: &str) -> String { + self.git(&["cat-file", "-p", &format!(":{name}")]).1 + } +} + +#[given("a repository with git-ast installed")] +async fn install(world: &mut AstWorld) { + let repo = tempfile::tempdir().expect("tempdir"); + let dir = repo.path().to_path_buf(); + let run = |args: &[&str]| { + Command::new("git") + .args(args) + .current_dir(&dir) + .status() + .expect("failed to run git"); + }; + run(&["init", "-q"]); + run(&["config", "user.email", "test@example.com"]); + run(&["config", "user.name", "git-ast test"]); + let process = format!("{} filter-process", env!("CARGO_BIN_EXE_git-ast")); + run(&["config", "filter.git-ast.process", &process]); + run(&["config", "filter.git-ast.required", "true"]); + std::fs::write(dir.join(".gitattributes"), "*.rs filter=git-ast\n").expect("write attrs"); + world.repo = Some(repo); +} + +#[when(expr = "I stage {string} containing:")] +async fn stage_doc(world: &mut AstWorld, name: String, step: &Step) { + let body = step.docstring.clone().unwrap_or_default(); + world.write(&name, &body); + world.last_add_code = world.git(&["add", &name]).0; +} + +#[when(expr = "I stage
{string} containing {string}")] +async fn stage_inline(world: &mut AstWorld, name: String, content: String) { + world.write(&name, &content); + world.last_add_code = world.git(&["add", &name]).0; +} + +#[when("I commit")] +async fn commit(world: &mut AstWorld) { + world.git(&["commit", "-qm", "snapshot"]); +} + +#[when(expr = "I overwrite {string} with:")] +async fn overwrite(world: &mut AstWorld, name: String, step: &Step) { + world.write(&name, &step.docstring.clone().unwrap_or_default()); +} + +#[when(expr = "I check out {string} fresh")] +async fn checkout_fresh(world: &mut AstWorld, name: String) { + std::fs::remove_file(world.dir().join(&name)).ok(); + world.git(&["checkout", "--", &name]); +} + +#[then(expr = "the stored blobs for {string} and {string} are identical")] +async fn blobs_identical(world: &mut AstWorld, a: String, b: String) { + assert_eq!(world.stored_blob(&a), world.stored_blob(&b)); +} + +#[then(expr = "the stored blob for {string} is {string}")] +async fn blob_is_inline(world: &mut AstWorld, name: String, want: String) { + // Exact compare: proves non-Rust passthrough preserves bytes verbatim. + assert_eq!(world.stored_blob(&name), want); +} + +#[then(expr = "the working file {string} is:")] +async fn working_is(world: &mut AstWorld, name: String, step: &Step) { + let want = step.docstring.clone().unwrap_or_default(); + let got = std::fs::read_to_string(world.dir().join(&name)).expect("read working file"); + // Compare canonical content; exact leading/trailing newline bytes are + // guarded separately by the printer unit tests. 
+ assert_eq!(got.trim_matches('\n'), want.trim_matches('\n')); +} + +#[then(expr = "{string} shows no diff")] +async fn shows_no_diff(world: &mut AstWorld, name: String) { + let (code, _) = world.git(&["diff", "--quiet", "--", &name]); + assert_eq!(code, 0, "expected no diff for {name}"); +} + +#[then(expr = "{string} shows a diff")] +async fn shows_a_diff(world: &mut AstWorld, name: String) { + let (code, _) = world.git(&["diff", "--quiet", "--", &name]); + assert_ne!(code, 0, "expected a diff for {name}"); +} + +#[then(expr = "staging {string} containing {string} is rejected")] +async fn staging_rejected(world: &mut AstWorld, name: String, content: String) { + world.write(&name, &content); + let (code, _) = world.git(&["add", &name]); + assert_ne!(code, 0, "expected `git add {name}` to fail (fail-closed)"); +} + +#[tokio::main] +async fn main() { + AstWorld::cucumber().run_and_exit("tests/features").await; +} diff --git a/tests/features/claims.feature b/tests/features/claims.feature new file mode 100644 index 0000000..7709dfd --- /dev/null +++ b/tests/features/claims.feature @@ -0,0 +1,69 @@ +Feature: git-ast canonical clean/smudge round-trip + These scenarios are the README's behavioural claims, made executable against + real git with the built binary installed as the clean/smudge filter. 
+ + Background: + Given a repository with git-ast installed + + Scenario: Reformatting never reaches history + When I stage "calc.rs" containing: + """ + fn add(a:i32,b:i32)->i32{a+b} + """ + And I commit + And I overwrite "calc.rs" with: + """ + fn add( a : i32 , b : i32 ) -> i32 { + + a + b + } + """ + Then "calc.rs" shows no diff + + Scenario: A real change still shows a diff + When I stage "calc.rs" containing: + """ + fn add(a:i32,b:i32)->i32{a+b} + """ + And I commit + And I overwrite "calc.rs" with: + """ + fn add(a: i32, b: i32) -> i32 { + a - b + } + """ + Then "calc.rs" shows a diff + + Scenario: Different formattings store byte-identical blobs + When I stage "a.rs" containing: + """ + fn add(a:i32,b:i32)->i32{a+b} + """ + And I stage "b.rs" containing: + """ + fn add( a : i32 , b : i32 ) -> i32 { + a + b + } + """ + Then the stored blobs for "a.rs" and "b.rs" are identical + + Scenario: Round-trip restores canonical source on checkout + When I stage "f.rs" containing: + """ + fn f()->i32{1+2} + """ + And I commit + And I check out "f.rs" fresh + Then the working file "f.rs" is: + """ + fn f() -> i32 { + 1 + 2 + } + """ + + Scenario: Syntax errors are rejected (fail-closed) + Then staging "bad.rs" containing "fn main( {" is rejected + + Scenario: Non-Rust files pass through unchanged + When I stage "notes.txt" containing " spaced text " + Then the stored blob for "notes.txt" is " spaced text " From caca55329afb45036f48cf54e6f216ebb72d31e8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 14:53:00 +0000 Subject: [PATCH 09/10] docs: where AST storage hooks in (filter = codec, hooks = recorder) Clarify the operational split: clean is the right place to parse/emit the content-addressed AST (pure, deterministic), but clean/smudge are the wrong place to write the model store (no commit context; filters run during diff/ stash/archive/checkout). 
Stateful model-store writes belong in commit/ref hooks (post-commit, post-rewrite, post-checkout, post-receive). git stores what is reparseable (canonical text); the model store stores what is not (operation identity, who/which-agent authorship). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/README.md b/README.md index 02eeccd..c0ecf6c 100644 --- a/README.md +++ b/README.md @@ -307,6 +307,43 @@ codebase: the agent sits at stage 1, so it can *emit* provenance instead of leaving it to be recovered — which is exactly the floor reliable per-line attribution needs. +### Where AST storage hooks in + +The natural question is whether the model store is written from the +`clean`/`smudge` filter. Partly: + +- **`clean` is the right place to *parse*** — it already does, to canonicalize — + so emitting the content-addressed AST (subtree hashes, the identity vector) is + nearly free there and stays pure and deterministic. +- **`clean`/`smudge` are the wrong place to *write* the model store.** A filter + has no commit context — at `clean` time the blob is not committed, so there is + no SHA, author, or parent to attribute against or to reconcile the previous + AST with. Worse, filters also run during `git diff`, `stash`, `archive`, and + checkout, so a write there would record spurious or read-only states and break + git's assumption that filters are pure content-in/content-out transforms. + +So split responsibilities: **the filter is the codec, commit/ref hooks are the +recorder.** The stateful model-store writes belong in `post-commit` / +`post-merge` (record the new AST + attribution against the parent), `post-rewrite` +(the squash/rebase/amend path — surviving history rewrites), `post-checkout` +(keep model and projection in lockstep), and server-side `post-receive` +(authoritative build). 
In pipeline terms: `clean` = stage 2, the hooks = +stages 3–4, `smudge` = stage 5. + +This also clarifies *what* the model store holds, and why it is not redundant +with the canonical text in git: + +- **git** stores the canonical text — the content of record, and (given the + round-trip) the most compressed encoding of it. +- **the model store (Dolt)** holds the *derived* model, which is two things: a + **rebuildable index** (AST structure and subtree hashes — recomputable any + time by reparsing the text, so effectively a cache) and the **non-rebuildable + provenance** (operation identity, and who/which-agent authored each node) that + is *not* a function of the text and therefore must be durably stored. + +That last distinction is the whole reason a model store exists: git stores what +is reparseable; the model store stores what is not. + ## Related projects - **[frond](https://github.com/bounded-systems/frond)** — the JS/TS counterpart. From 3bc71ac0f699878001dea46da91f64416172f170 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 15:01:44 +0000 Subject: [PATCH 10/10] feat: add `inspect` read verb (verbspec POC); docs: the verb interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Small proof-of-concept of the first verbspec read verb — "look at the AST": `git-ast inspect [FILE]` parses Rust and lists top-level definitions, each tagged with a deterministic content hash over canonical form, so identity is invariant under reformatting. Backed by `printer::inspect` + tests. README: add "The interface: verbs (verbspec)" — read verbs (inspect/find/ blame) and write verbs (rename/extract/generate = stage-1 provenance capture), with verbspec as the author-once-project-everywhere delivery vehicle that puts an agent at stage 1. Link verbspec under related projects. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NCp6PSoWKvsbFWyav6CeeC --- README.md | 36 +++++++++++++++++ src/main.rs | 22 ++++++++++- src/printer.rs | 103 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 152 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c0ecf6c..e6f2457 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ implemented and runs through real Git: - It speaks Git's real `filter-process` pkt-line protocol, so `git add` / `git checkout` / `git diff` all work end to end. See [`examples/demo.sh`](./examples/demo.sh). +- `git-ast inspect [FILE]` lists top-level definitions with a + **formatting-invariant content hash** — a proof-of-concept of the first + read verb (see "The interface: verbs" below). Honest boundaries: @@ -344,6 +347,34 @@ with the canonical text in git: That last distinction is the whole reason a model store exists: git stores what is reparseable; the model store stores what is not. +### The interface: verbs (verbspec) + +The AST surface is naturally a set of **verbs** — operations with a typed input +and output: + +- **Read verbs — look at the AST, and at history on the AST.** `inspect` / `find` + / `refs` (query a snapshot) and `blame` / `log` / `trace` (per-node history). + The history verbs are the per-line-attribution goal re-expressed on nodes, and + run over the model store. The query side is achievable first. +- **Write verbs — mutate the AST to generate or refactor.** `rename` / `extract` + / `inline` / `move` / `generate`. Mutating the tree directly makes each edit a + *typed operation*, which is stage-1 provenance capture — identity by + construction, the "record, don't reconstruct" thesis made operational. These + depend on the resolver (a safe `rename` must update every reference), so they + sequence after identity. 
+ +[**verbspec**](https://github.com/bounded-systems/verbspec) is the delivery +vehicle: a spec-driven framework where you *author a verb once and project it +everywhere* — CLI, MCP, Anthropic tools — from one schema. Authoring the AST +verbs as verbspec verbs is exactly how an **agent** gets AST query / history / +mutation as first-class tools, which is what puts the agent at stage 1 of the +provenance pipeline. + +A first read verb ships today: [`git-ast inspect`](./src/printer.rs) lists +top-level definitions with a content hash that is invariant under formatting +(`inspect`, shaped as a verb with `input: { source }`, `output: Def[]`). It is a +proof-of-concept of the read surface — history and write verbs are future work. + ## Related projects - **[frond](https://github.com/bounded-systems/frond)** — the JS/TS counterpart. @@ -353,6 +384,11 @@ is reparseable; the model store stores what is not. round-trip *fidelity* check — proving a printer can reproduce source faithfully — which is exactly the prerequisite git-ast's canonical printer depends on, so the two projects validate the same idea across two toolchains. +- **[verbspec](https://github.com/bounded-systems/verbspec)** — a spec-driven CLI + framework: author a verb once (a typed schema with input/output/run) and + project it to CLI, MCP, and Anthropic tools. The intended surface for git-ast's + AST read/write verbs, so the same operations reach humans, agents, and CI from + one definition. See "The interface: verbs" above. ## License diff --git a/src/main.rs b/src/main.rs index a01657a..75bbdc6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,9 +4,10 @@ //! implement the working clean/smudge round-trip; `diff-driver`/`merge-driver` //! remain placeholders (they await stable node identity — see `docs/`). 
+use std::io::Read; use std::process::ExitCode; -use git_ast::{drivers, filters, setup}; +use git_ast::{drivers, filters, printer, setup, Error}; fn main() -> ExitCode { let args: Vec<String> = std::env::args().skip(1).collect(); @@ -17,6 +18,7 @@ fn main() -> ExitCode { let result = match cmd.as_str() { "setup" => setup::run().map(|()| 0u8), + "inspect" => run_inspect(rest), "filter-process" => filters::run_long_running_filter().map(|()| 0u8), "diff-driver" => drivers::run_diff_driver(rest).map(|()| 0u8), "merge-driver" => drivers::run_merge_driver(rest).map(|()| 0u8), @@ -44,6 +46,23 @@ fn main() -> ExitCode { } } +/// The `inspect` read verb: list top-level definitions with a +/// formatting-invariant content hash. Reads a file argument, or stdin. +fn run_inspect(args: &[String]) -> Result<u8, Error> { + let source = match args.first() { + Some(path) => std::fs::read(path)?, + None => { + let mut buf = Vec::new(); + std::io::stdin().read_to_end(&mut buf)?; + buf + } + }; + for def in printer::inspect(&source)? { + println!("{} {} {}", def.kind, def.name, def.content_hash); + } + Ok(0) +} + fn print_help() { eprintln!( "git-ast — language-aware Git\n\ @@ -53,6 +72,7 @@ fn print_help() { \n\ SUBCOMMANDS:\n \ setup Enable the *.rs clean/smudge filter in this repo\n \ + inspect [FILE] List top-level defs with a formatting-invariant hash\n \ filter-process Clean/smudge long-running filter (canonicalizes Rust)\n \ diff-driver Git diff driver (placeholder)\n \ merge-driver Git merge driver (placeholder)\n \ diff --git a/src/printer.rs b/src/printer.rs index 06f5913..50c0f8a 100644 --- a/src/printer.rs +++ b/src/printer.rs @@ -49,6 +49,17 @@ const INDENT: &str = " "; /// [`Error::Generation`] if it parses but contains a construct outside the /// supported subset.
pub fn canonicalize(source: &[u8]) -> Result<Vec<u8>, Error> { + let tree = parse(source)?; + let mut printer = Printer { + src: source, + out: String::new(), + }; + printer.source_file(tree.root_node())?; + Ok(printer.out.into_bytes()) +} + +/// Parse `source` as Rust, rejecting anything that does not parse cleanly. +fn parse(source: &[u8]) -> Result<tree_sitter::Tree, Error> { let mut parser = Parser::new(); parser .set_language(&tree_sitter_rust::language()) @@ -56,19 +67,76 @@ pub fn canonicalize(source: &[u8]) -> Result<Vec<u8>, Error> { let tree = parser .parse(source, None) .ok_or_else(|| Error::Parsing("parser returned no tree".to_string()))?; - let root = tree.root_node(); - if root.has_error() { + if tree.root_node().has_error() { return Err(Error::Parsing( "source has syntax errors; fix them or bypass the filter".to_string(), )); } + Ok(tree) +} - let mut printer = Printer { - src: source, - out: String::new(), - }; - printer.source_file(root)?; - Ok(printer.out.into_bytes()) +/// A top-level definition surfaced by the [`inspect`] read verb. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Def { + /// Kind of definition (currently always `"fn"`). + pub kind: &'static str, + /// The declared name. + pub name: String, + /// Content identity: a deterministic hash of the node's *canonical* form, + /// so it is **stable across reformatting**. Note this hash couples name and + /// body — separating the pure-content axis (alpha-normalizing the name) is + /// the refinement described under "Identity is a vector" in the README. + pub content_hash: String, +} + +/// The first **verbspec read verb** — "look at the AST." +/// +/// Conceptually a verb with `input: { source }` and `output: Def[]`: it parses +/// Rust and lists the top-level definitions, each tagged with a content hash +/// that is invariant under formatting. This is a small proof-of-concept of the +/// read surface (query the AST); history verbs (per-node blame) need the model +/// store.
Definitions whose bodies fall outside the supported subset are +/// skipped rather than failing the whole listing. +pub fn inspect(source: &[u8]) -> Result<Vec<Def>, Error> { + let tree = parse(source)?; + let root = tree.root_node(); + let mut defs = Vec::new(); + let mut cursor = root.walk(); + for item in root.named_children(&mut cursor) { + if item.kind() != "function_item" { + continue; + } + let mut printer = Printer { + src: source, + out: String::new(), + }; + if printer.function(item, 0).is_err() { + continue; // can't hash what we can't canonicalize + } + let name = item + .child_by_field_name("name") + .and_then(|n| n.utf8_text(source).ok()) + .unwrap_or("?") + .to_string(); + defs.push(Def { + kind: "fn", + name, + content_hash: fnv1a_hex(printer.out.as_bytes()), + }); + } + Ok(defs) +} + +/// Dependency-free, deterministic 64-bit FNV-1a hash, rendered as hex. Adequate +/// for a content-identity proof-of-concept; a real model store would use a +/// cryptographic hash. +fn fnv1a_hex(bytes: &[u8]) -> String { + let mut hash: u64 = 0xcbf2_9ce4_8422_2325; + for &byte in bytes { + hash ^= byte as u64; + hash = hash.wrapping_mul(0x0000_0100_0000_01b3); + } + format!("{hash:016x}") } struct Printer<'a> { @@ -379,6 +447,25 @@ mod tests { } } + #[test] + fn inspect_content_hash_is_stable_across_formatting() { + // The read verb's headline: a definition's content identity survives + // reformatting (the hash is over canonical form).
+ let messy = canonicalize(b"fn add(a:i32,b:i32)->i32{a+b}").unwrap(); + let tidy = inspect(b"fn add( a : i32 , b : i32 ) -> i32 {\n\n a + b\n}\n").unwrap(); + let from_messy = inspect(&messy).unwrap(); + assert_eq!(from_messy, tidy); + assert_eq!(from_messy.len(), 1); + assert_eq!(from_messy[0].name, "add"); + } + + #[test] + fn inspect_distinguishes_bodies_and_lists_in_order() { + let defs = inspect(b"fn a()->i32{1+2}\nfn b()->i32{1-2}").unwrap(); + assert_eq!(defs.iter().map(|d| &d.name).collect::<Vec<_>>(), ["a", "b"]); + assert_ne!(defs[0].content_hash, defs[1].content_hash); + } + #[test] fn pure_repeated_calls_are_byte_identical() { // No clock/locale/randomness/hash-ordering leaks into the output.