Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,133 changes: 1,133 additions & 0 deletions Cargo.lock

Large diffs are not rendered by default.

20 changes: 17 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,21 @@ description = "Language-aware Git: AST-based diffs and merges instead of line-ba
license = "MIT"
repository = "https://github.com/bounded-systems/git-ast"

# No dependencies yet. The design uses Tree-sitter for parsing and libgit2
# (git2) for plumbing; those are added when the corresponding logic lands,
# rather than declared while unused.
# Tree-sitter parses source into a concrete syntax tree; the `printer` module
# walks that tree and re-emits canonical source. No libgit2 needed: the
# clean/smudge filter speaks Git's pkt-line `filter-process` protocol over
# stdin/stdout, implemented in `pktline`.
[dependencies]
tree-sitter = "0.22"
tree-sitter-rust = "0.21"

# The Gherkin suite (tests/features/*.feature) executes the README's claims
# against real `git`, driving the built binary as the clean/smudge filter.
[dev-dependencies]
cucumber = "0.21"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
tempfile = "3"

[[test]]
name = "claims"
harness = false
353 changes: 341 additions & 12 deletions README.md

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions examples/demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# End-to-end demo of the git-ast clean/smudge round-trip.
#
# Builds the binary, creates a throwaway git repo with the filter installed,
# and shows three things:
# 1. messy Rust is stored canonically (clean),
# 2. a pure reformat produces no diff (formatting never enters history),
# 3. a real logic change still shows a clean, minimal diff.
#
# Usage: examples/demo.sh
set -euo pipefail

# Resolve the repository root relative to this script so the demo can be
# launched from any working directory.
repo_root="$(cd "$(dirname "$0")/.." && pwd)"
cd "$repo_root"

echo "==> building git-ast (release)"
cargo build --release --quiet
bin="$repo_root/target/release/git-ast"

# Scratch repository; removed again on every exit path (success or failure).
work="$(mktemp -d)"
trap 'rm -rf "$work"' EXIT
cd "$work"
git init -q
"$bin" setup >/dev/null

echo
echo "==> writing deliberately messy Rust"
cat > calc.rs <<'EOF'
fn add(a:i32,b:i32)->i32{
// Simple addition
a+b}
fn main(){let x=5;let y =10;
let sum=add(x,y);println!("Sum: {}",sum);}
EOF
cat calc.rs

echo
echo "==> git add, then show the STORED blob (clean filter output)"
git add calc.rs
git cat-file -p :calc.rs
# Supply an identity inline: the scratch repo has no local identity, and the
# commit would abort on machines (e.g. CI) without a global one.
git -c user.name="git-ast demo" -c user.email="demo@example.invalid" \
    commit -qm "add calc.rs"

echo
echo "==> reformat the file wildly, then check git diff"
cat > calc.rs <<'EOF'
fn add(a: i32, b: i32) -> i32 { // Simple addition
a+b }


fn main() {
let x = 5;
let y = 10;
let sum = add( x , y );
println!( "Sum: {}" , sum );
}
EOF
if git diff --quiet; then
    echo "(no diff — formatting churn never entered history)"
else
    echo "UNEXPECTED: reformatting produced a diff"; git diff; exit 1
fi

echo
echo "==> make a real change (a + b -> a - b) and show the diff"
# `sed -i` without a suffix argument is a GNU extension and fails on BSD/macOS
# sed, so edit through a temporary file to stay portable.
sed 's/a+b/a-b/' calc.rs > calc.rs.tmp
mv calc.rs.tmp calc.rs
git --no-pager diff
238 changes: 190 additions & 48 deletions src/filters.rs
Original file line number Diff line number Diff line change
@@ -1,68 +1,210 @@
//! Clean/smudge filter.
//! Clean/smudge filter over Git's long-running `filter-process` protocol.
//!
//! `clean` turns source text into a serialized tree on `git add`; `smudge` turns
//! it back into source on checkout. Both are placeholders: `clean` prefixes the
//! content with a marker and `smudge` strips it. A real implementation would
//! parse with Tree-sitter and pretty-print deterministically.

use crate::Error;
use std::io::Read;

/// Marker that the placeholder clean/smudge round-trip uses to stand in for a
/// serialized tree.
const SERIALIZED_MARKER: &[u8] = b"SERIALIZED:";

/// Run the long-running filter process.
///
/// Placeholder: reads stdin to EOF and reports the byte count instead of
/// speaking Git's pkt-line filter protocol.
//! `clean` (on `git add`) parses Rust source and stores its canonical form, so
//! reformatting never reaches history. `smudge` (on `git checkout`) is identity:
//! the stored bytes are already canonical source. Only `*.rs` paths are
//! transformed; anything else passes through untouched.
//!
//! The conversation is the standard one documented in
//! `Documentation/gitattributes.txt`:
//!
//! 1. Handshake — exchange `git-filter-client`/`git-filter-server` + `version=2`.
//! 2. Capabilities — we advertise `clean` and `smudge`.
//! 3. Per blob — read `command`/`pathname` metadata then content, reply with a
//! status line and the transformed content.

use crate::pktline::{self, Packet};
use crate::{printer, Error};
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::path::Path;

/// Run the long-running filter process against stdin/stdout.
///
/// Locks both standard streams once and hands them to [`converse`], which
/// drives the whole conversation. Nothing may be read from stdin before that
/// point: consuming the stream up front would leave the handshake with an
/// already-exhausted input.
///
/// # Errors
///
/// Returns whatever error [`converse`] reports.
pub fn run_long_running_filter() -> Result<(), Error> {
    let stdin = io::stdin();
    let stdout = io::stdout();
    let mut input = stdin.lock();
    let mut output = stdout.lock();
    converse(&mut input, &mut output)
}

/// Run the complete filter conversation over caller-supplied streams, which
/// lets the protocol be exercised without spawning a real Git process.
///
/// Performs the handshake, then the capability exchange, then serves blob
/// requests one at a time until [`process_one`] reports that the input ended.
///
/// # Errors
///
/// Propagates the first error returned by any of the three phases.
pub fn converse(input: &mut impl Read, output: &mut impl Write) -> Result<(), Error> {
    handshake(input, output)?;
    capabilities(input, output)?;

    // Keep serving blobs; `Ok(false)` means the client closed the pipe.
    loop {
        if !process_one(input, output)? {
            return Ok(());
        }
    }
}

/// `clean`: source text -> serialized tree.
///
/// Placeholder: prefixes the content with [`SERIALIZED_MARKER`].
#[allow(dead_code)]
fn perform_clean(input: &[u8], pathname: &str) -> Result<Vec<u8>, Error> {
eprintln!("[filter] clean {pathname}");
let mut out = SERIALIZED_MARKER.to_vec();
out.extend_from_slice(input);
Ok(out)
fn handshake(input: &mut impl Read, output: &mut impl Write) -> Result<(), Error> {
let intro = pktline::read_until_flush(input)?
.ok_or_else(|| protocol("client closed during handshake"))?;
let intro = String::from_utf8_lossy(&intro);
if !intro.contains("git-filter-client") {
return Err(protocol("missing git-filter-client welcome"));
}
if !intro.contains("version=2") {
return Err(protocol("client did not offer version=2"));
}
pktline::write_text_packet(output, "git-filter-server")?;
pktline::write_text_packet(output, "version=2")?;
pktline::write_flush(output)?;
output.flush()?;
Ok(())
}

/// Exchange capabilities: consume the client's advertised list (its contents
/// are not inspected) and reply with `capability=clean` and
/// `capability=smudge`, followed by a flush.
///
/// # Errors
///
/// Returns a protocol error if the stream ends before the client's list is
/// complete; I/O and pkt-line errors are propagated.
fn capabilities(input: &mut impl Read, output: &mut impl Write) -> Result<(), Error> {
    // Only the presence of the section matters, not what it contains.
    if pktline::read_until_flush(input)?.is_none() {
        return Err(protocol("client closed during capabilities"));
    }

    for advertised in ["capability=clean", "capability=smudge"] {
        pktline::write_text_packet(output, advertised)?;
    }
    pktline::write_flush(output)?;
    output.flush()?;
    Ok(())
}

/// `smudge`: serialized tree -> source text.
///
/// Placeholder: strips [`SERIALIZED_MARKER`] if present, otherwise passes through.
#[allow(dead_code)]
fn perform_smudge(input: &[u8], pathname: &str) -> Result<Vec<u8>, Error> {
eprintln!("[filter] smudge {pathname}");
match input.strip_prefix(SERIALIZED_MARKER) {
Some(rest) => Ok(rest.to_vec()),
None => Ok(input.to_vec()),
/// Serve one blob request.
///
/// Reads the metadata section and then the content section, runs
/// [`transform`], and writes the reply. Returns `Ok(false)` when the input
/// ended before any metadata arrived (no more work) and `Ok(true)` once a
/// request has been answered, whether the transform succeeded or not.
///
/// # Errors
///
/// Returns a protocol error if the stream ends inside the content section;
/// I/O and pkt-line errors are propagated. A failing transform is *not* an
/// error here: it is reported to the client as `status=error`.
fn process_one(input: &mut impl Read, output: &mut impl Write) -> Result<bool, Error> {
    // Metadata: key=value lines up to a flush. End of input here is the
    // ordinary shutdown signal.
    let Some(meta) = read_meta(input)? else {
        return Ok(false);
    };
    // Missing keys fall back to the empty string.
    let command = meta.get("command").map_or("", String::as_str);
    let pathname = meta.get("pathname").map_or("", String::as_str);

    // Content: everything up to the next flush.
    let content = match pktline::read_until_flush(input)? {
        Some(bytes) => bytes,
        None => return Err(protocol("client closed mid-content")),
    };

    match transform(command, pathname, &content) {
        Err(e) => {
            // Tell the client this blob failed and log the reason to stderr.
            eprintln!("git-ast: {pathname}: {e}");
            pktline::write_text_packet(output, "status=error")?;
            pktline::write_flush(output)?;
        }
        Ok(out) => {
            pktline::write_text_packet(output, "status=success")?;
            pktline::write_flush(output)?;
            pktline::write_content(output, &out)?;
            // Empty trailing status list: status=success stays in effect.
            pktline::write_flush(output)?;
        }
    }
    output.flush()?;
    Ok(true)
}

/// Apply the requested transform. `clean` canonicalizes Rust; `smudge` is
/// identity. Non-Rust paths pass through unchanged in both directions.
fn transform(command: &str, pathname: &str, content: &[u8]) -> Result<Vec<u8>, Error> {
let is_rust = Path::new(pathname).extension().is_some_and(|e| e == "rs");
match command {
"clean" if is_rust => printer::canonicalize(content),
"smudge" | "clean" => Ok(content.to_vec()),
other => Err(Error::Driver(format!("unknown filter command `{other}`"))),
}
}

/// Collect one metadata section into a key/value map.
///
/// Each data packet is decoded lossily as UTF-8, stripped of trailing
/// newlines, and split at its first `=`; packets without an `=` are ignored.
/// The section ends at a flush packet.
///
/// Returns `Ok(None)` when the input ends before any packet is read.
///
/// # Errors
///
/// Returns a protocol error if the input ends after at least one data packet
/// but before the closing flush; pkt-line errors are propagated.
fn read_meta(input: &mut impl Read) -> Result<Option<HashMap<String, String>>, Error> {
    let mut fields = HashMap::new();

    // The very first read tells a clean shutdown apart from a real section.
    let mut packet = match pktline::read_packet(input)? {
        Some(first) => first,
        None => return Ok(None),
    };

    loop {
        let data = match packet {
            Packet::Flush => return Ok(Some(fields)),
            Packet::Data(data) => data,
        };

        let text = String::from_utf8_lossy(&data);
        if let Some((key, value)) = text.trim_end_matches('\n').split_once('=') {
            fields.insert(key.to_string(), value.to_string());
        }

        // Once data has been seen, running out of input is a truncation.
        packet = match pktline::read_packet(input)? {
            Some(next) => next,
            None => return Err(protocol("EOF in metadata section")),
        };
    }
}

fn protocol(msg: &str) -> Error {
Error::Driver(format!("filter protocol: {msg}"))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::pktline::write_text_packet;

    /// Build a client-side request stream: handshake, capabilities, then one
    /// blob with the given command/path/content.
    fn client_stream(command: &str, pathname: &str, content: &[u8]) -> Vec<u8> {
        let mut w = Vec::new();
        // Handshake section.
        write_text_packet(&mut w, "git-filter-client").unwrap();
        write_text_packet(&mut w, "version=2").unwrap();
        pktline::write_flush(&mut w).unwrap();
        // Capability section.
        write_text_packet(&mut w, "capability=clean").unwrap();
        write_text_packet(&mut w, "capability=smudge").unwrap();
        pktline::write_flush(&mut w).unwrap();
        // One blob request: metadata, then content.
        write_text_packet(&mut w, &format!("command={command}")).unwrap();
        write_text_packet(&mut w, &format!("pathname={pathname}")).unwrap();
        pktline::write_flush(&mut w).unwrap();
        pktline::write_content(&mut w, content).unwrap();
        w
    }

    /// Pull the content of the (single) blob response back out of the server's
    /// reply stream, skipping the handshake/capability/status sections.
    fn response_content(reply: &[u8]) -> Vec<u8> {
        let mut r = reply;
        pktline::read_until_flush(&mut r).unwrap(); // server handshake
        pktline::read_until_flush(&mut r).unwrap(); // server capabilities
        pktline::read_until_flush(&mut r).unwrap(); // status list
        pktline::read_until_flush(&mut r).unwrap().unwrap() // content
    }

    /// `clean` on a `.rs` path returns the canonical form of the source.
    #[test]
    fn clean_canonicalizes_rust() {
        let req = client_stream("clean", "a.rs", b"fn f()->i32{1+2}");
        let mut out = Vec::new();
        converse(&mut &req[..], &mut out).unwrap();
        assert_eq!(response_content(&out), b"fn f() -> i32 {\n 1 + 2\n}\n");
    }

    /// `smudge` hands back exactly the bytes it was given.
    #[test]
    fn smudge_is_identity() {
        let canonical = b"fn f() -> i32 {\n 1 + 2\n}\n";
        let req = client_stream("smudge", "a.rs", canonical);
        let mut out = Vec::new();
        converse(&mut &req[..], &mut out).unwrap();
        assert_eq!(response_content(&out), canonical);
    }

    /// `clean` leaves non-`.rs` content untouched, surrounding whitespace
    /// included.
    #[test]
    fn non_rust_passes_through_clean() {
        let req = client_stream("clean", "notes.txt", b" unchanged ");
        let mut out = Vec::new();
        converse(&mut &req[..], &mut out).unwrap();
        assert_eq!(response_content(&out), b" unchanged ");
    }

    /// Unparseable Rust is reported as `status=error`, with nothing after the
    /// status section.
    #[test]
    fn clean_reports_error_on_unparseable_rust() {
        let req = client_stream("clean", "bad.rs", b"fn main( {");
        let mut out = Vec::new();
        converse(&mut &req[..], &mut out).unwrap();
        // Status section should carry status=error and no content follows.
        let mut r = &out[..];
        pktline::read_until_flush(&mut r).unwrap(); // handshake
        pktline::read_until_flush(&mut r).unwrap(); // capabilities
        let status = pktline::read_until_flush(&mut r).unwrap().unwrap();
        assert_eq!(String::from_utf8_lossy(&status).trim_end(), "status=error");
        assert!(r.is_empty(), "no bytes may follow the error status section");
    }
}
17 changes: 11 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
//!
//! ## Status
//!
//! **Design-stage skeleton.** This crate currently exposes the *shape* of the
//! integration — a subcommand entry point plus placeholder filter/diff/merge
//! logic — so the wiring compiles and can be exercised end to end. The actual
//! parsing, serialization and pretty-printing are not implemented yet; the
//! relevant functions are clearly marked as placeholders. See the `docs/`
//! directory for the design and roadmap.
//! **Working clean/smudge round-trip for a Rust subset.** The `clean` filter
//! parses Rust with Tree-sitter and re-emits canonical source ([`printer`]),
//! driven over Git's real `filter-process` pkt-line protocol ([`pktline`],
//! [`filters`]) — so `git add`/`git checkout` normalize formatting end to end.
//! The printer covers a documented subset and is fail-closed outside it. The
//! diff and merge drivers ([`drivers`]) remain placeholders: making those
//! structural depends on stable node identity, which is out of scope (see
//! `docs/planning/scope.md`).
//!
//! ## Integration points
//!
Expand All @@ -27,6 +29,9 @@
pub mod config;
pub mod drivers;
pub mod filters;
pub mod pktline;
pub mod printer;
pub mod setup;

use std::fmt;

Expand Down
Loading
Loading