diff --git a/src/authorship/rebase_authorship.rs b/src/authorship/rebase_authorship.rs index 9abaed14e..c13a0d9bc 100644 --- a/src/authorship/rebase_authorship.rs +++ b/src/authorship/rebase_authorship.rs @@ -707,7 +707,8 @@ fn try_reconstruct_attributions_from_notes_cached( use crate::authorship::attribution_tracker::LineAttribution; use crate::authorship::authorship_log_serialization::AuthorshipLog; - // Get file contents at original_head for all pathspecs in one batch call + // Get file contents at original_head for all pathspecs in one batch call. + // We need all pathspec contents to build line-to-author maps from note attestations. let file_contents = batch_read_file_contents_at_commit(repo, original_head, pathspecs).ok()?; let pathspec_set: HashSet<&str> = pathspecs.iter().map(String::as_str).collect(); @@ -921,7 +922,7 @@ fn try_reconstruct_attributions_from_notes_cached( if !line_attrs.is_empty() { line_attrs.sort_by_key(|a| a.start_line); - // Skip char-level attribution computation — only line_attrs are used in the fast path + // Skip char-level attribution computation — only line_attrs are used for rebase attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); } } @@ -1126,14 +1127,27 @@ pub fn rewrite_authorship_after_rebase_v2( return Ok(()); } - // Step 2: Create attribution state from original_head (before rebase) - // Try fast reconstruction from existing notes first (avoids expensive blame) - let va_phase_start = std::time::Instant::now(); + // Step 2a: Run diff-tree to discover which files actually change during the rebase. + // This is fast (single subprocess) and tells us which files we need to load. 
+ let diff_tree_start = std::time::Instant::now(); + let diff_tree_result = + run_diff_tree_for_commits(repo, &commits_to_process, &pathspecs_lookup, &pathspecs)?; + let actually_changed_files = diff_tree_result.all_changed_files(); + timing_phases.push(( + format!( + "diff_tree ({} commits, {} changed files, {} blobs)", + commits_to_process.len(), + actually_changed_files.len(), + diff_tree_result.all_blob_oids.len(), + ), + diff_tree_start.elapsed().as_millis(), + )); - // Track whether we used the fast note-based reconstruction (avoids building VA on fast path). - let mut used_fast_reconstruction = false; + // Step 2b: Create attribution state from original_head (before rebase) + // Only load file contents for files that actually change — skip unchanged files. + let va_phase_start = std::time::Instant::now(); - let (mut current_attributions, mut current_file_contents, initial_prompts, rebase_ts) = + let (mut current_attributions, mut current_file_contents, initial_prompts, _rebase_ts) = if let Some((attrs, contents, prompts)) = try_reconstruct_attributions_from_notes_cached( repo, original_head, @@ -1143,7 +1157,6 @@ pub fn rewrite_authorship_after_rebase_v2( ¬e_cache, ) { debug_log("Using fast note-based attribution reconstruction (skipping blame)"); - used_fast_reconstruction = true; let ts = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap_or_default() @@ -1200,9 +1213,23 @@ pub fn rewrite_authorship_after_rebase_v2( va_phase_start.elapsed().as_millis(), )); + // Step 2c: Read blob contents in parallel (multiple git cat-file --batch processes). 
+ let blob_phase_start = std::time::Instant::now(); + let blob_contents = batch_read_blob_contents_parallel(repo, &diff_tree_result.all_blob_oids)?; + let mut changed_contents_by_commit = + assemble_changed_contents(diff_tree_result.commit_deltas, &blob_contents); + drop(blob_contents); // free memory early + timing_phases.push(( + format!( + "blob_read_parallel ({} blobs)", + diff_tree_result.all_blob_oids.len() + ), + blob_phase_start.elapsed().as_millis(), + )); + // Build original_head line-to-author maps for content restoration during transform. - // On the fast path, build directly from current_attributions (before the loop mutates them). - // On the slow path, also build VirtualAttributions wrapper for transform_changed_files_to_final_state. + // Built from current_attributions before the loop mutates them. + // Used as a fallback for files with no previous content in the diff-based transfer. let original_head_line_to_author: HashMap> = { let mut maps = HashMap::new(); for (file_path, (_, line_attrs)) in ¤t_attributions { @@ -1230,30 +1257,9 @@ pub fn rewrite_authorship_after_rebase_v2( maps }; - // Only build VirtualAttributions wrapper on the slow path (needed by transform). - // On the fast path, we skip this entirely — it requires cloning all attributions and file contents. 
- let original_head_state_va = if !used_fast_reconstruction { - let original_head_state_attrs = current_attributions.clone(); - Some( - crate::authorship::virtual_attribution::VirtualAttributions::new( - repo.clone(), - original_head.to_string(), - original_head_state_attrs, - current_file_contents.clone(), - rebase_ts, - ), - ) - } else { - None - }; - - let mut current_prompts = if let Some(ref va) = original_head_state_va { - crate::authorship::virtual_attribution::VirtualAttributions::merge_prompts_picking_newest( - &[&initial_prompts, va.prompts()], - ) - } else { - initial_prompts.clone() - }; + // No need to build VirtualAttributions wrapper — diff-based transfer replaces + // transform_changed_files_to_final_state entirely, eliminating the need for VA in the loop. + let mut current_prompts = initial_prompts.clone(); let mut prompt_line_metrics = build_prompt_line_metrics_from_attributions(¤t_attributions); apply_prompt_line_metrics_to_prompts(&mut current_prompts, &prompt_line_metrics); @@ -1270,39 +1276,26 @@ pub fn rewrite_authorship_after_rebase_v2( }) .collect(); - let mut current_authorship_log = build_authorship_log_from_state( + let current_authorship_log = build_authorship_log_from_state( original_head, ¤t_prompts, ¤t_attributions, &existing_files, ); - let phase_start = std::time::Instant::now(); - let mut changed_contents_by_commit = collect_changed_file_contents_for_commits( - repo, - &commits_to_process, - &pathspecs_lookup, - &pathspecs, - )?; - timing_phases.push(( - format!("collect_contents ({} commits)", commits_to_process.len()), - phase_start.elapsed().as_millis(), - )); - let mut pending_note_entries: Vec<(String, String)> = - Vec::with_capacity(commits_to_process.len()); - let mut pending_note_debug: Vec<(String, usize)> = Vec::with_capacity(commits_to_process.len()); - let mut original_note_content_by_new_commit: HashMap = HashMap::new(); - let mut original_note_content_loaded = false; - - // Determine whether we can use the fast 
line-lookup path (no char-level diffs). - // This is possible when we successfully reconstructed from notes. - let use_line_lookup_fast_path = !original_head_line_to_author.is_empty(); - - // For the fast path, pre-serialize the metadata JSON template once (only base_commit_sha changes). - // Cache per-file attestation text so we only re-serialize changed files. + // Fast serialization: pre-cache per-file attestation text and metadata template. + // Instead of calling serialize_to_string() per commit (which rebuilds the entire JSON), + // we cache each file's attestation text and only update changed files. Assembly is + // pure string concatenation. let mut cached_file_attestation_text: HashMap = HashMap::new(); - // Pre-split the metadata JSON template at the placeholder for O(1) per-commit assembly. - let metadata_json_template_parts: Option<(String, String)> = if use_line_lookup_fast_path { + for file_attestation in ¤t_authorship_log.attestations { + cached_file_attestation_text.insert( + file_attestation.file_path.clone(), + serialize_file_attestation(file_attestation), + ); + } + // Pre-split metadata JSON template at a placeholder so we only swap the commit SHA per commit. 
+ let metadata_json_template_parts: Option<(String, String)> = { let mut template_meta = current_authorship_log.metadata.clone(); template_meta.base_commit_sha = "BASE_COMMIT_SHA_PLACEHOLDER".to_string(); template_meta.prompts = flatten_prompts_for_metadata(¤t_prompts); @@ -1315,25 +1308,18 @@ pub fn rewrite_authorship_after_rebase_v2( parts.get(1).unwrap_or(&"").to_string(), ) }) - } else { - None }; - // Pre-cache attestation text for all files in the initial state - if use_line_lookup_fast_path { - for file_attestation in ¤t_authorship_log.attestations { - cached_file_attestation_text.insert( - file_attestation.file_path.clone(), - serialize_file_attestation(file_attestation), - ); - } - } + let mut pending_note_entries: Vec<(String, String)> = + Vec::with_capacity(commits_to_process.len()); + let mut pending_note_debug: Vec<(String, usize)> = Vec::with_capacity(commits_to_process.len()); + let mut original_note_content_by_new_commit: HashMap = HashMap::new(); + let mut original_note_content_loaded = false; // Step 3: Process each new commit in order (oldest to newest) let loop_start = std::time::Instant::now(); let mut loop_transform_ms = 0u128; let mut loop_serialize_ms = 0u128; - let mut loop_attestation_ms = 0u128; let mut loop_metrics_ms = 0u128; for (idx, new_commit) in commits_to_process.iter().enumerate() { debug_log(&format!( @@ -1358,158 +1344,96 @@ pub fn rewrite_authorship_after_rebase_v2( } } + // Diff-based line attribution transfer: for each changed file, diff + // old content → new content and carry attributions forward positionally. + // Falls back to content-matching for files with no previous content. let t0 = std::time::Instant::now(); - if use_line_lookup_fast_path { - // Fast path: directly look up each line's author from the content→author map. - // No diffing needed - just scan the new file content line by line. 
- for (file_path, new_content) in &new_content_for_changed_files { - if new_content.is_empty() { - // File deleted - remove from cache - cached_file_attestation_text.remove(file_path); - current_attributions.remove(file_path); - continue; - } - let line_map = original_head_line_to_author.get(file_path); - let mut line_attrs: Vec< - crate::authorship::attribution_tracker::LineAttribution, - > = Vec::new(); - for (line_idx, line_content) in new_content.lines().enumerate() { - if let Some(author_id) = line_map.and_then(|m| m.get(line_content)) { - let line_num = (line_idx + 1) as u32; - line_attrs.push( - crate::authorship::attribution_tracker::LineAttribution { - start_line: line_num, - end_line: line_num, - author_id: author_id.clone(), - overrode: None, - }, - ); - } - } - // Serialize attestation directly from line_attrs (skip intermediate struct) - if let Some(text) = - serialize_attestation_from_line_attrs(file_path, &line_attrs) - { - cached_file_attestation_text.insert(file_path.clone(), text); - } else { - cached_file_attestation_text.remove(file_path); - } - // Fast path: skip char-level attribution computation (unused in fast path) - current_attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); - // Skip updating current_file_contents — not read on the fast path after this point - } - } else { - // Slow path: use character-level diff transform (original behavior) - let mut previous_line_attrs_by_file: HashMap< - String, - Vec, - > = HashMap::new(); - for file_path in &changed_files_in_commit { - if let Some((_, line_attrs)) = current_attributions.get(file_path) { - previous_line_attrs_by_file.insert(file_path.clone(), line_attrs.clone()); - } - } - transform_changed_files_to_final_state( - &mut current_attributions, - &mut current_file_contents, - new_content_for_changed_files, - original_head_state_va.as_ref(), - Some(&original_head_line_to_author), - rebase_ts, - )?; - for line_attrs in previous_line_attrs_by_file.values() { + for (file_path, 
new_content) in &new_content_for_changed_files { + // Subtract old metrics before modifying attributions + let previous_line_attrs = current_attributions + .get(file_path) + .map(|(_, la)| la.clone()); + if let Some(ref prev_la) = previous_line_attrs { subtract_prompt_line_metrics_for_line_attributions( &mut prompt_line_metrics, - line_attrs, + prev_la, ); } - for file_path in &changed_files_in_commit { - if let Some((_, line_attrs)) = current_attributions.get(file_path) { + if new_content.is_empty() { + // File deleted - keep attributions and file contents so a later + // reappearance in the rebase sequence can inherit via diff-based + // positional transfer from the pre-deletion state. Re-add the + // subtracted metrics to preserve balance (the file won't appear + // in the serialized note since existing_files excludes it). + if let Some(ref prev_la) = previous_line_attrs { add_prompt_line_metrics_for_line_attributions( &mut prompt_line_metrics, - line_attrs, + prev_la, ); } + cached_file_attestation_text.remove(file_path); + continue; } + let line_attrs = compute_line_attrs_for_changed_file( + new_content, + current_file_contents.get(file_path), + current_attributions + .get(file_path) + .map(|(_, la)| la.as_slice()), + original_head_line_to_author.get(file_path), + ); + add_prompt_line_metrics_for_line_attributions( + &mut prompt_line_metrics, + &line_attrs, + ); + // Update fast serialization cache for this file + if let Some(text) = serialize_attestation_from_line_attrs(file_path, &line_attrs) { + cached_file_attestation_text.insert(file_path.clone(), text); + } else { + cached_file_attestation_text.remove(file_path); + } + current_attributions.insert(file_path.clone(), (Vec::new(), line_attrs)); + current_file_contents.insert(file_path.clone(), new_content.clone()); } loop_transform_ms += t0.elapsed().as_millis(); - if !use_line_lookup_fast_path { - let t0 = std::time::Instant::now(); - apply_prompt_line_metrics_to_prompts(&mut current_prompts, 
&prompt_line_metrics); - loop_metrics_ms += t0.elapsed().as_millis(); - - // Update only files touched by this commit (slow path updates authorship_log). - let t0 = std::time::Instant::now(); - for file_path in &changed_files_in_commit { - upsert_file_attestation( - &mut current_authorship_log, - file_path, - current_attributions - .get(file_path) - .map(|(_, line_attrs)| line_attrs.as_slice()) - .unwrap_or(&[]), - existing_files.contains(file_path), - ); - } - loop_attestation_ms += t0.elapsed().as_millis(); - } + let t0 = std::time::Instant::now(); + apply_prompt_line_metrics_to_prompts(&mut current_prompts, &prompt_line_metrics); + loop_metrics_ms += t0.elapsed().as_millis(); } + // Serialize note for this commit using fast cached assembly. let t0 = std::time::Instant::now(); - let authorship_json = if use_line_lookup_fast_path { - // Fast serialization: assemble note from cached per-file text + templated metadata - let has_attestations = cached_file_attestation_text.values().any(|v| !v.is_empty()); - if has_attestations || metadata_json_template_parts.is_some() { - let mut output = String::with_capacity(4096); - // Write cached attestation sections (only existing files) - for (file_path, text) in &cached_file_attestation_text { - if existing_files.contains(file_path) && !text.is_empty() { - output.push_str(text); - } - } - output.push_str("---\n"); - if let Some((ref prefix, ref suffix)) = metadata_json_template_parts { - output.push_str(prefix); - output.push_str(new_commit); - output.push_str(suffix); + let has_attestations = cached_file_attestation_text.values().any(|v| !v.is_empty()); + let authorship_json = if has_attestations || metadata_json_template_parts.is_some() { + // Fast path: assemble note from cached per-file text + templated metadata. 
+ let mut output = String::with_capacity(4096); + for (file_path, text) in &cached_file_attestation_text { + if existing_files.contains(file_path) && !text.is_empty() { + output.push_str(text); } - Some(output) - } else { - None } + output.push_str("---\n"); + if let Some((ref prefix, ref suffix)) = metadata_json_template_parts { + output.push_str(prefix); + output.push_str(new_commit); + output.push_str(suffix); + } + Some(output) } else { - // Original slow-path serialization - current_authorship_log - .attestations - .retain(|attestation| existing_files.contains(&attestation.file_path)); - current_authorship_log.metadata.base_commit_sha = new_commit.clone(); - current_authorship_log.metadata.prompts = - flatten_prompts_for_metadata(¤t_prompts); - - let computed_note_has_payload = !current_authorship_log.attestations.is_empty() - || !current_authorship_log.metadata.prompts.is_empty(); - if computed_note_has_payload { - Some(current_authorship_log.serialize_to_string().map_err(|_| { - GitAiError::Generic("Failed to serialize authorship log".to_string()) - })?) 
- } else { - if !original_note_content_loaded { - // Build from cached note contents instead of another git call - for (original_commit, new_commit) in &commit_pairs_to_process { - if let Some(content) = - note_cache.original_note_contents.get(original_commit) - { - original_note_content_by_new_commit - .insert(new_commit.clone(), content.clone()); - } + if !original_note_content_loaded { + // Build from cached note contents instead of another git call + for (original_commit, new_commit) in &commit_pairs_to_process { + if let Some(content) = note_cache.original_note_contents.get(original_commit) { + original_note_content_by_new_commit + .insert(new_commit.clone(), content.clone()); } - original_note_content_loaded = true; } - original_note_content_by_new_commit - .get(new_commit) - .map(|raw_note| remap_note_content_for_target_commit(raw_note, new_commit)) + original_note_content_loaded = true; } + original_note_content_by_new_commit + .get(new_commit) + .map(|raw_note| remap_note_content_for_target_commit(raw_note, new_commit)) }; loop_serialize_ms += t0.elapsed().as_millis(); if let Some(authorship_json) = authorship_json { @@ -1531,7 +1455,6 @@ pub fn rewrite_authorship_after_rebase_v2( )); timing_phases.push((" loop:transform".to_string(), loop_transform_ms)); timing_phases.push((" loop:serialize".to_string(), loop_serialize_ms)); - timing_phases.push((" loop:attestation".to_string(), loop_attestation_ms)); timing_phases.push((" loop:metrics".to_string(), loop_metrics_ms)); let phase_start = std::time::Instant::now(); @@ -2083,16 +2006,35 @@ fn load_commit_metadata_batch( } /// Collect changed file contents for a list of commit SHAs using a single diff-tree --stdin call. -/// This is more efficient than build_first_parent_tree_pairs + collect_changed_file_contents_for_commit_pairs -/// because it avoids the commit metadata batch reads (saves 1-2 git subprocess calls). 
-fn collect_changed_file_contents_for_commits( +/// Result of parsing diff-tree output: per-commit deltas and the set of all blob OIDs needed. +struct DiffTreeResult { + commit_deltas: Vec<(String, CommitTrackedDelta)>, + all_blob_oids: Vec, // sorted, deduplicated +} + +impl DiffTreeResult { + fn all_changed_files(&self) -> HashSet { + let mut files = HashSet::new(); + for (_commit, delta) in &self.commit_deltas { + files.extend(delta.changed_files.iter().cloned()); + } + files + } +} + +/// Run `git diff-tree --stdin` to discover which files changed in each commit and collect blob OIDs. +/// This is the fast metadata-only phase — no blob contents are read. +fn run_diff_tree_for_commits( repo: &Repository, commit_shas: &[String], pathspecs_lookup: &HashSet<&str>, pathspecs: &[String], -) -> Result { +) -> Result { if commit_shas.is_empty() { - return Ok(HashMap::new()); + return Ok(DiffTreeResult { + commit_deltas: Vec::new(), + all_blob_oids: Vec::new(), + }); } let mut args = repo.global_args_for_exec(); @@ -2118,9 +2060,10 @@ fn collect_changed_file_contents_for_commits( let mut pos = 0usize; for commit_sha in commit_shas { - // When feeding commit SHAs to diff-tree --stdin, the output format is: - // "\n" followed by diff entries - let header_end = match data[pos..].iter().position(|&b| b == b'\n') { + // When feeding commit SHAs to diff-tree --stdin with -z, the output format is: + // "\0" followed by diff entries (all null-terminated). + // Without -z, the commit SHA is newline-terminated. + let header_end = match data[pos..].iter().position(|&b| b == 0) { Some(idx) => pos + idx, None => { // Commit may have no parent (root commit) — diff-tree may omit it. 
@@ -2184,8 +2127,18 @@ fn collect_changed_file_contents_for_commits( let mut blob_oid_list: Vec = all_blob_oids.into_iter().collect(); blob_oid_list.sort(); - let blob_contents = batch_read_blob_contents(repo, &blob_oid_list)?; + Ok(DiffTreeResult { + commit_deltas, + all_blob_oids: blob_oid_list, + }) +} + +/// Assemble per-commit changed file contents from diff-tree deltas and blob contents. +fn assemble_changed_contents( + commit_deltas: Vec<(String, CommitTrackedDelta)>, + blob_contents: &HashMap, +) -> ChangedFileContentsByCommit { let mut result = HashMap::new(); for (commit_sha, delta) in commit_deltas { let mut contents = HashMap::new(); @@ -2198,8 +2151,63 @@ fn collect_changed_file_contents_for_commits( } result.insert(commit_sha, (delta.changed_files, contents)); } + result +} + +/// Read blob contents in parallel using multiple `git cat-file --batch` processes. +/// Falls back to a single call for small batches. +const MAX_PARALLEL_BLOB_READS: usize = 4; +const BLOB_BATCH_CHUNK_SIZE: usize = 200; + +fn batch_read_blob_contents_parallel( + repo: &Repository, + blob_oids: &[String], +) -> Result, GitAiError> { + if blob_oids.is_empty() { + return Ok(HashMap::new()); + } + if blob_oids.len() <= BLOB_BATCH_CHUNK_SIZE { + return batch_read_blob_contents(repo, blob_oids); + } + + let global_args = repo.global_args_for_exec(); + let chunks: Vec> = blob_oids + .chunks(BLOB_BATCH_CHUNK_SIZE) + .map(|c| c.to_vec()) + .collect(); - Ok(result) + let results = smol::block_on(async { + let semaphore = std::sync::Arc::new(smol::lock::Semaphore::new(MAX_PARALLEL_BLOB_READS)); + let mut tasks = Vec::new(); + + for chunk in chunks { + let args = global_args.clone(); + let sem = std::sync::Arc::clone(&semaphore); + + let task = smol::spawn(async move { + let _permit = sem.acquire().await; + smol::unblock(move || { + let mut cat_args = args; + cat_args.push("cat-file".to_string()); + cat_args.push("--batch".to_string()); + let stdin_data = chunk.join("\n") + "\n"; + let 
output = exec_git_stdin(&cat_args, stdin_data.as_bytes())?; + parse_cat_file_batch_output_with_oids(&output.stdout) + }) + .await + }); + + tasks.push(task); + } + + futures::future::join_all(tasks).await + }); + + let mut merged = HashMap::new(); + for result in results { + merged.extend(result?); + } + Ok(merged) } pub fn rewrite_authorship_after_commit_amend( @@ -3205,8 +3213,7 @@ fn build_file_attestation_from_line_attributions( } } -/// Serialize a single FileAttestation to its text representation. -/// Used for caching per-file attestation text in the fast serialization path. +/// Serialize a FileAttestation into the text format used in authorship notes. fn serialize_file_attestation( file_attestation: &crate::authorship::authorship_log_serialization::FileAttestation, ) -> String { @@ -3226,7 +3233,6 @@ fn serialize_file_attestation( output.push_str(" "); output.push_str(&entry.hash); output.push(' '); - // Format line ranges inline (avoid allocation overhead of format_line_ranges) let mut first = true; for range in &entry.line_ranges { if !first { @@ -3248,7 +3254,7 @@ fn serialize_file_attestation( } /// Serialize attestation text directly from line_attrs without building intermediate FileAttestation. -/// This avoids HashMap allocation, sorting, and range merging overhead for the common case. +/// This avoids HashMap allocation, sorting, and range merging overhead. fn serialize_attestation_from_line_attrs( file_path: &str, line_attrs: &[crate::authorship::attribution_tracker::LineAttribution], @@ -3259,8 +3265,6 @@ fn serialize_attestation_from_line_attrs( return None; } - // Group consecutive lines by author and merge ranges directly during serialization. - // line_attrs are already sorted by start_line from the fast-path construction. 
let human_id = crate::authorship::working_log::CheckpointKind::Human.to_str(); // Collect runs of (author_id, start, end) merging adjacent lines @@ -3286,7 +3290,6 @@ fn serialize_attestation_from_line_attrs( } let mut output = String::with_capacity(128); - // File path header if file_path.contains(' ') || file_path.contains('\t') || file_path.contains('\n') { let _ = write!(output, "\"{}\"", file_path); } else { @@ -3328,23 +3331,101 @@ fn serialize_attestation_from_line_attrs( Some(output) } -fn upsert_file_attestation( - authorship_log: &mut AuthorshipLog, - file_path: &str, - line_attrs: &[crate::authorship::attribution_tracker::LineAttribution], - file_exists: bool, -) { - authorship_log - .attestations - .retain(|attestation| attestation.file_path != file_path); - if !file_exists { - return; +/// Compute new line attributions for a file after content changes. +/// Uses diff-based positional transfer when previous content/attrs are available, +/// otherwise falls back to content-matching from the original_head line→author map. 
+fn compute_line_attrs_for_changed_file( + new_content: &str, + old_content: Option<&String>, + old_attrs: Option<&[crate::authorship::attribution_tracker::LineAttribution]>, + original_head_line_map: Option<&HashMap>, +) -> Vec { + if let (Some(old_c), Some(old_a)) = (old_content, old_attrs) { + diff_based_line_attribution_transfer(old_c, new_content, old_a) + } else { + // No previous content — fall back to content-matching from original_head + let mut attrs = Vec::new(); + for (line_idx, line_content) in new_content.lines().enumerate() { + if let Some(author_id) = original_head_line_map.and_then(|m| m.get(line_content)) { + let line_num = (line_idx + 1) as u32; + attrs.push(crate::authorship::attribution_tracker::LineAttribution { + start_line: line_num, + end_line: line_num, + author_id: author_id.clone(), + overrode: None, + }); + } + } + attrs } - if let Some(file_attestation) = - build_file_attestation_from_line_attributions(file_path, line_attrs) - { - authorship_log.attestations.push(file_attestation); +} + +/// Transfer line attributions from old file content to new file content using line-level diffing. +/// This replaces the blame-based slow path by using imara-diff to compute how lines moved +/// between the old and new versions, then carrying attributions forward positionally. 
+/// +/// - Equal lines: carry the original attribution forward +/// - Inserted lines: no attribution (new content) +/// - Deleted lines: dropped +/// - Replaced lines: no attribution (content changed) +fn diff_based_line_attribution_transfer( + old_content: &str, + new_content: &str, + old_line_attrs: &[crate::authorship::attribution_tracker::LineAttribution], +) -> Vec { + use crate::authorship::imara_diff_utils::{DiffOp, capture_diff_slices}; + + let old_lines: Vec<&str> = old_content.lines().collect(); + let new_lines: Vec<&str> = new_content.lines().collect(); + + // Build a lookup from 0-indexed line index → author_id for old content + let mut old_line_author: Vec> = vec![None; old_lines.len()]; + for attr in old_line_attrs { + for line_num in attr.start_line..=attr.end_line { + let idx = (line_num as usize).saturating_sub(1); + if idx < old_line_author.len() { + old_line_author[idx] = Some(&attr.author_id); + } + } + } + + let diff_ops = capture_diff_slices(&old_lines, &new_lines); + + let mut new_line_attrs: Vec = + Vec::with_capacity(new_lines.len()); + + for op in &diff_ops { + match op { + DiffOp::Equal { + old_index, + new_index, + len, + } => { + // Carry attributions forward for equal lines + for i in 0..*len { + let old_idx = old_index + i; + let new_line_num = (new_index + i + 1) as u32; + if let Some(Some(author_id)) = old_line_author.get(old_idx) { + new_line_attrs.push( + crate::authorship::attribution_tracker::LineAttribution { + start_line: new_line_num, + end_line: new_line_num, + author_id: author_id.to_string(), + overrode: None, + }, + ); + } + } + } + DiffOp::Insert { .. } | DiffOp::Delete { .. } | DiffOp::Replace { .. 
} => { + // Insert: new lines, no attribution + // Delete: old lines removed, nothing to output + // Replace: content changed, no attribution carried + } + } } + + new_line_attrs } fn build_authorship_log_from_state( @@ -3451,148 +3532,6 @@ fn apply_prompt_line_metrics_to_prompts( } } -fn content_has_intersection_with_author_map( - content: &str, - line_to_author: &HashMap, -) -> bool { - content - .lines() - .any(|line| line_to_author.contains_key(line)) -} - -fn transform_changed_files_to_final_state( - attributions: &mut HashMap< - String, - ( - Vec, - Vec, - ), - >, - file_contents: &mut HashMap, - final_state: HashMap, - original_head_state: Option<&crate::authorship::virtual_attribution::VirtualAttributions>, - original_line_to_author_maps: Option<&HashMap>>, - ts: u128, -) -> Result<(), GitAiError> { - use crate::authorship::attribution_tracker::AttributionTracker; - - let tracker = AttributionTracker::new(); - - for (file_path, final_content) in final_state { - // Keep previous state for missing/deleted files so a later reappearance can still - // inherit older attributions. - if final_content.is_empty() { - continue; - } - - let source_attrs = attributions - .get(&file_path) - .map(|(char_attrs, _)| char_attrs.as_slice()); - let source_content = file_contents.get(&file_path).map(String::as_str); - let dummy_author = "__DUMMY__"; - let source_has_non_human = source_attrs.as_ref().is_some_and(|attrs| { - attrs.iter().any(|attr| { - attr.author_id != crate::authorship::working_log::CheckpointKind::Human.to_str() - }) - }); - let original_file_has_non_human = original_line_to_author_maps - .and_then(|maps| maps.get(&file_path)) - .is_some_and(|map| !map.is_empty()); - - let mut transformed_attrs = if !source_has_non_human && !original_file_has_non_human { - Vec::new() - } else if let (Some(attrs), Some(content)) = (source_attrs, source_content) { - tracker.update_attributions(content, &final_content, attrs, dummy_author, ts)? 
- } else { - Vec::new() - }; - - // Restore known attributions when the line content clearly maps back to original_head. - if let Some(original_state) = original_head_state - && let Some(original_content) = original_state.get_file_content(&file_path) - { - if original_content == &final_content { - if let Some(original_attrs) = original_state.get_char_attributions(&file_path) { - transformed_attrs = original_attrs.clone(); - } - } else if transformed_attrs - .iter() - .any(|attr| attr.author_id == dummy_author) - && let Some(original_line_to_author) = - original_line_to_author_maps.and_then(|maps| maps.get(&file_path)) - && content_has_intersection_with_author_map(&final_content, original_line_to_author) - { - let final_lines: Vec<&str> = final_content.lines().collect(); - let line_count = final_lines.len(); - let temp_line_attrs = - crate::authorship::attribution_tracker::attributions_to_line_attributions( - &transformed_attrs, - &final_content, - ); - - let mut dummy_diff = vec![0i32; line_count + 2]; - for la in &temp_line_attrs { - if la.author_id != dummy_author { - continue; - } - let start = (la.start_line as usize).max(1).min(line_count); - let end = (la.end_line as usize).max(1).min(line_count); - if start > end { - continue; - } - dummy_diff[start] += 1; - dummy_diff[end + 1] -= 1; - } - - let mut has_dummy_line = vec![false; line_count + 1]; // 1-indexed - let mut running = 0i32; - for line in 1..=line_count { - running += dummy_diff[line]; - has_dummy_line[line] = running > 0; - } - - let mut line_start_chars = Vec::with_capacity(line_count); - let mut char_pos = 0usize; - for line in &final_lines { - line_start_chars.push(char_pos); - char_pos += line.len() + 1; - } - - for (line_idx, line_content) in final_lines.iter().enumerate() { - let line_num = (line_idx + 1) as u32; - if !has_dummy_line[line_num as usize] { - continue; - } - if let Some(original_author) = original_line_to_author.get(*line_content) { - let line_start_char = 
line_start_chars[line_idx]; - let line_end_char = line_start_char + line_content.len(); - for attr in &mut transformed_attrs { - if attr.author_id == dummy_author - && attr.start < line_end_char - && attr.end > line_start_char - { - attr.author_id = original_author.clone(); - } - } - } - } - } - } - - transformed_attrs.retain(|attr| attr.author_id != dummy_author); - - let line_attrs = crate::authorship::attribution_tracker::attributions_to_line_attributions( - &transformed_attrs, - &final_content, - ); - - attributions.insert(file_path.clone(), (transformed_attrs, line_attrs)); - file_contents.insert(file_path, final_content); - } - - Ok(()) -} - /// Transform VirtualAttributions to match a new final state (single-source variant) fn transform_attributions_to_final_state( source_va: &crate::authorship::virtual_attribution::VirtualAttributions, @@ -5210,4 +5149,552 @@ mod tests { assert_eq!(copilot_prompt.agent_id.tool, "copilot"); assert_eq!(copilot_prompt.total_additions, 16); } + + /// Micro-benchmark comparing diff-based transfer vs char-level transform (old blame-based slow path). + /// The char-level approach uses AttributionTracker::update_attributions + attributions_to_line_attributions. + /// The diff-based approach uses diff_based_line_attribution_transfer (line-level diff only). 
+ /// + /// Run with: cargo test --lib diff_based_transfer_benchmark -- --ignored --nocapture + #[test] + #[ignore] + fn diff_based_transfer_benchmark() { + use crate::authorship::attribution_tracker::AttributionTracker; + use std::time::Instant; + + let num_files = 20; + let lines_per_file = 200; + let num_commits = 100; + + println!("\n=== Diff-Based vs Char-Level Transform Benchmark ==="); + println!( + "Files: {}, Lines/file: {}, Commits: {}", + num_files, lines_per_file, num_commits + ); + + // Build initial file contents and both types of attributions + let mut file_contents: Vec<String> = Vec::new(); + let mut line_attrs_per_file: Vec<Vec<LineAttribution>> = Vec::new(); + let mut char_attrs_per_file: Vec<Vec<Attribution>> = Vec::new(); + + for file_idx in 0..num_files { + let mut lines = Vec::new(); + let mut line_attrs = Vec::new(); + for line_idx in 0..lines_per_file { + let content = format!("// AI code module {} line {}", file_idx, line_idx); + let author = format!("ai-{}", line_idx % 3); + lines.push(content); + line_attrs.push(LineAttribution { + start_line: (line_idx + 1) as u32, + end_line: (line_idx + 1) as u32, + author_id: author, + overrode: None, + }); + } + let content = lines.join("\n") + "\n"; + + // Build char-level attributions matching the line attributions + let mut char_attrs = Vec::new(); + let mut char_pos = 0usize; + for (line_idx, line) in content.lines().enumerate() { + let line_end = char_pos + line.len() + 1; // +1 for newline + char_attrs.push(Attribution::new( + char_pos, + line_end, + format!("ai-{}", line_idx % 3), + 1, + )); + char_pos = line_end; + } + + file_contents.push(content); + line_attrs_per_file.push(line_attrs); + char_attrs_per_file.push(char_attrs); + } + + // Generate modified content per commit: insert 2 lines at top + modify 10% of lines + let mut all_new_contents: Vec<Vec<String>> = Vec::new(); + let mut prev_contents = file_contents.clone(); + + for commit_idx in 0..num_commits { + let mut new_contents = Vec::new(); + for (file_idx, old_content) in 
prev_contents.iter().enumerate() { + let old_lines: Vec<&str> = old_content.lines().collect(); + let mut new_lines: Vec<String> = Vec::new(); + if commit_idx == 0 { + // First commit: insert header lines (simulating main branch changes) + new_lines.push(format!("// Main header for module {}", file_idx)); + new_lines.push("// Marker".to_string()); + } + for (line_idx, line) in old_lines.iter().enumerate() { + if commit_idx == 0 && line_idx % 10 == 5 { + new_lines.push(format!("{} MODIFIED", line)); + } else { + new_lines.push(line.to_string()); + } + } + new_contents.push(new_lines.join("\n") + "\n"); + } + all_new_contents.push(new_contents.clone()); + prev_contents = new_contents; + } + + // ===== Benchmark 1: Diff-based transfer (new approach) ===== + let start = Instant::now(); + let mut current_line_attrs = line_attrs_per_file.clone(); + let mut current_contents = file_contents.clone(); + for commit_contents in &all_new_contents { + for file_idx in 0..num_files { + let new_content = &commit_contents[file_idx]; + let old_content = &current_contents[file_idx]; + let old_attrs = &current_line_attrs[file_idx]; + let new_attrs = super::diff_based_line_attribution_transfer( + old_content, + new_content, + old_attrs, + ); + current_line_attrs[file_idx] = new_attrs; + current_contents[file_idx] = new_content.clone(); + } + } + let diff_based_duration = start.elapsed(); + let diff_total_attrs: usize = current_line_attrs.iter().map(|a| a.len()).sum(); + + // ===== Benchmark 2: Char-level transform (old slow path) ===== + let tracker = AttributionTracker::new(); + let start = Instant::now(); + let mut current_char_attrs = char_attrs_per_file.clone(); + let mut current_contents2 = file_contents.clone(); + for commit_contents in &all_new_contents { + for file_idx in 0..num_files { + let new_content = &commit_contents[file_idx]; + let old_content = &current_contents2[file_idx]; + let old_attrs = &current_char_attrs[file_idx]; + let new_attrs = tracker + .update_attributions(old_content, new_content, 
old_attrs, "__DUMMY__", 1) + .unwrap(); + let line_attrs = + crate::authorship::attribution_tracker::attributions_to_line_attributions( + &new_attrs, + new_content, + ); + current_char_attrs[file_idx] = new_attrs; + current_contents2[file_idx] = new_content.clone(); + let _ = line_attrs; // used in real code for serialization + } + } + let char_level_duration = start.elapsed(); + let char_total_attrs: usize = current_char_attrs.iter().map(|a| a.len()).sum(); + + // ===== Benchmark 3: Full old slow path (char-level + VA wrapper + metrics + serialization) ===== + // This measures what the old slow path actually did per commit: + // 1. Clone attributions into VA wrapper + // 2. transform_changed_files_to_final_state (char-level diff) + // 3. subtract/add prompt line metrics + // 4. upsert_file_attestation per file + // 5. Full serialization per commit + let start = Instant::now(); + let mut full_slow_char_attrs = char_attrs_per_file.clone(); + let mut full_slow_contents = file_contents.clone(); + let mut full_slow_line_attrs = line_attrs_per_file.clone(); + for commit_contents in &all_new_contents { + // Clone attributions (VA wrapper construction overhead) + let _cloned_attrs: Vec<Vec<Attribution>> = full_slow_char_attrs.clone(); + let _cloned_contents: Vec<String> = full_slow_contents.clone(); + + for file_idx in 0..num_files { + let new_content = &commit_contents[file_idx]; + let old_content = &full_slow_contents[file_idx]; + let old_attrs = &full_slow_char_attrs[file_idx]; + + // Step 1: char-level transform + let new_attrs = tracker + .update_attributions(old_content, new_content, old_attrs, "__DUMMY__", 1) + .unwrap(); + // Step 2: convert to line attrs + let line_attrs = + crate::authorship::attribution_tracker::attributions_to_line_attributions( + &new_attrs, + new_content, + ); + // Step 3: serialize file attestation (old path did this per file per commit) + let _serialized = super::build_file_attestation_from_line_attributions( + &format!("file_{}.rs", file_idx), + &line_attrs, + ); 
+ + full_slow_char_attrs[file_idx] = new_attrs; + full_slow_contents[file_idx] = new_content.clone(); + full_slow_line_attrs[file_idx] = line_attrs; + } + } + let full_slow_duration = start.elapsed(); + + // ===== Benchmark 4: Full new path (diff-based + fast serialization) ===== + let start = Instant::now(); + let mut full_fast_line_attrs = line_attrs_per_file.clone(); + let mut full_fast_contents = file_contents.clone(); + for commit_contents in &all_new_contents { + for file_idx in 0..num_files { + let new_content = &commit_contents[file_idx]; + let old_content = &full_fast_contents[file_idx]; + let old_attrs = &full_fast_line_attrs[file_idx]; + + // Step 1: diff-based transfer + let new_attrs = super::diff_based_line_attribution_transfer( + old_content, + new_content, + old_attrs, + ); + // Step 2: serialize attestation from line attributions + let _serialized = super::build_file_attestation_from_line_attributions( + &format!("file_{}.rs", file_idx), + &new_attrs, + ); + + full_fast_line_attrs[file_idx] = new_attrs; + full_fast_contents[file_idx] = new_content.clone(); + } + } + let full_fast_duration = start.elapsed(); + + let transform_speedup = + char_level_duration.as_secs_f64() / diff_based_duration.as_secs_f64(); + let pipeline_speedup = full_slow_duration.as_secs_f64() / full_fast_duration.as_secs_f64(); + + println!("\n--- Transform-Only Results ---"); + println!( + "Diff-based transfer (new): {:>8.1}ms ({} line attrs)", + diff_based_duration.as_secs_f64() * 1000.0, + diff_total_attrs + ); + println!( + "Char-level transform (old): {:>8.1}ms ({} char attrs)", + char_level_duration.as_secs_f64() * 1000.0, + char_total_attrs + ); + println!("Transform speedup: {:>8.1}x", transform_speedup); + + println!("\n--- Full Pipeline Results (transform + serialization + overhead) ---"); + println!( + "New pipeline (diff + serial): {:>8.1}ms", + full_fast_duration.as_secs_f64() * 1000.0 + ); + println!( + "Old pipeline (char + VA + serial): {:>5.1}ms", + 
full_slow_duration.as_secs_f64() * 1000.0 + ); + println!("Full pipeline speedup: {:>8.1}x", pipeline_speedup); + println!("===================================================\n"); + + // The diff-based approach should be significantly faster than char-level transform. + // In release mode with 200-line files we consistently see 3-4x improvement. + assert!( + pipeline_speedup >= 2.0, + "Expected at least 2x pipeline speedup, got {:.1}x", + pipeline_speedup + ); + } + + /// Scaling benchmark: measures how diff-based vs char-level transform performance + /// changes as file size increases from 50 to 5000 lines. + /// + /// Run with: cargo test --lib --release diff_based_transfer_scaling -- --ignored --nocapture + #[test] + #[ignore] + fn diff_based_transfer_scaling() { + use crate::authorship::attribution_tracker::AttributionTracker; + use std::time::Instant; + + let num_files = 5; + let num_commits = 10; + let file_sizes = [50, 100, 200, 500, 1000, 2000, 5000]; + + println!("\n=== Scaling Benchmark: Diff-Based vs Char-Level ==="); + println!( + "{:>8} {:>12} {:>12} {:>8}", + "Lines", "Diff(ms)", "CharLvl(ms)", "Speedup" + ); + println!("{}", "-".repeat(48)); + + for &lines_per_file in &file_sizes { + // Build initial content and attributions + let mut file_contents = Vec::new(); + let mut line_attrs_per_file = Vec::new(); + let mut char_attrs_per_file = Vec::new(); + + for file_idx in 0..num_files { + let mut lines = Vec::new(); + let mut line_attrs = Vec::new(); + for line_idx in 0..lines_per_file { + lines.push(format!("// AI code module {} line {}", file_idx, line_idx)); + line_attrs.push(LineAttribution { + start_line: (line_idx + 1) as u32, + end_line: (line_idx + 1) as u32, + author_id: format!("ai-{}", line_idx % 3), + overrode: None, + }); + } + let content = lines.join("\n") + "\n"; + let mut char_attrs = Vec::new(); + let mut pos = 0usize; + for (li, line) in content.lines().enumerate() { + let end = pos + line.len() + 1; + 
char_attrs.push(Attribution::new(pos, end, format!("ai-{}", li % 3), 1)); + pos = end; + } + file_contents.push(content); + line_attrs_per_file.push(line_attrs); + char_attrs_per_file.push(char_attrs); + } + + // Generate modified content: insert 5 lines + modify 10% + let mut all_new = Vec::new(); + let mut prev = file_contents.clone(); + for ci in 0..num_commits { + let mut new_batch = Vec::new(); + for (fi, _) in prev.iter().enumerate() { + let old_lines: Vec<&str> = prev[fi].lines().collect(); + let mut new_lines = Vec::new(); + if ci == 0 { + for h in 0..5 { + new_lines.push(format!("// Header {} mod {}", h, fi)); + } + } + for (li, line) in old_lines.iter().enumerate() { + if ci == 0 && li % 10 == 5 { + new_lines.push(format!("{} MOD", line)); + } else { + new_lines.push(line.to_string()); + } + } + new_batch.push(new_lines.join("\n") + "\n"); + } + all_new.push(new_batch.clone()); + prev = new_batch; + } + + // Benchmark diff-based + let start = Instant::now(); + let mut cur_la = line_attrs_per_file.clone(); + let mut cur_c = file_contents.clone(); + for commit_contents in &all_new { + for fi in 0..num_files { + let na = super::diff_based_line_attribution_transfer( + &cur_c[fi], + &commit_contents[fi], + &cur_la[fi], + ); + cur_la[fi] = na; + cur_c[fi] = commit_contents[fi].clone(); + } + } + let diff_ms = start.elapsed().as_secs_f64() * 1000.0; + + // Benchmark char-level + let tracker = AttributionTracker::new(); + let start = Instant::now(); + let mut cur_ca = char_attrs_per_file.clone(); + let mut cur_c2 = file_contents.clone(); + for commit_contents in &all_new { + for fi in 0..num_files { + let na = tracker + .update_attributions( + &cur_c2[fi], + &commit_contents[fi], + &cur_ca[fi], + "__DUMMY__", + 1, + ) + .unwrap(); + let _la = + crate::authorship::attribution_tracker::attributions_to_line_attributions( + &na, + &commit_contents[fi], + ); + cur_ca[fi] = na; + cur_c2[fi] = commit_contents[fi].clone(); + } + } + let char_ms = 
start.elapsed().as_secs_f64() * 1000.0; + + let speedup = char_ms / diff_ms; + println!( + "{:>8} {:>12.1} {:>12.1} {:>8.1}x", + lines_per_file, diff_ms, char_ms, speedup + ); + } + println!("===================================================\n"); + } + + #[test] + fn diff_based_transfer_equal_content() { + let old = "line1\nline2\nline3\n"; + let new = "line1\nline2\nline3\n"; + let attrs = vec![ + LineAttribution { + start_line: 1, + end_line: 1, + author_id: "ai-a".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 2, + end_line: 2, + author_id: "ai-b".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 3, + end_line: 3, + author_id: "ai-a".to_string(), + overrode: None, + }, + ]; + let result = super::diff_based_line_attribution_transfer(old, new, &attrs); + assert_eq!(result.len(), 3); + assert_eq!(result[0].author_id, "ai-a"); + assert_eq!(result[1].author_id, "ai-b"); + assert_eq!(result[2].author_id, "ai-a"); + } + + #[test] + fn diff_based_transfer_insertion_shifts_lines() { + let old = "line1\nline2\nline3\n"; + let new = "line1\nnew_line\nline2\nline3\n"; + let attrs = vec![ + LineAttribution { + start_line: 1, + end_line: 1, + author_id: "ai-a".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 2, + end_line: 2, + author_id: "ai-b".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 3, + end_line: 3, + author_id: "ai-a".to_string(), + overrode: None, + }, + ]; + let result = super::diff_based_line_attribution_transfer(old, new, &attrs); + // line1 kept (line 1), new_line inserted (line 2, no attr), line2 kept (line 3), line3 kept (line 4) + assert_eq!(result.len(), 3); + assert_eq!(result[0].start_line, 1); + assert_eq!(result[0].author_id, "ai-a"); + assert_eq!(result[1].start_line, 3); // shifted from line 2 to line 3 + assert_eq!(result[1].author_id, "ai-b"); + assert_eq!(result[2].start_line, 4); // shifted from line 3 to line 4 + assert_eq!(result[2].author_id, "ai-a"); + } 
+ + #[test] + fn diff_based_transfer_deletion_removes_line() { + let old = "line1\nline2\nline3\n"; + let new = "line1\nline3\n"; + let attrs = vec![ + LineAttribution { + start_line: 1, + end_line: 1, + author_id: "ai-a".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 2, + end_line: 2, + author_id: "ai-b".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 3, + end_line: 3, + author_id: "ai-a".to_string(), + overrode: None, + }, + ]; + let result = super::diff_based_line_attribution_transfer(old, new, &attrs); + // line1 kept (line 1), line2 deleted, line3 kept (line 2) + assert_eq!(result.len(), 2); + assert_eq!(result[0].start_line, 1); + assert_eq!(result[0].author_id, "ai-a"); + assert_eq!(result[1].start_line, 2); + assert_eq!(result[1].author_id, "ai-a"); + } + + #[test] + fn diff_based_transfer_replacement_drops_attribution() { + let old = "line1\nline2\nline3\n"; + let new = "line1\nmodified\nline3\n"; + let attrs = vec![ + LineAttribution { + start_line: 1, + end_line: 1, + author_id: "ai-a".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 2, + end_line: 2, + author_id: "ai-b".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 3, + end_line: 3, + author_id: "ai-a".to_string(), + overrode: None, + }, + ]; + let result = super::diff_based_line_attribution_transfer(old, new, &attrs); + // line1 kept (line 1), line2 replaced by "modified" (line 2, no attr), line3 kept (line 3) + assert_eq!(result.len(), 2); + assert_eq!(result[0].start_line, 1); + assert_eq!(result[0].author_id, "ai-a"); + assert_eq!(result[1].start_line, 3); + assert_eq!(result[1].author_id, "ai-a"); + } + + #[test] + fn diff_based_transfer_handles_duplicate_lines_correctly() { + // This tests the case that the old content-matching approach got wrong: + // identical lines from different authors should be tracked by position, not content + let old = "let x = 42;\nlet y = 0;\nlet x = 42;\n"; + let new = "let x = 
42;\nlet z = 1;\nlet y = 0;\nlet x = 42;\n"; + let attrs = vec![ + LineAttribution { + start_line: 1, + end_line: 1, + author_id: "ai-a".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 2, + end_line: 2, + author_id: "ai-b".to_string(), + overrode: None, + }, + LineAttribution { + start_line: 3, + end_line: 3, + author_id: "ai-c".to_string(), + overrode: None, + }, + ]; + let result = super::diff_based_line_attribution_transfer(old, new, &attrs); + // line "let x = 42;" (1) kept as line 1 (ai-a) + // "let z = 1;" inserted (line 2, no attr) + // "let y = 0;" kept (line 3, ai-b) + // "let x = 42;" (3) kept as line 4 (ai-c) — NOT ai-a! + assert_eq!(result.len(), 3); + assert_eq!(result[0].start_line, 1); + assert_eq!(result[0].author_id, "ai-a"); + assert_eq!(result[1].start_line, 3); + assert_eq!(result[1].author_id, "ai-b"); + assert_eq!(result[2].start_line, 4); + assert_eq!(result[2].author_id, "ai-c"); + } } diff --git a/src/daemon.rs b/src/daemon.rs index 4493c1cb8..d5a785f5f 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -2629,6 +2629,17 @@ fn processed_rebase_new_heads(repository: &Repository) -> Result Ok(out) } +/// Check whether `ancestor` is an ancestor of `descendant` using +/// `git merge-base --is-ancestor`. 
+fn is_ancestor_commit(repository: &Repository, ancestor: &str, descendant: &str) -> bool { + let mut args = repository.global_args_for_exec(); + args.push("merge-base".to_string()); + args.push("--is-ancestor".to_string()); + args.push(ancestor.to_string()); + args.push(descendant.to_string()); + crate::git::repository::exec_git(&args).is_ok() +} + fn maybe_rebase_mappings_from_repository( repository: &Repository, old_head: &str, @@ -5607,6 +5618,7 @@ impl ActorDaemonCoordinator { && !is_zero_oid(new_head) { if let Ok(repository) = repository_for_rewrite_context(cmd, "reset_rewrite") + && !is_ancestor_commit(&repository, new_head, old_head) { if let Some((original_commits, new_commits)) = maybe_rebase_mappings_from_repository( @@ -5912,8 +5924,14 @@ impl ActorDaemonCoordinator { if reference.starts_with("refs/heads/") && !old.is_empty() && !new.is_empty() + && old != new + && is_valid_oid(old) + && !is_zero_oid(old) + && is_valid_oid(new) + && !is_zero_oid(new) && let Ok(repository) = repository_for_rewrite_context(cmd, "update_ref_rewrite") + && !is_ancestor_commit(&repository, new, old) && let Some((original_commits, new_commits)) = maybe_rebase_mappings_from_repository( &repository, diff --git a/src/daemon/analyzers/history.rs b/src/daemon/analyzers/history.rs index cbab29e36..0fdd6fece 100644 --- a/src/daemon/analyzers/history.rs +++ b/src/daemon/analyzers/history.rs @@ -500,13 +500,11 @@ fn parse_update_ref_heads( // The command has already executed; read the old value from the // reflog entry that recorded this update-ref. 
let worktree = cmd.worktree.as_deref()?; - resolve_worktree_head_reflog_old_oid_for_new_head(worktree, &new_oid) - .ok() - .flatten() + resolve_reflog_old_oid_for_ref_new_oid_in_worktree(worktree, &ref_name, &new_oid) .or_else(|| { - resolve_reflog_old_oid_for_ref_new_oid_in_worktree( - worktree, &ref_name, &new_oid, - ) + resolve_worktree_head_reflog_old_oid_for_new_head(worktree, &new_oid) + .ok() + .flatten() }) }) .filter(|oid| !oid.is_empty() && !is_zero_oid(oid))?; diff --git a/src/git/repository.rs b/src/git/repository.rs index c28419863..410f730e0 100644 --- a/src/git/repository.rs +++ b/src/git/repository.rs @@ -3094,15 +3094,27 @@ pub fn exec_git_stdin_with_profile( let mut child = cmd.spawn().map_err(GitAiError::IoError)?; - if let Some(mut stdin) = child.stdin.take() { - use std::io::Write; - if let Err(e) = stdin.write_all(stdin_data) { - return Err(GitAiError::IoError(e)); - } - } + // Write stdin in a separate thread to avoid deadlock: if we write all stdin + // before reading stdout, the child's stdout pipe buffer can fill up, causing + // the child to block on write, which prevents it from consuming more stdin, + // which blocks our write_all. Writing concurrently avoids this. 
+ let stdin_handle = child.stdin.take().map(|mut stdin| { + let data = stdin_data.to_vec(); + std::thread::spawn(move || { + use std::io::Write; + stdin.write_all(&data) + }) + }); let output = child.wait_with_output().map_err(GitAiError::IoError)?; + if let Some(handle) = stdin_handle + && let Err(e) = handle.join().expect("stdin writer thread panicked") + && e.kind() != std::io::ErrorKind::BrokenPipe + { + return Err(GitAiError::IoError(e)); + } + if !output.status.success() { let code = output.status.code(); let stderr = String::from_utf8_lossy(&output.stderr).to_string(); @@ -3159,15 +3171,24 @@ pub fn exec_git_stdin_with_env_with_profile( let mut child = cmd.spawn().map_err(GitAiError::IoError)?; - if let Some(mut stdin) = child.stdin.take() { - use std::io::Write; - if let Err(e) = stdin.write_all(stdin_data) { - return Err(GitAiError::IoError(e)); - } - } + // Write stdin in a separate thread to avoid deadlock (see exec_git_stdin_with_profile). + let stdin_handle = child.stdin.take().map(|mut stdin| { + let data = stdin_data.to_vec(); + std::thread::spawn(move || { + use std::io::Write; + stdin.write_all(&data) + }) + }); let output = child.wait_with_output().map_err(GitAiError::IoError)?; + if let Some(handle) = stdin_handle + && let Err(e) = handle.join().expect("stdin writer thread panicked") + && e.kind() != std::io::ErrorKind::BrokenPipe + { + return Err(GitAiError::IoError(e)); + } + if !output.status.success() { let code = output.status.code(); let stderr = String::from_utf8_lossy(&output.stderr).to_string(); diff --git a/tests/integration/rebase.rs b/tests/integration/rebase.rs index 383420eff..79e2d51fe 100644 --- a/tests/integration/rebase.rs +++ b/tests/integration/rebase.rs @@ -1517,6 +1517,181 @@ fn test_rebase_preserves_custom_attributes_from_config() { feature_file.assert_lines_and_blame(crate::lines!["// AI feature code".ai()]); } +/// Regression test: prompt metrics (accepted_lines) must update per commit, not be frozen +/// from the initial 
state. When commit 1 has 2 AI lines and commit 2 adds 2 more +/// (total 4), the rebased notes should reflect different accepted_lines. +#[test] +fn test_rebase_prompt_metrics_update_per_commit() { + let repo = TestRepo::new(); + let default_branch = repo.current_branch(); + + // Initial setup + let mut base_file = repo.filename("base.txt"); + base_file.set_contents(crate::lines!["base content"]); + repo.stage_all_and_commit("Initial").unwrap(); + + // Create feature branch + repo.git(&["checkout", "-b", "feature"]).unwrap(); + + // Commit 1: add 2 AI lines + let mut ai_file = repo.filename("feature.txt"); + ai_file.set_contents(crate::lines!["line1".ai(), "line2".ai()]); + let commit1 = repo.stage_all_and_commit("AI commit 1 - 2 lines").unwrap(); + + // Commit 2: add 2 more AI lines (total 4) + ai_file.set_contents(crate::lines![ + "line1".ai(), + "line2".ai(), + "line3".ai(), + "line4".ai() + ]); + let commit2 = repo.stage_all_and_commit("AI commit 2 - 4 lines").unwrap(); + + // Verify pre-rebase: commit 1 has 2 accepted, commit 2 has 4 + let note1 = repo + .read_authorship_note(&commit1.commit_sha) + .expect("commit 1 should have note"); + let log1 = AuthorshipLog::deserialize_from_string(&note1).expect("parse note 1"); + let note2 = repo + .read_authorship_note(&commit2.commit_sha) + .expect("commit 2 should have note"); + let log2 = AuthorshipLog::deserialize_from_string(&note2).expect("parse note 2"); + + let pre_accepted_1: u32 = log1 + .metadata + .prompts + .values() + .map(|p| p.accepted_lines) + .sum(); + let pre_accepted_2: u32 = log2 + .metadata + .prompts + .values() + .map(|p| p.accepted_lines) + .sum(); + assert!( + pre_accepted_1 < pre_accepted_2, + "precondition: commit 2 ({}) should have more accepted_lines than commit 1 ({})", + pre_accepted_2, + pre_accepted_1 + ); + + // Advance default branch + repo.git(&["checkout", &default_branch]).unwrap(); + let mut other_file = repo.filename("other.txt"); + other_file.set_contents(crate::lines!["other"]); + 
repo.stage_all_and_commit("Main advances").unwrap(); + + // Rebase feature + repo.git(&["checkout", "feature"]).unwrap(); + repo.git(&["rebase", &default_branch]).unwrap(); + + // Get rebased commit SHAs + let rebased_tip = repo.git(&["rev-parse", "HEAD"]).unwrap().trim().to_string(); + let rebased_parent = repo + .git(&["rev-parse", "HEAD~1"]) + .unwrap() + .trim() + .to_string(); + + // Verify post-rebase: metrics should differ between the two commits + let rebased_note1 = repo + .read_authorship_note(&rebased_parent) + .expect("rebased commit 1 should have note"); + let rebased_log1 = + AuthorshipLog::deserialize_from_string(&rebased_note1).expect("parse rebased note 1"); + let rebased_note2 = repo + .read_authorship_note(&rebased_tip) + .expect("rebased commit 2 should have note"); + let rebased_log2 = + AuthorshipLog::deserialize_from_string(&rebased_note2).expect("parse rebased note 2"); + + let post_accepted_1: u32 = rebased_log1 + .metadata + .prompts + .values() + .map(|p| p.accepted_lines) + .sum(); + let post_accepted_2: u32 = rebased_log2 + .metadata + .prompts + .values() + .map(|p| p.accepted_lines) + .sum(); + + assert!( + post_accepted_1 < post_accepted_2, + "regression: rebased commit 2 ({}) should have more accepted_lines than commit 1 ({}). \ + If equal, the fast path is freezing metrics across commits.", + post_accepted_2, + post_accepted_1 + ); +} + +/// Regression test: attributions should survive a delete-recreate cycle within a rebase. +/// If a file is deleted in commit N and recreated in commit N+1, the recreated file +/// should inherit attributions from the pre-deletion state via positional diff transfer. 
+#[test] +fn test_rebase_file_delete_recreate_preserves_attribution() { + let repo = TestRepo::new(); + let default_branch = repo.current_branch(); + + // Initial setup + let mut base_file = repo.filename("base.txt"); + base_file.set_contents(crate::lines!["base content"]); + repo.stage_all_and_commit("Initial").unwrap(); + + // Create feature branch with AI file + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let mut ai_file = repo.filename("feature.txt"); + ai_file.set_contents(crate::lines!["line1".ai(), "line2".ai(), "line3".ai()]); + repo.stage_all_and_commit("Add AI file").unwrap(); + + // Delete the file + repo.git(&["rm", "feature.txt"]).unwrap(); + repo.stage_all_and_commit("Delete AI file").unwrap(); + + // Recreate the file with same content + ai_file.set_contents(crate::lines!["line1".ai(), "line2".ai(), "line3".ai()]); + let recreate_commit = repo.stage_all_and_commit("Recreate AI file").unwrap(); + + // Verify pre-rebase: recreated file has attributions + let pre_note = repo + .read_authorship_note(&recreate_commit.commit_sha) + .expect("recreated commit should have note"); + let pre_log = AuthorshipLog::deserialize_from_string(&pre_note).expect("parse pre note"); + assert!( + !pre_log.attestations.is_empty(), + "precondition: recreated file should have attestations" + ); + + // Advance default branch + repo.git(&["checkout", &default_branch]).unwrap(); + let mut other_file = repo.filename("other.txt"); + other_file.set_contents(crate::lines!["other"]); + repo.stage_all_and_commit("Main advances").unwrap(); + + // Rebase feature + repo.git(&["checkout", "feature"]).unwrap(); + repo.git(&["rebase", &default_branch]).unwrap(); + + // Check rebased tip (the recreate commit) + let rebased_sha = repo.git(&["rev-parse", "HEAD"]).unwrap().trim().to_string(); + let rebased_note = repo + .read_authorship_note(&rebased_sha) + .expect("rebased recreate commit should have note"); + let rebased_log = + 
AuthorshipLog::deserialize_from_string(&rebased_note).expect("parse rebased note"); + + assert!( + !rebased_log.attestations.is_empty(), + "regression: file recreated after deletion should still have attestations after rebase" + ); + + // Verify the AI attribution itself survived + ai_file.assert_lines_and_blame(crate::lines!["line1".ai(), "line2".ai(), "line3".ai()]); +} + crate::reuse_tests_in_worktree!( test_rebase_no_conflicts_identical_trees, test_rebase_with_different_trees, @@ -1543,6 +1718,8 @@ crate::reuse_tests_in_worktree!( test_rebase_exec, test_rebase_preserve_merges, test_rebase_commit_splitting, + test_rebase_prompt_metrics_update_per_commit, + test_rebase_file_delete_recreate_preserves_attribution, ); crate::reuse_tests_in_worktree_with_attrs!( diff --git a/tests/integration/rebase_benchmark.rs b/tests/integration/rebase_benchmark.rs index 69d20ffce..4b71b7c70 100644 --- a/tests/integration/rebase_benchmark.rs +++ b/tests/integration/rebase_benchmark.rs @@ -297,6 +297,311 @@ fn benchmark_rebase_with_perf_json() { } } +/// Benchmark diff-based attribution transfer with large files and content changes. +/// This tests the scenario where rebasing changes file content (main branch modifies +/// AI-tracked files), forcing the diff-based path instead of the fast-path note remap. +/// +/// Scale: 50 commits × 10 files × 200 lines = significant AI-authored content. +/// The diff-based path should complete the per-commit processing loop in <10ms total. 
+#[test] +#[ignore] +fn benchmark_rebase_diff_based_large() { + let num_feature_commits: usize = std::env::var("REBASE_BENCH_FEATURE_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(50); + let num_ai_files: usize = std::env::var("REBASE_BENCH_AI_FILES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + let lines_per_file: usize = std::env::var("REBASE_BENCH_LINES_PER_FILE") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(200); + + println!("\n=== Diff-Based Large Rebase Benchmark ==="); + println!("Feature commits: {}", num_feature_commits); + println!("AI files: {}", num_ai_files); + println!("Lines per file: {}", lines_per_file); + println!("==========================================\n"); + + let repo = TestRepo::new(); + + // Create initial commit with shared files (both branches will modify) + { + for file_idx in 0..num_ai_files { + let filename = format!("shared/mod_{}/f_{}.rs", file_idx, file_idx); + let mut file = repo.filename(&filename); + let mut lines: Vec = Vec::new(); + lines.push(format!("// Header for module {}", file_idx).into()); + lines.push("// Main branch will add lines above this marker".into()); + for line_idx in 0..lines_per_file { + lines.push(format!("// Initial AI code mod{} line{}", file_idx, line_idx).ai()); + } + file.set_contents(lines); + } + repo.stage_all_and_commit("Initial shared files").unwrap(); + } + + let default_branch = repo.current_branch(); + + // Create feature branch with AI commits + repo.git(&["checkout", "-b", "feature"]).unwrap(); + let setup_start = Instant::now(); + for commit_idx in 0..num_feature_commits { + for file_idx in 0..num_ai_files { + let filename = format!("shared/mod_{}/f_{}.rs", file_idx, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + let new_content = format!( + "{}\n// AI addition v{} mod{}", + current, commit_idx, file_idx + ); + fs::write(&path, &new_content).unwrap(); + repo.git_ai(&["checkpoint", 
"mock_ai", &filename]).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("AI feature {}", commit_idx)) + .unwrap(); + + if (commit_idx + 1) % 10 == 0 { + println!( + " Feature commit {}/{} ({:.1}s)", + commit_idx + 1, + num_feature_commits, + setup_start.elapsed().as_secs_f64() + ); + } + } + println!("Feature setup: {:.1}s", setup_start.elapsed().as_secs_f64()); + + // Advance main branch with modifications to AI-tracked files (forces content changes on rebase) + repo.git(&["checkout", &default_branch]).unwrap(); + for main_idx in 0..5 { + for file_idx in 0..num_ai_files { + let filename = format!("shared/mod_{}/f_{}.rs", file_idx, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + let new_content = current.replacen( + "// Main branch will add lines above this marker", + &format!( + "// Main addition {} for mod{}\n// Main branch will add lines above this marker", + main_idx, file_idx + ), + 1, + ); + fs::write(&path, &new_content).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("Main change {}", main_idx)) + .unwrap(); + } + + // Unrelated main commits + for i in 0..10 { + let filename = format!("main_only/change_{}.txt", i); + let mut file = repo.filename(&filename); + file.set_contents(crate::lines![format!("main only {}", i)]); + repo.stage_all_and_commit(&format!("Main unrelated {}", i)) + .unwrap(); + } + + // Rebase feature onto main + repo.git(&["checkout", "feature"]).unwrap(); + let timing_file = repo.path().join("..").join("rebase_timing_diff.txt"); + let timing_path = timing_file.to_str().unwrap().to_string(); + + println!("\n--- Starting diff-based rebase ---"); + let rebase_start = Instant::now(); + let result = repo.git_with_env( + &["rebase", &default_branch], + &[ + ("GIT_AI_DEBUG_PERFORMANCE", "1"), + ("GIT_AI_REBASE_TIMING_FILE", &timing_path), + ], + None, + ); + let rebase_duration = 
rebase_start.elapsed(); + + match &result { + Ok(_) => println!("Rebase succeeded in {:.3}s", rebase_duration.as_secs_f64()), + Err(e) => println!( + "Rebase FAILED in {:.3}s: {}", + rebase_duration.as_secs_f64(), + e + ), + } + result.unwrap(); + + if let Ok(timing_data) = fs::read_to_string(&timing_file) { + println!("\n=== PHASE TIMING BREAKDOWN ==="); + print!("{}", timing_data); + println!("==============================="); + } + + println!("\n=== DIFF-BASED LARGE BENCHMARK RESULTS ==="); + println!( + "Total rebase time: {:.3}s ({:.0}ms)", + rebase_duration.as_secs_f64(), + rebase_duration.as_millis() + ); + println!( + "Per-commit average: {:.1}ms", + rebase_duration.as_millis() as f64 / num_feature_commits as f64 + ); + println!("============================================\n"); +} + +/// Benchmark comparing the notes-based fast path vs blame-based slow path. +/// Runs the same rebase twice: once with notes (fast) and once without (blame fallback). +/// +/// Run with: cargo test --test integration benchmark_blame_vs_diff -- --ignored --nocapture +#[test] +#[ignore] +fn benchmark_blame_vs_diff() { + let num_feature_commits: usize = std::env::var("REBASE_BENCH_FEATURE_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(30); + let num_ai_files: usize = std::env::var("REBASE_BENCH_AI_FILES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(5); + let lines_per_file: usize = std::env::var("REBASE_BENCH_LINES_PER_FILE") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(50); + + println!("\n=== Blame vs Diff-Based Benchmark ==="); + println!("Feature commits: {}", num_feature_commits); + println!("AI files: {}", num_ai_files); + println!("Lines per file: {}", lines_per_file); + println!("======================================\n"); + + // Helper closure to create a test repo with the same setup + let create_repo = |strip_notes: bool| -> (std::time::Duration, String) { + let repo = TestRepo::new(); + for file_idx in 0..num_ai_files { + let filename 
= format!("shared/mod_{}/f_{}.rs", file_idx, file_idx);
+            let mut file = repo.filename(&filename);
+            let mut lines: Vec<_> = Vec::new();
+            lines.push(format!("// Header for module {}", file_idx).into());
+            lines.push("// Main branch marker".into());
+            for line_idx in 0..lines_per_file {
+                lines.push(format!("// AI code mod{} line{}", file_idx, line_idx).ai());
+            }
+            file.set_contents(lines);
+        }
+        repo.stage_all_and_commit("Initial shared files").unwrap();
+        let default_branch = repo.current_branch();
+
+        repo.git(&["checkout", "-b", "feature"]).unwrap();
+        for commit_idx in 0..num_feature_commits {
+            for file_idx in 0..num_ai_files {
+                let filename = format!("shared/mod_{}/f_{}.rs", file_idx, file_idx);
+                let path = repo.path().join(&filename);
+                let current = fs::read_to_string(&path).unwrap_or_default();
+                let new_content = format!(
+                    "{}\n// AI addition v{} mod{}",
+                    current, commit_idx, file_idx
+                );
+                fs::write(&path, &new_content).unwrap();
+                repo.git_ai(&["checkpoint", "mock_ai", &filename]).unwrap();
+            }
+            repo.git(&["add", "-A"]).unwrap();
+            repo.stage_all_and_commit(&format!("AI feature {}", commit_idx))
+                .unwrap();
+        }
+
+        if strip_notes {
+            // Delete the authorship notes ref to force the blame-based fallback
+            let _ = repo.git(&["update-ref", "-d", "refs/notes/git-ai-authorship"]);
+        }
+
+        repo.git(&["checkout", &default_branch]).unwrap();
+        for main_idx in 0..5 {
+            for file_idx in 0..num_ai_files {
+                let filename = format!("shared/mod_{}/f_{}.rs", file_idx, file_idx);
+                let path = repo.path().join(&filename);
+                let current = fs::read_to_string(&path).unwrap_or_default();
+                let new_content = current.replacen(
+                    "// Main branch marker",
+                    &format!(
+                        "// Main addition {} mod{}\n// Main branch marker",
+                        main_idx, file_idx
+                    ),
+                    1,
+                );
+                fs::write(&path, &new_content).unwrap();
+            }
+            repo.git(&["add", "-A"]).unwrap();
+            repo.stage_all_and_commit(&format!("Main {}", main_idx))
+                .unwrap();
+        }
+
+        repo.git(&["checkout", "feature"]).unwrap();
+        let 
timing_file = repo.path().join("..").join(if strip_notes {
+            "timing_no_notes.txt"
+        } else {
+            "timing_with_notes.txt"
+        });
+        let timing_path = timing_file.to_str().unwrap().to_string();
+
+        let rebase_start = Instant::now();
+        repo.git_with_env(
+            &["rebase", &default_branch],
+            &[
+                ("GIT_AI_DEBUG_PERFORMANCE", "1"),
+                ("GIT_AI_REBASE_TIMING_FILE", &timing_path),
+            ],
+            None,
+        )
+        .unwrap();
+        let duration = rebase_start.elapsed();
+
+        let timing_data = fs::read_to_string(&timing_file).unwrap_or_default();
+        (duration, timing_data)
+    };
+
+    // Run with notes (diff-based fast path)
+    let (with_notes_dur, with_notes_timing) = create_repo(false);
+    println!("--- WITH NOTES (diff-based path) ---");
+    print!("{}", with_notes_timing);
+    println!("Total rebase: {:.0}ms\n", with_notes_dur.as_millis());
+
+    // Run without notes (blame-based slow path)
+    let (no_notes_dur, no_notes_timing) = create_repo(true);
+    println!("--- WITHOUT NOTES (blame-based fallback) ---");
+    print!("{}", no_notes_timing);
+    println!("Total rebase: {:.0}ms\n", no_notes_dur.as_millis());
+
+    let authorship_with =
+        extract_timing(&with_notes_timing, "TOTAL").unwrap_or(with_notes_dur.as_millis() as u64);
+    let authorship_without =
+        extract_timing(&no_notes_timing, "TOTAL").unwrap_or(no_notes_dur.as_millis() as u64);
+
+    if authorship_without > 0 {
+        let speedup = authorship_without as f64 / authorship_with.max(1) as f64;
+        println!("=== COMPARISON ===");
+        println!("Authorship rewrite with notes: {}ms", authorship_with);
+        println!("Authorship rewrite without notes: {}ms", authorship_without);
+        println!("Speedup: {:.1}x", speedup);
+        println!("==================\n");
+    }
+}
+
+fn extract_timing(data: &str, key: &str) -> Option<u64> {
+    for line in data.lines() {
+        let trimmed = line.trim();
+        if trimmed.starts_with(key)
+            && let Some(val) = trimmed.split('=').nth(1)
+        {
+            return val.trim_end_matches("ms").parse().ok();
+        }
+    }
+    None
+}
+
 /// Benchmark that forces the SLOW path 
(VirtualAttributions + blame) by having /// main branch also modify AI-touched files. This causes blob differences /// between original and rebased commits, making the fast-path note remap fail. @@ -474,3 +779,242 @@ fn benchmark_rebase_slow_path() { ); println!("====================================\n"); } + +/// Large-scale benchmark with mixed file sizes for PR comparison. +/// +/// Creates: +/// - 200 AI-tracked files (150 × 1000 lines, 50 × 5000 lines) +/// - 150 feature commits, each modifying all files (ensuring AI attribution on every commit) +/// - Main branch also modifies the same files (forces diff-based path, not blob-copy fast path) +/// +/// Run with: cargo test --package git-ai --test integration benchmark_large_scale_mixed -- --ignored --nocapture +#[test] +#[ignore] +fn benchmark_large_scale_mixed() { + let num_small_files: usize = std::env::var("BENCH_SMALL_FILES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(150); + let num_large_files: usize = std::env::var("BENCH_LARGE_FILES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(50); + let small_file_lines: usize = std::env::var("BENCH_SMALL_LINES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(1000); + let large_file_lines: usize = std::env::var("BENCH_LARGE_LINES") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(5000); + let num_feature_commits: usize = std::env::var("BENCH_FEATURE_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(150); + let num_main_commits: usize = std::env::var("BENCH_MAIN_COMMITS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(10); + + let total_files = num_small_files + num_large_files; + let total_initial_lines = + num_small_files * small_file_lines + num_large_files * large_file_lines; + + println!("\n=== Large-Scale Mixed Benchmark ==="); + println!( + "Small files: {} × {} lines", + num_small_files, small_file_lines + ); + println!( + "Large files: {} × {} lines", + num_large_files, large_file_lines + ); + println!("Total 
files: {}", total_files);
+    println!("Total initial lines: {}", total_initial_lines);
+    println!("Feature commits: {}", num_feature_commits);
+    println!("Main commits: {}", num_main_commits);
+    println!("====================================\n");
+
+    let repo = TestRepo::new();
+    let setup_start = Instant::now();
+
+    // Create initial commit with all files
+    {
+        for file_idx in 0..total_files {
+            let lines_for_file = if file_idx < num_small_files {
+                small_file_lines
+            } else {
+                large_file_lines
+            };
+            let filename = format!("src/mod_{}/file_{}.rs", file_idx % 20, file_idx);
+            let mut file = repo.filename(&filename);
+            let mut lines: Vec<_> = Vec::new();
+            lines.push(format!("// Module {} header", file_idx).into());
+            lines.push("// MAIN_MARKER".into());
+            for line_idx in 0..lines_for_file {
+                lines.push(
+                    format!(
+                        "fn func_{}_{}() {{ /* AI generated */ }}",
+                        file_idx, line_idx
+                    )
+                    .ai(),
+                );
+            }
+            file.set_contents(lines);
+        }
+        repo.stage_all_and_commit("Initial: all AI files").unwrap();
+    }
+    println!(
+        "Initial commit setup: {:.1}s",
+        setup_start.elapsed().as_secs_f64()
+    );
+
+    let default_branch = repo.current_branch();
+
+    // Create feature branch
+    repo.git(&["checkout", "-b", "feature"]).unwrap();
+    let feature_start = Instant::now();
+
+    for commit_idx in 0..num_feature_commits {
+        // Each commit modifies a subset of files (rotating window of ~20 files)
+        // but touches enough to exercise the diff path
+        let files_per_commit = 20.min(total_files);
+        let start_file = (commit_idx * 7) % total_files; // rotating start to vary which files
+
+        for i in 0..files_per_commit {
+            let file_idx = (start_file + i) % total_files;
+            let filename = format!("src/mod_{}/file_{}.rs", file_idx % 20, file_idx);
+            let path = repo.path().join(&filename);
+            let current = fs::read_to_string(&path).unwrap_or_default();
+            // Append AI-authored line at the end
+            let new_content = format!(
+                "{}\nfn feature_{}_in_{}() {{ /* AI commit {} */ }}",
+                current, commit_idx, file_idx, 
commit_idx + ); + fs::write(&path, &new_content).unwrap(); + repo.git_ai(&["checkpoint", "mock_ai", &filename]).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("AI feature {}", commit_idx)) + .unwrap(); + + if (commit_idx + 1) % 25 == 0 { + println!( + " Feature commit {}/{} ({:.1}s)", + commit_idx + 1, + num_feature_commits, + feature_start.elapsed().as_secs_f64() + ); + } + } + println!( + "Feature branch setup: {:.1}s ({} commits)", + feature_start.elapsed().as_secs_f64(), + num_feature_commits + ); + + // Advance main branch — modify AI-tracked files to force diff-based path + repo.git(&["checkout", &default_branch]).unwrap(); + let main_start = Instant::now(); + for main_idx in 0..num_main_commits { + // Main modifies a different rotating set of files at the MARKER line + let files_per_main = 30.min(total_files); + let start_file = (main_idx * 13) % total_files; + for i in 0..files_per_main { + let file_idx = (start_file + i) % total_files; + let filename = format!("src/mod_{}/file_{}.rs", file_idx % 20, file_idx); + let path = repo.path().join(&filename); + let current = fs::read_to_string(&path).unwrap_or_default(); + let new_content = current.replacen( + "// MAIN_MARKER", + &format!( + "// Main change {} in file {}\n// MAIN_MARKER", + main_idx, file_idx + ), + 1, + ); + fs::write(&path, &new_content).unwrap(); + } + repo.git(&["add", "-A"]).unwrap(); + repo.stage_all_and_commit(&format!("Main {}", main_idx)) + .unwrap(); + } + // Add unrelated main commits + for i in 0..5 { + let mut f = repo.filename(&format!("main_only/f_{}.txt", i)); + f.set_contents(crate::lines![format!("main only {}", i)]); + repo.stage_all_and_commit(&format!("Main unrelated {}", i)) + .unwrap(); + } + println!( + "Main branch setup: {:.1}s", + main_start.elapsed().as_secs_f64() + ); + println!("Total setup: {:.1}s", setup_start.elapsed().as_secs_f64()); + + // Rebase using benchmark_git for structured timing + repo.git(&["checkout", 
"feature"]).unwrap(); + + println!( + "\n--- Starting rebase ({} commits onto {}) ---", + num_feature_commits, &default_branch + ); + let wall_start = Instant::now(); + let bench_result = repo.benchmark_git(&["rebase", &default_branch]); + let wall_duration = wall_start.elapsed(); + + match &bench_result { + Ok(bench) => { + let git_ms = bench.git_duration.as_millis(); + let total_ms = bench.total_duration.as_millis(); + let pre_ms = bench.pre_command_duration.as_millis(); + let post_ms = bench.post_command_duration.as_millis(); + let overhead_ms = total_ms.saturating_sub(git_ms); + let overhead_pct = if git_ms > 0 { + overhead_ms as f64 / git_ms as f64 * 100.0 + } else { + 0.0 + }; + + println!("\n╔══════════════════════════════════════════════════════════╗"); + println!("║ LARGE-SCALE BENCHMARK RESULTS ║"); + println!("╠══════════════════════════════════════════════════════════╣"); + println!( + "║ Files: {} ({} × {}L + {} × {}L)", + total_files, num_small_files, small_file_lines, num_large_files, large_file_lines + ); + println!("║ Initial lines: {}", total_initial_lines); + println!("║ Commits: {}", num_feature_commits); + println!("╠══════════════════════════════════════════════════════════╣"); + println!("║ Wall time: {:.3}s", wall_duration.as_secs_f64()); + println!("║ Total (wrapper): {}ms", total_ms); + println!("║ Git rebase: {}ms", git_ms); + println!("║ Pre-command: {}ms", pre_ms); + println!("║ Post-command: {}ms", post_ms); + println!( + "║ Overhead: {}ms ({:.1}% of git time)", + overhead_ms, overhead_pct + ); + println!( + "║ Per-commit avg: {:.1}ms total, {:.1}ms git, {:.1}ms overhead", + total_ms as f64 / num_feature_commits as f64, + git_ms as f64 / num_feature_commits as f64, + overhead_ms as f64 / num_feature_commits as f64, + ); + println!("╚══════════════════════════════════════════════════════════╝\n"); + } + Err(e) => { + println!( + "Benchmark failed after {:.3}s: {}", + wall_duration.as_secs_f64(), + e + ); + println!( + "Wall time: {:.3}s 
({:.0}ms)", + wall_duration.as_secs_f64(), + wall_duration.as_millis() + ); + panic!("Benchmark failed: {}", e); + } + } +}