Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ tasks/
# Fuzz testing
fuzz/artifacts/
fuzz/corpus/
src/authorship/snapshots/*.snap.new
157 changes: 130 additions & 27 deletions src/authorship/attribution_tracker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,21 @@ impl Ord for Token {
}
}

/// Line-level statistics derived from the diff computation.
/// Returned alongside attribution results so callers don't need a second diff pass.
#[derive(Debug, Clone, Default)]
pub struct DiffLineStats {
pub additions: u32,
pub deletions: u32,
pub additions_sloc: u32,
pub deletions_sloc: u32,
}

#[derive(Default)]
struct DiffComputation {
diffs: Vec<ByteDiff>,
substantive_new_ranges: Vec<(usize, usize)>,
line_stats: DiffLineStats,
}

/// Configuration for the attribution tracker
Expand Down Expand Up @@ -353,6 +364,57 @@ impl AttributionTracker {

self.push_equal_lines(op, &old_lines, old_content, &mut computation.diffs)?;
} else {
// Accumulate line stats from non-equal ops
match &op {
DiffOp::Delete {
old_index, old_len, ..
} => {
computation.line_stats.deletions += *old_len as u32;
for i in *old_index..(*old_index + *old_len) {
if let Some(line) = old_lines.get(i)
&& !line.text.trim().is_empty()
{
computation.line_stats.deletions_sloc += 1;
}
}
}
DiffOp::Insert {
new_index, new_len, ..
} => {
computation.line_stats.additions += *new_len as u32;
for i in *new_index..(*new_index + *new_len) {
if let Some(line) = new_lines.get(i)
&& !line.text.trim().is_empty()
{
computation.line_stats.additions_sloc += 1;
}
}
}
DiffOp::Replace {
old_index,
old_len,
new_index,
new_len,
} => {
computation.line_stats.deletions += *old_len as u32;
computation.line_stats.additions += *new_len as u32;
Comment on lines +372 to +400
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 DiffLineStats counts blank lines in additions/deletions while old code did not

The new DiffLineStats accumulated in compute_diffs uses *new_len as u32 and *old_len as u32 from the DiffOp, which counts every line including blank lines. The old compute_file_line_stats (at src/commands/checkpoint.rs:2012) used change.value().lines().count(), and Rust's .lines() returns an empty iterator for strings like "\n" (a blank line with only a newline terminator), so blank lines were counted as 0. This means additions and deletions in CheckpointLineStats will now be higher whenever blank lines are added or deleted. The additions_sloc/deletions_sloc fields are unaffected since both old and new code filter whitespace-only lines. These stats are stored in checkpoint data and emitted as telemetry metrics (lines_added, lines_deleted), so the behavioral change affects both persisted data and reported metrics.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

for i in *old_index..(*old_index + *old_len) {
if let Some(line) = old_lines.get(i)
&& !line.text.trim().is_empty()
{
computation.line_stats.deletions_sloc += 1;
}
}
for i in *new_index..(*new_index + *new_len) {
if let Some(line) = new_lines.get(i)
&& !line.text.trim().is_empty()
{
computation.line_stats.additions_sloc += 1;
}
}
}
DiffOp::Equal { .. } => unreachable!(),
Comment on lines +368 to +416
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 DiffLineStats uses different diff post-processing than the replaced compute_file_line_stats, changing reported metrics

The new DiffLineStats accumulated in compute_diffs counts lines from capture_diff_slices (src/authorship/imara_diff_utils.rs:132), which does NOT call diff.postprocess_lines(). The old compute_file_line_stats (src/commands/checkpoint.rs:2012) used compute_line_changes (src/authorship/imara_diff_utils.rs:178) which DOES call diff.postprocess_lines(&input) at line 185. The postprocess_lines method adjusts hunk boundaries to produce more natural diffs matching git's output, which can change which lines are counted as added/deleted vs equal. This means the additions, deletions, additions_sloc, and deletions_sloc values persisted in checkpoint.line_stats and emitted as checkpoint metrics can differ from the old behavior for the same file changes.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

}
pending_changed.push(op);
}
}
Expand Down Expand Up @@ -497,45 +559,62 @@ impl AttributionTracker {
Ok(())
}

/// Attribute all unattributed ranges to the given author
/// Attribute all unattributed ranges to the given author.
///
/// Uses a merged-intervals sweep for O(n + m) where n = content chars,
/// m = number of attributions (instead of the previous O(n * m)).
pub fn attribute_unattributed_ranges(
&self,
content: &str,
prev_attributions: &[Attribution],
author: &str,
ts: u128,
) -> Vec<Attribution> {
let mut attributions = prev_attributions.to_vec();
let mut range_start: Option<usize> = None;
if content.is_empty() {
return prev_attributions.to_vec();
}

// Find all unattributed character ranges on UTF-8 boundaries.
for (idx, ch) in content.char_indices() {
let end = idx + ch.len_utf8();
let covered = attributions.iter().any(|a| a.overlaps(idx, end));
// Build sorted, merged coverage intervals from existing attributions.
// This lets us sweep through the content with a single cursor.
let mut intervals: Vec<(usize, usize)> = prev_attributions
.iter()
.filter(|a| a.start < a.end)
.map(|a| (a.start, a.end))
.collect();
intervals.sort_unstable_by_key(|&(s, _)| s);

if covered {
if let Some(start) = range_start.take()
&& start < idx
{
attributions.push(Attribution::new(start, idx, author.to_string(), ts));
}
} else if range_start.is_none() {
range_start = Some(idx);
// Merge overlapping intervals
let mut merged: Vec<(usize, usize)> = Vec::with_capacity(intervals.len());
for (s, e) in intervals {
if let Some(last) = merged.last_mut()
&& s <= last.1
{
last.1 = last.1.max(e);
continue;
}
merged.push((s, e));
}

if let Some(start) = range_start.take()
&& start < content.len()
{
attributions.push(Attribution::new(
start,
content.len(),
author.to_string(),
ts,
));
// Sweep: find gaps between merged intervals within [0, content.len())
let mut new_attributions = Vec::new();
let content_len = content.len();
let mut pos = 0;
for &(start, end) in &merged {
if pos < start && pos < content_len {
// Gap before this interval — attribute it
let gap_end = start.min(content_len);
new_attributions.push(Attribution::new(pos, gap_end, author.to_string(), ts));
}
pos = end;
}
// Gap after the last interval
if pos < content_len {
new_attributions.push(Attribution::new(pos, content_len, author.to_string(), ts));
}

attributions
let mut result = prev_attributions.to_vec();
result.extend(new_attributions);
result
}

/// Update attributions from old content to new content
Expand Down Expand Up @@ -575,14 +654,38 @@ impl AttributionTracker {
ts: u128,
is_ai_checkpoint: bool,
) -> Result<Vec<Attribution>, GitAiError> {
let (attrs, _) = self.update_attributions_for_checkpoint_with_stats(
old_content,
new_content,
old_attributions,
current_author,
ts,
is_ai_checkpoint,
)?;
Ok(attrs)
}

/// Like `update_attributions_for_checkpoint`, but also returns line-level diff
/// statistics derived from the same diff computation. This avoids a redundant
/// second diff pass when the caller needs both attributions and line stats.
pub fn update_attributions_for_checkpoint_with_stats(
&self,
old_content: &str,
new_content: &str,
old_attributions: &[Attribution],
current_author: &str,
ts: u128,
is_ai_checkpoint: bool,
) -> Result<(Vec<Attribution>, DiffLineStats), GitAiError> {
// Cursor-based scans in transform_attributions assume sorted ranges.
// Normalize once at the boundary so callers can pass ranges in any order.
let sorted_old_storage = (!is_attribution_list_sorted(old_attributions))
.then(|| sort_attributions_for_transform(old_attributions));
let old_attributions = sorted_old_storage.as_deref().unwrap_or(old_attributions);

// Phase 1: Compute diff
// Phase 1: Compute diff (also accumulates line stats)
let diff_result = self.compute_diffs(old_content, new_content, is_ai_checkpoint)?;
let line_stats = diff_result.line_stats.clone();

// Phase 2: Build deletion and insertion catalogs
let (deletions, insertions) = self.build_diff_catalog(&diff_result.diffs);
Expand Down Expand Up @@ -612,7 +715,7 @@ impl AttributionTracker {
);

// Phase 5: Merge and clean up
Ok(self.merge_attributions(new_attributions))
Ok((self.merge_attributions(new_attributions), line_stats))
}

fn should_skip_move_detection(
Expand Down
Loading
Loading