Skip to content

Commit dbe5cbd

Browse files
authored
Fix line numbers (#493)
* add (failing) unit test for line number bug * fix line reporting by showing latest shared token * report structure mismatches with different tokens
1 parent 97acd89 commit dbe5cbd

9 files changed

Lines changed: 268 additions & 62 deletions

File tree

PythonScripts/audit_translations/auditor.py

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -291,11 +291,84 @@ def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -
291291
return lines[0] if lines else rule.line_number
292292

293293

294+
def structure_token_occurrence_index(tokens: List[str], position: int) -> Optional[int]:
295+
"""
296+
Return which occurrence of a token appears at a given absolute token position.
297+
298+
Example: for ["test:", "if:", "test:"], position 2 returns 1.
299+
"""
300+
if position < 0 or position >= len(tokens):
301+
return None
302+
token = tokens[position]
303+
return sum(1 for current in tokens[:position] if current == token)
304+
305+
306+
def resolve_structure_issue_lines(diff: RuleDifference) -> Optional[Tuple[int, int]]:
307+
"""
308+
Resolve stable line anchors for a structural rule difference.
309+
310+
Strategy:
311+
- Use position-aware token occurrence matching when possible.
312+
- For insert/delete cases (one side missing token), anchor to the previous
313+
shared structural token; if unavailable, anchor to `replace:`.
314+
"""
315+
en_tokens = extract_structure_elements(diff.english_rule.data)
316+
tr_tokens = extract_structure_elements(diff.translated_rule.data)
317+
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)
318+
319+
if mismatch_pos < 0:
320+
return None
321+
322+
# Insertion/deletion: anchor to the previous shared token if possible.
323+
if en_token is None or tr_token is None:
324+
anchor_pos = mismatch_pos - 1
325+
if (
326+
anchor_pos >= 0
327+
and anchor_pos < len(en_tokens)
328+
and anchor_pos < len(tr_tokens)
329+
and en_tokens[anchor_pos] == tr_tokens[anchor_pos]
330+
):
331+
anchor_token = en_tokens[anchor_pos]
332+
en_occ = structure_token_occurrence_index(en_tokens, anchor_pos)
333+
tr_occ = structure_token_occurrence_index(tr_tokens, anchor_pos)
334+
if en_occ is not None and tr_occ is not None:
335+
line_en = resolve_issue_line_at_position(diff.english_rule, "structure", anchor_token, en_occ)
336+
line_tr = resolve_issue_line_at_position(diff.translated_rule, "structure", anchor_token, tr_occ)
337+
if line_en is not None and line_tr is not None:
338+
return line_en, line_tr
339+
340+
# Fallback: anchor both sides to replace, which is the rule body entrypoint.
341+
line_en = resolve_issue_line(diff.english_rule, "structure", "replace:") or diff.english_rule.line_number
342+
line_tr = resolve_issue_line(diff.translated_rule, "structure", "replace:") or diff.translated_rule.line_number
343+
return line_en, line_tr
344+
345+
# Exact token available on both sides: resolve by occurrence index at mismatch.
346+
en_occ = structure_token_occurrence_index(en_tokens, mismatch_pos)
347+
tr_occ = structure_token_occurrence_index(tr_tokens, mismatch_pos)
348+
if en_occ is not None and tr_occ is not None:
349+
line_en = resolve_issue_line_at_position(diff.english_rule, "structure", en_token, en_occ)
350+
line_tr = resolve_issue_line_at_position(diff.translated_rule, "structure", tr_token, tr_occ)
351+
if line_en is not None and line_tr is not None:
352+
return line_en, line_tr
353+
354+
line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
355+
line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
356+
if line_en is None or line_tr is None:
357+
return None
358+
return line_en, line_tr
359+
360+
294361
def collect_issues(
295362
result: ComparisonResult,
296363
file_name: str,
297364
language: str,
298365
) -> List[dict]:
366+
"""
367+
Flatten a ComparisonResult into one normalized dictionary per issue.
368+
369+
This is the canonical bridge from parser/diff objects to serializable
370+
records consumed by JSONL output, snapshot tests, and line-level assertions.
371+
"""
299372
issues = []
300373

301374
for rule in result.missing_rules:
@@ -345,23 +418,10 @@ def collect_issues(
345418
rule = diff.english_rule
346419
issue = issue_base(rule, file_name, language)
347420
if diff.diff_type == "structure":
348-
en_tokens = extract_structure_elements(diff.english_rule.data)
349-
tr_tokens = extract_structure_elements(diff.translated_rule.data)
350-
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)
351-
352-
# Skip reporting when tokens are misaligned (both exist but differ)
353-
# This avoids misleading line numbers when entire blocks are missing/added
354-
# We only report when one is None (clear case of missing element)
355-
if en_token is not None and tr_token is not None and en_token != tr_token:
356-
continue
357-
358-
issue_line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
359-
issue_line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
360-
361-
# Skip reporting structure differences where we can't find both tokens
362-
# This avoids misleading line numbers when blocks are missing
363-
if issue_line_en is None or issue_line_tr is None:
421+
structure_lines = resolve_structure_issue_lines(diff)
422+
if structure_lines is None:
364423
continue
424+
issue_line_en, issue_line_tr = structure_lines
365425
else:
366426
issue_line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
367427
issue_line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
@@ -444,20 +504,10 @@ def add_issue(rule: RuleInfo, issue_type: str, payload: Dict[str, Any]) -> None:
444504

445505
for diff in result.rule_differences:
446506
if diff.diff_type == "structure":
447-
en_tokens = extract_structure_elements(diff.english_rule.data)
448-
tr_tokens = extract_structure_elements(diff.translated_rule.data)
449-
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)
450-
451-
# Skip reporting when tokens are misaligned (both exist but differ)
452-
# This avoids misleading line numbers when entire blocks are missing/added
453-
if en_token is not None and tr_token is not None and en_token != tr_token:
454-
continue
455-
456-
line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
457-
line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
458-
# Skip structure diffs where we can't find both tokens
459-
if line_en is None or line_tr is None:
507+
structure_lines = resolve_structure_issue_lines(diff)
508+
if structure_lines is None:
460509
continue
510+
line_en, line_tr = structure_lines
461511
else:
462512
line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
463513
line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Repro Fixtures
2+
3+
This folder contains minimal fixtures from snapshots of the Rules at some point in time.
4+
5+
The first use of this is comparing `per-fraction` between the English and Norwegian rules as of 2026-02-17, which had a bug of the wrong lines being shown.
6+
- `en/per_fraction.yaml`
7+
- `nb/per_fraction.yaml`
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
- name: per-fraction
2+
tag: fraction
3+
match:
4+
- "BaseNode(*[1])[contains(@data-intent-property, ':unit') or"
5+
- " ( self::m:mrow and count(*)=3 and" # maybe a bit paranoid checking the structure...
6+
- " *[1][self::m:mn] and *[2][.='\u2062'] and BaseNode(*[3])[contains(@data-intent-property, ':unit')] ) ] and"
7+
- "BaseNode(*[2])[contains(@data-intent-property, ':unit')] "
8+
replace:
9+
- x: "*[1]"
10+
- t: "per" # phrase('5 meters 'per' second)
11+
- x: "*[2]"
12+
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
- name: per-fraction
2+
tag: fraction
3+
match:
4+
- "BaseNode(*[1])[contains(@data-intent-property, ':unit') or"
5+
- " ( self::m:mrow and count(*)=3 and" # maybe a bit paranoid checking the structure...
6+
- " *[1][self::m:mn] and *[2][.='\u2062'] and BaseNode(*[3])[contains(@data-intent-property, ':unit')] ) ] and"
7+
- "BaseNode(*[2])[contains(@data-intent-property, ':unit') or (contains(@data-intent-property, ':unit') and .='t')] "
8+
replace:
9+
- x: "*[1]"
10+
- T: "per" # phrase('5 meters 'per' second)
11+
- test:
12+
if: "*[2]='t'"
13+
then: [T: "time"]
14+
else:
15+
- x: "*[2]"
16+

PythonScripts/audit_translations/tests/golden/jsonl/de.json

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@
115115
"rule_name": "struct-rule",
116116
"rule_tag": "mi",
117117
"rule_key": "struct-rule|mi",
118-
"issue_line_en": 9,
119-
"issue_line_tr": 1,
118+
"issue_line_en": 7,
119+
"issue_line_tr": 7,
120120
"rule_line_en": 1,
121121
"rule_line_tr": 1,
122122
"issue_type": "rule_difference",
@@ -161,16 +161,34 @@
161161
"english_snippet": "$Verbosity!='Terse', $Setting = 'Value', parent::m:minus, *[2][.='2']",
162162
"translated_snippet": "$Setting = 'Value', parent::m:minus, *[2][.='2']",
163163
"untranslated_texts": [],
164-
"_explanation": "structure_misaligned.yaml: English has extra test block causing misalignment. Fix filters out misleading structure differences but reports condition difference."
164+
"_explanation": "structure_misaligned.yaml: condition difference remains reported."
165+
},
166+
{
167+
"language": "de",
168+
"file": "structure_misaligned.yaml",
169+
"rule_name": "misaligned-structure",
170+
"rule_tag": "root",
171+
"rule_key": "misaligned-structure|root",
172+
"issue_line_en": 11,
173+
"issue_line_tr": 11,
174+
"rule_line_en": 1,
175+
"rule_line_tr": 1,
176+
"issue_type": "rule_difference",
177+
"diff_type": "structure",
178+
"description": "Rule structure differs (test/if/then/else blocks)",
179+
"english_snippet": "replace: test: if: then: test: if: then: test: if: then: else: test: if: then: else:",
180+
"translated_snippet": "replace: test: if: then: test: if: then: else: test: if: then: else:",
181+
"untranslated_texts": [],
182+
"_explanation": "structure_misaligned.yaml: structure substitutions/realignments are now reported with position-aware anchors."
165183
},
166184
{
167185
"language": "de",
168186
"file": "structure_missing_else.yaml",
169187
"rule_name": "missing-else-block",
170188
"rule_tag": "root",
171189
"rule_key": "missing-else-block|root",
172-
"issue_line_en": 8,
173-
"issue_line_tr": 1,
190+
"issue_line_en": 7,
191+
"issue_line_tr": 7,
174192
"rule_line_en": 1,
175193
"rule_line_tr": 1,
176194
"issue_type": "rule_difference",

PythonScripts/audit_translations/tests/golden/rich/cli_calculus_verbose.golden

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
en: $Verbosity!='Terse', not(IsNode(*[1], 'leaf'))
4444
es: not(IsNode(*[1], 'leaf'))
4545
Structure Differences [1]
46-
• (line 38 en, 18 es)
46+
• (line 40 en, 25 es)
4747
Rule structure differs (test/if/then/else blocks)
4848
en: replace: test: if: then: test: if: then:
4949
es: replace: test: if: then:

PythonScripts/audit_translations/tests/golden/rich/structure_diff_nonverbose.golden

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@
77
≠ Rule Issues [1] (grouped by rule and issue type)
88
• struct-rule (mi)
99
Structure Differences [1]
10-
• (line 9 en, 1 tr)
10+
• (line 7 en, 7 tr)
1111
Rule structure differs (test/if/then/else blocks)

PythonScripts/audit_translations/tests/golden/rich/structure_diff_verbose.golden

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
≠ Rule Issues [1] (grouped by rule and issue type)
88
• struct-rule (mi)
99
Structure Differences [1]
10-
• (line 9 en, 1 tr)
10+
• (line 7 en, 7 tr)
1111
Rule structure differs (test/if/then/else blocks)
1212
en: replace: test: if: then: else:
1313
tr: replace: test: if: then:

0 commit comments

Comments
 (0)