Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions src/core/diff/strategies/__tests__/unicode-preservation.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import { MultiSearchReplaceDiffStrategy } from "../multi-search-replace"

/**
 * Checks how applying a search/replace diff treats typographic quotes
 * (U+2018/U+2019 and U+201C/U+201D): the file content uses curly quotes while
 * the diff blocks are written with plain ASCII quotes.
 */
describe("Unicode Character Preservation", () => {
	// Every case runs against a freshly constructed strategy with threshold 1.0 (exact matching).
	const applyWithExactMatching = (source: string, patch: string) =>
		new MultiSearchReplaceDiffStrategy(1.0).applyDiff(source, patch)

	it("should preserve Unicode apostrophes when applying diffs", async () => {
		const source = `This file contains Unicode apostrophes: \u2018hello\u2019 and \u201Cworld\u201D
Another line with Unicode: \u2018test\u2019 and \u201Cexample\u201D
Regular ASCII: 'normal' and "standard"`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
This file contains Unicode apostrophes: 'hello' and "world"
=======
This file contains Unicode apostrophes: 'goodbye' and "universe"
\>>>>>>> REPLACE`

		const outcome = await applyWithExactMatching(source, patch)

		expect(outcome.success).toBe(true)
		if (!outcome.success || !outcome.content) {
			return
		}

		// The curly single quotes (U+2018/U+2019) must wrap the new word
		expect(outcome.content).toContain("\u2018goodbye\u2019")
		// The curly double quotes (U+201C/U+201D) must wrap the new word
		expect(outcome.content).toContain("\u201Cuniverse\u201D")
		// The ASCII-quoted forms from the diff must not appear in the output
		expect(outcome.content).not.toContain("'goodbye'")
		expect(outcome.content).not.toContain('"universe"')
	})

	it("should preserve Unicode quotes in multi-line replacements", async () => {
		const source = `Line 1: \u2018unicode\u2019
Line 2: \u201Cquotes\u201D
Line 3: normal`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
Line 1: 'unicode'
Line 2: "quotes"
=======
Line 1: 'modified'
Line 2: "changed"
\>>>>>>> REPLACE`

		const outcome = await applyWithExactMatching(source, patch)

		expect(outcome.success).toBe(true)
		if (!outcome.success || !outcome.content) {
			return
		}

		expect(outcome.content).toContain("\u2018modified\u2019")
		expect(outcome.content).toContain("\u201Cchanged\u201D")
	})

	it("should handle mixed Unicode and ASCII quotes correctly", async () => {
		const source = `Unicode: \u2018test\u2019 and \u201Cexample\u201D
ASCII: 'normal' and "standard"`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
Unicode: 'test' and "example"
=======
Unicode: 'replaced' and "modified"
\>>>>>>> REPLACE`

		const outcome = await applyWithExactMatching(source, patch)

		expect(outcome.success).toBe(true)
		if (!outcome.success || !outcome.content) {
			return
		}

		// The rewritten line keeps its curly quotes
		expect(outcome.content).toContain("\u2018replaced\u2019")
		expect(outcome.content).toContain("\u201Cmodified\u201D")
		// The untouched line keeps its ASCII quotes
		expect(outcome.content).toContain("'normal'")
		expect(outcome.content).toContain('"standard"')
	})

	it("should not affect content when no Unicode characters are present", async () => {
		const source = `Regular ASCII: 'test' and "example"`

		const patch = `<<<<<<< SEARCH
:start_line:1
-------
Regular ASCII: 'test' and "example"
=======
Regular ASCII: 'modified' and "changed"
\>>>>>>> REPLACE`

		const outcome = await applyWithExactMatching(source, patch)

		expect(outcome.success).toBe(true)
		if (!outcome.success || !outcome.content) {
			return
		}

		expect(outcome.content).toBe(`Regular ASCII: 'modified' and "changed"`)
	})
})
38 changes: 38 additions & 0 deletions src/core/diff/strategies/multi-file-search-replace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,39 @@ function getSimilarity(original: string, search: string): number {
return 1 - dist / maxLength
}

/**
 * Restores typographic (curly) quotes in replacement text when the matched original text used them.
 *
 * For each quote family (double: U+201C/U+201D, single: U+2018/U+2019) that occurs in
 * `originalContent`, the ASCII quotes of that family in `replaceContent` are converted:
 * - if the original contains only one variant of the family, every ASCII quote becomes that variant;
 * - if it contains both, an ASCII quote becomes the opening variant when it starts the text or
 *   follows whitespace, an opening bracket, or an opening curly quote, and the closing variant
 *   otherwise (so an apostrophe inside a word becomes U+2019).
 *
 * Families with no curly quote in `originalContent` are left untouched. Every ASCII quote of a
 * detected family is converted, including ones that were already ASCII in the original text.
 *
 * NOTE(review): an identical helper lives in multi-search-replace.ts; the two copies should be
 * extracted into one shared utility so they cannot drift apart.
 *
 * @param originalContent - The text matched in the file, used only to detect which curly quotes occur.
 * @param searchContent - The search text of the diff block. Currently unused; kept so existing callers keep working.
 * @param replaceContent - The replacement text whose ASCII quotes may be converted.
 * @returns `replaceContent` with ASCII quotes converted as described above.
 */
function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
	const quoteFamilies = [
		{ ascii: '"', open: "\u201C", close: "\u201D" },
		{ ascii: "'", open: "\u2018", close: "\u2019" },
	]

	// Keep only the families whose curly variants occur in the original. When just one variant
	// is present it stands in for both roles, so every ASCII quote maps to that variant.
	const activeFamilies = new Map<string, { open: string; close: string }>()
	for (const { ascii, open, close } of quoteFamilies) {
		const hasOpen = originalContent.includes(open)
		const hasClose = originalContent.includes(close)
		if (hasOpen || hasClose) {
			activeFamilies.set(ascii, { open: hasOpen ? open : close, close: hasClose ? close : open })
		}
	}

	if (activeFamilies.size === 0) {
		return replaceContent
	}

	// A quote is treated as "opening" when the character emitted just before it is one of these.
	const opensAfter = /[\s(\[{\u201C\u2018]/

	// Single left-to-right pass: the decision looks at the already-converted output, so a quote
	// nested directly inside another opening quote is also recognised as opening.
	let result = ""
	for (const char of replaceContent) {
		const family = activeFamilies.get(char)
		if (family === undefined) {
			result += char
			continue
		}

		const previous = result.length > 0 ? result[result.length - 1] : undefined
		const isOpening = previous === undefined || opensAfter.test(previous)
		result += isOpening ? family.open : family.close
	}

	return result
}

/**
* Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
* the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
Expand Down Expand Up @@ -650,6 +683,11 @@ Each file requires its own path, start_line, and diff elements.

// Get the matched lines from the original content
const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length)
const originalMatchedContent = matchedLines.join("\n")

// Preserve Unicode characters from the original content in the replacement
replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent)
replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)

// Get the exact indentation (preserving tabs/spaces) of each line
const originalIndents = matchedLines.map((line) => {
Expand Down
41 changes: 40 additions & 1 deletion src/core/diff/strategies/multi-search-replace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ function getSimilarity(original: string, search: string): number {
return 0
}

// Use the normalizeString utility to handle smart quotes and other special characters
// Use the normalizeString utility for comparison only, but preserve original characters
// This allows matching content with different quote styles without changing the actual content
const normalizedOriginal = normalizeString(original)
const normalizedSearch = normalizeString(search)

Expand All @@ -32,6 +33,39 @@ function getSimilarity(original: string, search: string): number {
return 1 - dist / maxLength
}

/**
 * Restores typographic (curly) quotes in replacement text when the matched original text used them.
 *
 * For each quote family (double: U+201C/U+201D, single: U+2018/U+2019) that occurs in
 * `originalContent`, the ASCII quotes of that family in `replaceContent` are converted:
 * - if the original contains only one variant of the family, every ASCII quote becomes that variant;
 * - if it contains both, an ASCII quote becomes the opening variant when it starts the text or
 *   follows whitespace, an opening bracket, or an opening curly quote, and the closing variant
 *   otherwise (so an apostrophe inside a word becomes U+2019).
 *
 * Families with no curly quote in `originalContent` are left untouched. Every ASCII quote of a
 * detected family is converted, including ones that were already ASCII in the original text.
 *
 * NOTE(review): an identical helper lives in multi-file-search-replace.ts; the two copies should
 * be extracted into one shared utility so they cannot drift apart.
 *
 * @param originalContent - The text matched in the file, used only to detect which curly quotes occur.
 * @param searchContent - The search text of the diff block. Currently unused; kept so existing callers keep working.
 * @param replaceContent - The replacement text whose ASCII quotes may be converted.
 * @returns `replaceContent` with ASCII quotes converted as described above.
 */
function preserveUnicodeCharacters(originalContent: string, searchContent: string, replaceContent: string): string {
	const quoteFamilies = [
		{ ascii: '"', open: "\u201C", close: "\u201D" },
		{ ascii: "'", open: "\u2018", close: "\u2019" },
	]

	// Keep only the families whose curly variants occur in the original. When just one variant
	// is present it stands in for both roles, so every ASCII quote maps to that variant.
	const activeFamilies = new Map<string, { open: string; close: string }>()
	for (const { ascii, open, close } of quoteFamilies) {
		const hasOpen = originalContent.includes(open)
		const hasClose = originalContent.includes(close)
		if (hasOpen || hasClose) {
			activeFamilies.set(ascii, { open: hasOpen ? open : close, close: hasClose ? close : open })
		}
	}

	if (activeFamilies.size === 0) {
		return replaceContent
	}

	// A quote is treated as "opening" when the character emitted just before it is one of these.
	const opensAfter = /[\s(\[{\u201C\u2018]/

	// Single left-to-right pass: the decision looks at the already-converted output, so a quote
	// nested directly inside another opening quote is also recognised as opening.
	let result = ""
	for (const char of replaceContent) {
		const family = activeFamilies.get(char)
		if (family === undefined) {
			result += char
			continue
		}

		const previous = result.length > 0 ? result[result.length - 1] : undefined
		const isOpening = previous === undefined || opensAfter.test(previous)
		result += isOpening ? family.open : family.close
	}

	return result
}

/**
* Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
* the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
Expand Down Expand Up @@ -550,6 +584,11 @@ Only use a single line of '=======' between search and replacement content, beca

// Get the matched lines from the original content
const matchedLines = resultLines.slice(matchIndex, matchIndex + searchLines.length)
const originalMatchedContent = matchedLines.join("\n")

// Preserve Unicode characters from the original content in the replacement
replaceContent = preserveUnicodeCharacters(originalMatchedContent, searchContent, replaceContent)
replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)

// Get the exact indentation (preserving tabs/spaces) of each line
const originalIndents = matchedLines.map((line) => {
Expand Down
Loading