Skip to content

Commit b32b900

Browse files
bukzorclaude
andcommitted
Fix #160: Add proper text escaping in FormatHtml
HTML text nodes containing &, <, > were output without escaping, causing xq's output to be unparseable when piped back through xq -j.

This commit adds:
- New escapeTextContent() function for minimal entity escaping
- Modified FormatHtml to escape text nodes with &amp;, &lt;, &gt;
- Tests verifying the output is valid XML

Example issue:
    echo '<html>1 &amp; 2</html>' | xq | xq -j
    # Before: Error - bare & in output
    # After: Success - properly escaped as &amp;

This is a critical fix preventing data corruption when round-tripping HTML through xq.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
1 parent 727399c commit b32b900

File tree

2 files changed

+64
-0
lines changed

2 files changed

+64
-0
lines changed

cmd/root_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"strings"
99
"testing"
1010

11+
"github.com/antchfx/xmlquery"
1112
"github.com/sibprogrammer/xq/internal/utils"
1213
"github.com/spf13/cobra"
1314
"github.com/spf13/pflag"
@@ -102,6 +103,57 @@ func TestRootCmd(t *testing.T) {
102103
assert.ErrorContains(t, err, "invalid argument")
103104
}
104105

106+
func TestEscapedTextNodes(t *testing.T) {
107+
// Test case 1: ampersand entity - reproduce issue #160
108+
// xq outputs bare & which fails when parsed as XML (used by -j flag)
109+
t.Run("ampersand entity output is valid XML", func(t *testing.T) {
110+
input := "<html>1 &amp; 2</html>"
111+
112+
// First pass: format the HTML
113+
reader1 := strings.NewReader(input)
114+
var output1 bytes.Buffer
115+
err := utils.FormatHtml(reader1, &output1, "", utils.ColorsDisabled)
116+
assert.Nil(t, err)
117+
118+
result1 := strings.TrimSpace(output1.String())
119+
t.Logf("First pass output: %q", result1)
120+
121+
// Second pass: try to parse as XML (this is what `xq -j` does)
122+
reader2 := strings.NewReader(result1)
123+
_, err = xmlquery.Parse(reader2)
124+
assert.Nil(t, err, "xq output should be parseable as XML (for -j flag)")
125+
})
126+
127+
// Test case 2: less-than and greater-than entities - reproduce issue #160
128+
// xq outputs bare < and > which are parsed as tags
129+
t.Run("less-than and greater-than entities output is valid XML", func(t *testing.T) {
130+
input := "<html>is &lt;bold&gt; a valid tag?</html>"
131+
132+
// First pass: format the HTML
133+
reader1 := strings.NewReader(input)
134+
var output1 bytes.Buffer
135+
err := utils.FormatHtml(reader1, &output1, "", utils.ColorsDisabled)
136+
assert.Nil(t, err)
137+
138+
result1 := strings.TrimSpace(output1.String())
139+
t.Logf("First pass output: %q", result1)
140+
141+
// Second pass: try to parse as XML (this is what `xq -j` does)
142+
reader2 := strings.NewReader(result1)
143+
doc, err := xmlquery.Parse(reader2)
144+
assert.Nil(t, err, "xq output should be parseable as XML (for -j flag)")
145+
146+
// Verify the text content is preserved correctly
147+
if doc != nil {
148+
textNode := xmlquery.FindOne(doc, "//html")
149+
if textNode != nil {
150+
assert.Equal(t, "is <bold> a valid tag?", textNode.InnerText(),
151+
"Text content should preserve the literal < and > characters")
152+
}
153+
}
154+
})
155+
}
156+
105157
func TestProcessAsJSON(t *testing.T) {
106158
tests := []struct {
107159
name string

internal/utils/utils.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,9 @@ func FormatHtml(reader io.Reader, writer io.Writer, indent string, colors int) e
338338
case html.TextToken:
339339
str := normalizeSpaces(string(tokenizer.Text()), indent, level)
340340
hasContent = str != ""
341+
if hasContent {
342+
str = escapeTextContent(str)
343+
}
341344
_, _ = fmt.Fprint(writer, str)
342345
case html.StartTagToken, html.SelfClosingTagToken:
343346
if level > 0 {
@@ -585,6 +588,15 @@ func escapeText(input string) (string, error) {
585588
return result, nil
586589
}
587590

591+
// textContentEscaper rewrites the three characters that must not appear
// literally in XML text content. It is built once at package init; a
// strings.Replacer is safe for concurrent use.
var textContentEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)

// escapeTextContent escapes the minimal set of characters (&, < and >) needed
// for input to be emitted as text content without causing XML parsing errors.
// Quotes are left untouched because they are only significant inside
// attribute values.
//
// The replacement is done in a single pass over input, so an ampersand that is
// produced by escaping is never escaped a second time, while an ampersand
// already present in input (including one that starts an entity such as
// "&amp;") is always escaped.
func escapeTextContent(input string) string {
	return textContentEscaper.Replace(input)
}
599+
588600
func normalizeSpaces(input string, indent string, level int) string {
589601
if strings.TrimSpace(input) == "" {
590602
input = ""

0 commit comments

Comments
 (0)