Skip to content

Commit 3f9f33d

Browse files
committed
Fix issue with UTF-8 BOM when charset only in HTML.
Fixes jhy#348
1 parent 6c4f16f commit 3f9f33d

File tree

3 files changed

+22
-7
lines changed

3 files changed

+22
-7
lines changed

CHANGES

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ jsoup changelog
1919
* Fixed an issue where <svg><img/></svg> was parsed as <svg><image/></svg>
2020
<https://github.com/jhy/jsoup/issues/364>
2121

22+
* Fixed an issue where a UTF-8 BOM character was not detected if the HTTP response did not specify a charset, and
23+
the HTML body did, leading to the head contents incorrectly being parsed into the body. Changed the behavior so that
24+
when the UTF-8 BOM is detected, it will take precedence for determining the charset to decode with.
25+
<https://github.com/jhy/jsoup/issues/348>
26+
2227
*** Release 1.7.3 [2013-Nov-10]
2328
* Introduced FormElement, providing easy access to form controls and their data, and the ability to submit forms
2429
with Jsoup.Connect.

src/main/java/org/jsoup/helper/DataUtil.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ public static Document load(InputStream in, String charsetName, String baseUri,
7272

7373
// reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
7474
// switching the charset midstream when a meta http-equiv tag defines the charset.
75+
// todo - this is getting gnarly. needs a rewrite.
7576
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
7677
String docData;
7778
Document doc = null;
@@ -81,7 +82,6 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
8182
doc = parser.parseInput(docData, baseUri);
8283
Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
8384
if (meta != null) { // if not found, will keep utf-8 as best attempt
84-
8585
String foundCharset;
8686
if (meta.hasAttr("http-equiv")) {
8787
foundCharset = getCharsetFromContentType(meta.attr("content"));
@@ -110,13 +110,15 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
110110
Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
111111
docData = Charset.forName(charsetName).decode(byteData).toString();
112112
}
113+
// UTF-8 BOM indicator. takes precedence over everything else. rarely used. re-decodes in case the above decoded incorrectly
114+
if (docData.length() > 0 && docData.charAt(0) == 65279) {
115+
byteData.rewind();
116+
docData = Charset.forName(defaultCharset).decode(byteData).toString();
117+
docData = docData.substring(1);
118+
charsetName = defaultCharset;
119+
doc = null;
120+
}
113121
if (doc == null) {
114-
// there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
115-
// in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
116-
// into head mode
117-
if (docData.length() > 0 && docData.charAt(0) == 65279)
118-
docData = docData.substring(1);
119-
120122
doc = parser.parseInput(docData, baseUri);
121123
doc.outputSettings().charset(charsetName);
122124
}

src/test/java/org/jsoup/helper/DataUtilTest.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,14 @@ public void testCharset() {
3535
assertEquals("One", doc.head().text());
3636
}
3737

38+
@Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
39+
String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
40+
ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
41+
Document doc = DataUtil.parseByteData(buffer, null, "http://foo.com/", Parser.htmlParser());
42+
assertEquals("One", doc.head().text());
43+
assertEquals("UTF-8", doc.outputSettings().charset().displayName());
44+
}
45+
3846
@Test
3947
public void shouldNotThrowExceptionOnEmptyCharset() {
4048
assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset="));

0 commit comments

Comments
 (0)