Fix issue with UTF-8 BOM when charset only in HTML.

jhy · jhy · commit 3f9f33d88355 · 2013-11-17T21:12:57.000-08:00
Fixes jhy#348
diff --git a/CHANGES b/CHANGES
@@ -19,6 +19,11 @@ jsoup changelog
  * Fixed an issue where <svg><img/></svg> was parsed as <svg><image/></svg>
    <https://github.com/jhy/jsoup/issues/364>
 
+ * Fixed an issue where a UTF-8 BOM character was not detected if the HTTP response did not specify a charset but
+   the HTML body did, leading to the head contents incorrectly being parsed into the body. Changed the behavior so that
+   when the UTF-8 BOM is detected, it will take precedence for determining the charset to decode with.
+   <https://github.com/jhy/jsoup/issues/348>
+
 *** Release 1.7.3 [2013-Nov-10]
  * Introduced FormElement, providing easy access to form controls and their data, and the ability to submit forms
    with Jsoup.Connect.
diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -72,6 +72,7 @@ public static Document load(InputStream in, String charsetName, String baseUri,
 
     // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
     // switching the chartset midstream when a meta http-equiv tag defines the charset.
+    // todo - this is getting gnarly. needs a rewrite.
     static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
         String docData;
         Document doc = null;
@@ -81,7 +82,6 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
             doc = parser.parseInput(docData, baseUri);
             Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
             if (meta != null) { // if not found, will keep utf-8 as best attempt
-
                 String foundCharset;
                 if (meta.hasAttr("http-equiv")) {
                     foundCharset = getCharsetFromContentType(meta.attr("content"));
@@ -110,13 +110,15 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
             Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
             docData = Charset.forName(charsetName).decode(byteData).toString();
         }
+        // UTF-8 BOM indicator. takes precedence over everything else. rarely used. re-decodes in case the above decoded incorrectly
+        if (docData.length() > 0 && docData.charAt(0) == 65279) {
+            byteData.rewind();
+            docData = Charset.forName(defaultCharset).decode(byteData).toString();
+            docData = docData.substring(1);
+            charsetName = defaultCharset;
+            doc = null;
+        }
         if (doc == null) {
-            // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
-            // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
-            // into head mode
-            if (docData.length() > 0 && docData.charAt(0) == 65279)
-                docData = docData.substring(1);
-
             doc = parser.parseInput(docData, baseUri);
             doc.outputSettings().charset(charsetName);
         }
diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -35,6 +35,14 @@ public void testCharset() {
         assertEquals("One", doc.head().text());
     }
 
+    @Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
+        String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
+        ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
+        Document doc = DataUtil.parseByteData(buffer, null, "http://foo.com/", Parser.htmlParser());
+        assertEquals("One", doc.head().text());
+        assertEquals("UTF-8", doc.outputSettings().charset().displayName());
+    }
+
     @Test
     public void shouldNotThrowExceptionOnEmptyCharset() {
         assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset="));