@@ -72,6 +72,7 @@ public static Document load(InputStream in, String charsetName, String baseUri,
72
72
73
73
// reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
74
74
// switching the charset midstream when a meta http-equiv tag defines the charset.
75
+ // todo - this is getting gnarly. needs a rewrite.
75
76
static Document parseByteData (ByteBuffer byteData , String charsetName , String baseUri , Parser parser ) {
76
77
String docData ;
77
78
Document doc = null ;
@@ -81,7 +82,6 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
81
82
doc = parser .parseInput (docData , baseUri );
82
83
Element meta = doc .select ("meta[http-equiv=content-type], meta[charset]" ).first ();
83
84
if (meta != null ) { // if not found, will keep utf-8 as best attempt
84
-
85
85
String foundCharset ;
86
86
if (meta .hasAttr ("http-equiv" )) {
87
87
foundCharset = getCharsetFromContentType (meta .attr ("content" ));
@@ -110,13 +110,15 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
110
110
Validate .notEmpty (charsetName , "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML" );
111
111
docData = Charset .forName (charsetName ).decode (byteData ).toString ();
112
112
}
113
+ // UTF-8 BOM indicator. takes precedence over everything else. rarely used. re-decodes in case the above decoded incorrectly
114
+ if (docData .length () > 0 && docData .charAt (0 ) == 65279 ) {
115
+ byteData .rewind ();
116
+ docData = Charset .forName (defaultCharset ).decode (byteData ).toString ();
117
+ docData = docData .substring (1 );
118
+ charsetName = defaultCharset ;
119
+ doc = null ;
120
+ }
113
121
if (doc == null ) {
114
- // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
115
- // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
116
- // into head mode
117
- if (docData .length () > 0 && docData .charAt (0 ) == 65279 )
118
- docData = docData .substring (1 );
119
-
120
122
doc = parser .parseInput (docData , baseUri );
121
123
doc .outputSettings ().charset (charsetName );
122
124
}
0 commit comments