Skip to content

Commit 684ae68

Browse files
committed
Merge branch 'pr/486'
2 parents 6e14804 + 40e9ed5 commit 684ae68

File tree

4 files changed

+408
-13
lines changed

4 files changed

+408
-13
lines changed

CHANGES

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ jsoup changelog
1414
* File upload support. Added the ability to specify input streams for POST data, which will upload content in
1515
MIME multipart/form-data encoding.
1616

17+
* Add a meta-charset element to documents when setting the character set, so that the document's charset is
18+
unambiguous.
19+
<https://github.com/jhy/jsoup/pull/486>
20+
1721
* Added ability to disable TLS (SSL) certificate validation. Helpful if you're hitting a host with a bad cert,
1822
or your JDK doesn't support SNI.
1923
<https://github.com/jhy/jsoup/pull/343>

src/main/java/org/jsoup/nodes/Document.java

Lines changed: 148 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ public class Document extends Element {
1818
private OutputSettings outputSettings = new OutputSettings();
1919
private QuirksMode quirksMode = QuirksMode.noQuirks;
2020
private String location;
21+
private boolean updateMetaCharset = false;
2122

2223
/**
2324
Create a new, empty Document.
@@ -128,6 +129,8 @@ public Document normalise() {
128129
normaliseStructure("head", htmlEl);
129130
normaliseStructure("body", htmlEl);
130131

132+
ensureMetaCharsetElement();
133+
131134
return this;
132135
}
133136

@@ -206,13 +209,157 @@ public Element text(String text) {
206209
public String nodeName() {
207210
return "#document";
208211
}
212+
213+
/**
214+
* Sets the charset used in this document. This method is equivalent
215+
* to {@link OutputSettings#charset(java.nio.charset.Charset)
216+
* OutputSettings.charset(Charset)} but in addition it updates the
217+
* charset / encoding element within the document.
218+
*
219+
* <p>This enables
220+
* {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
221+
*
222+
* <p>If there's no element with charset / encoding information yet it will
223+
* be created. Obsolete charset / encoding definitions are removed!</p>
224+
*
225+
* <p><b>Elements used:</b></p>
226+
*
227+
* <ul>
228+
* <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
229+
* <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
230+
* </ul>
231+
*
232+
* @param charset Charset
233+
*
234+
* @see #updateMetaCharsetElement(boolean)
235+
* @see OutputSettings#charset(java.nio.charset.Charset)
236+
*/
237+
public void charset(Charset charset) {
238+
updateMetaCharsetElement(true); // enable updating first: ensureMetaCharsetElement() does nothing while the flag is false
239+
outputSettings.charset(charset); // store the charset before ensureMetaCharsetElement() reads it back through charset()
240+
ensureMetaCharsetElement();
241+
}
242+
243+
/**
244+
* Returns the charset used in this document. This method is equivalent
245+
* to {@link OutputSettings#charset()}.
246+
*
247+
* @return Current Charset
248+
*
249+
* @see OutputSettings#charset()
250+
*/
251+
public Charset charset() {
252+
return this.outputSettings.charset(); // plain delegation to the output settings
253+
}
254+
255+
/**
256+
* Sets whether the element with charset information in this document is
257+
* updated on changes through {@link #charset(java.nio.charset.Charset)
258+
* Document.charset(Charset)} or not.
259+
*
260+
* <p>If set to <tt>false</tt> <i>(default)</i> there are no elements
261+
* modified.</p>
262+
*
263+
* @param update If <tt>true</tt> the element is updated on charset
264+
* changes, <tt>false</tt> if not
265+
*
266+
* @see #charset(java.nio.charset.Charset)
267+
*/
268+
public void updateMetaCharsetElement(boolean update) {
269+
this.updateMetaCharset = update; // store the caller's choice; passing false must switch updating off again
270+
}
271+
272+
/**
273+
* Returns whether the element with charset information in this document is
274+
* updated on changes through {@link #charset(java.nio.charset.Charset)
275+
* Document.charset(Charset)} or not.
276+
*
277+
* @return Returns <tt>true</tt> if the element is updated on charset
278+
* changes, <tt>false</tt> if not
279+
*/
280+
public boolean updateMetaCharsetElement() {
281+
return this.updateMetaCharset; // current state of the update flag
282+
}
209283

210284
@Override
211285
public Document clone() {
212286
final Document copy = (Document) super.clone();
213287
copy.outputSettings = outputSettings.clone(); // give the copy its own settings object instead of sharing this one
214288
return copy;
215289
}
290+
291+
/**
292+
* Ensures a meta charset (html) or xml declaration (xml) with the current
293+
* encoding used. This only applies with
294+
* {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
295+
* <tt>true</tt>, otherwise this method does nothing.
296+
*
297+
* <ul>
298+
* <li>An existing element gets updated with the current charset</li>
299+
* <li>If there's no element yet it will be inserted</li>
300+
* <li>Obsolete elements are removed</li>
301+
* </ul>
302+
*
303+
* <p><b>Elements used:</b></p>
304+
*
305+
* <ul>
306+
* <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
307+
* <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
308+
* </ul>
309+
*/
310+
private void ensureMetaCharsetElement() {
311+
if (updateMetaCharset) {
312+
OutputSettings.Syntax syntax = outputSettings().syntax();
313+
314+
if (syntax == OutputSettings.Syntax.html) {
315+
Element metaCharset = select("meta[charset]").first();
316+
317+
if (metaCharset != null) {
318+
metaCharset.attr("charset", charset().displayName());
319+
} else {
320+
Element head = head();
321+
322+
if (head != null) {
323+
head.appendElement("meta").attr("charset", charset().displayName());
324+
}
325+
}
326+
327+
// Remove obsolete elements
328+
select("meta[name=charset]").remove();
329+
} else if (syntax == OutputSettings.Syntax.xml) {
330+
Node node = childNodes().isEmpty() ? null : childNodes().get(0); // no children: null fails instanceof, so a declaration is inserted below
331+
332+
if (node instanceof XmlDeclaration) {
333+
XmlDeclaration decl = (XmlDeclaration) node;
334+
335+
if (decl.attr(XmlDeclaration.DECL_KEY).equals("xml")) {
336+
decl.attr("encoding", charset().displayName());
337+
338+
final String version = decl.attr("version");
339+
340+
if (version != null) {
341+
decl.attr("version", "1.0");
342+
}
343+
} else {
344+
decl = new XmlDeclaration("xml", baseUri, false);
345+
decl.attr("version", "1.0");
346+
decl.attr("encoding", charset().displayName());
347+
348+
prependChild(decl);
349+
}
350+
} else {
351+
XmlDeclaration decl = new XmlDeclaration("xml", baseUri, false);
352+
decl.attr("version", "1.0");
353+
decl.attr("encoding", charset().displayName());
354+
355+
prependChild(decl);
356+
}
357+
} else {
358+
// Unsupported syntax - nothing to do yet
359+
}
360+
}
361+
}
362+
216363

217364
/**
218365
* A Document's output settings control the form of the text() and html() methods.
@@ -232,7 +379,7 @@ public enum Syntax {html, xml}
232379
private Syntax syntax = Syntax.html;
233380

234381
public OutputSettings() {}
235-
382+
236383
/**
237384
* Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
238385
* entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
@@ -274,7 +421,6 @@ public Charset charset() {
274421
* @return the document's output settings, for chaining
275422
*/
276423
public OutputSettings charset(Charset charset) {
277-
// todo: this should probably update the doc's meta charset
278424
this.charset = charset;
279425
charsetEncoder = charset.newEncoder();
280426
return this;

src/main/java/org/jsoup/nodes/XmlDeclaration.java

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
66
@author Jonathan Hedley, jonathan@hedley.net */
77
public class XmlDeclaration extends Node {
8-
private static final String DECL_KEY = "declaration";
8+
static final String DECL_KEY = "declaration";
99
private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)
1010

1111
/**
@@ -29,9 +29,29 @@ public String nodeName() {
2929
@return XML declaration
3030
*/
3131
public String getWholeDeclaration() {
32-
return attributes.get(DECL_KEY);
32+
final String decl = attributes.get(DECL_KEY);
33+
34+
if ("xml".equals(decl) && attributes.size() > 1) { // constant-first equals is null-safe, like the plain return this replaced
35+
StringBuilder sb = new StringBuilder(decl);
36+
final String version = attributes.get("version");
37+
38+
if (version != null) {
39+
sb.append(" version=\"").append(version).append("\"");
40+
}
41+
42+
final String encoding = attributes.get("encoding");
43+
44+
if (encoding != null) {
45+
sb.append(" encoding=\"").append(encoding).append("\"");
46+
}
47+
48+
return sb.toString();
49+
}
50+
else {
51+
return decl; // same value as the lookup above; no second attribute access needed
52+
}
3353
}
34-
54+
3555
void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
3656
accum
3757
.append("<")

0 commit comments

Comments
 (0)