@@ -18,6 +18,7 @@ public class Document extends Element {
18
18
private OutputSettings outputSettings = new OutputSettings ();
19
19
private QuirksMode quirksMode = QuirksMode .noQuirks ;
20
20
private String location ;
21
+ private boolean updateMetaCharset = false ;
21
22
22
23
/**
23
24
Create a new, empty Document.
@@ -128,6 +129,8 @@ public Document normalise() {
128
129
normaliseStructure ("head" , htmlEl );
129
130
normaliseStructure ("body" , htmlEl );
130
131
132
+ ensureMetaCharsetElement ();
133
+
131
134
return this ;
132
135
}
133
136
@@ -206,13 +209,157 @@ public Element text(String text) {
206
209
public String nodeName () {
207
210
return "#document" ;
208
211
}
212
+
213
+ /**
214
+ * Sets the charset used in this document. This method is equivalent
215
+ * to {@link OutputSettings#charset(java.nio.charset.Charset)
216
+ * OutputSettings.charset(Charset)} but in addition it updates the
217
+ * charset / encoding element within the document.
218
+ *
219
+ * <p>This enables
220
+ * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
221
+ *
222
+ * <p>If there's no element with charset / encoding information yet it will
223
+ * be created. Obsolete charset / encoding definitions are removed!</p>
224
+ *
225
+ * <p><b>Elements used:</b></p>
226
+ *
227
+ * <ul>
228
+ * <li><b>Html:</b> <i><meta charset="CHARSET"></i></li>
229
+ * <li><b>Xml:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li>
230
+ * </ul>
231
+ *
232
+ * @param charset Charset
233
+ *
234
+ * @see #updateMetaCharsetElement(boolean)
235
+ * @see OutputSettings#charset(java.nio.charset.Charset)
236
+ */
237
+ public void charset (Charset charset ) {
238
+ updateMetaCharsetElement (true );
239
+ outputSettings .charset (charset );
240
+ ensureMetaCharsetElement ();
241
+ }
242
+
243
+ /**
244
+ * Returns the charset used in this document. This method is equivalent
245
+ * to {@link OutputSettings#charset()}.
246
+ *
247
+ * @return Current Charset
248
+ *
249
+ * @see OutputSettings#charset()
250
+ */
251
+ public Charset charset () {
252
+ return outputSettings .charset ();
253
+ }
254
+
255
+ /**
256
+ * Sets whether the element with charset information in this document is
257
+ * updated on changes through {@link #charset(java.nio.charset.Charset)
258
+ * Document.charset(Charset)} or not.
259
+ *
260
+ * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements
261
+ * modified.</p>
262
+ *
263
+ * @param update If <tt>true</tt> the element updated on charset
264
+ * changes, <tt>false</tt> if not
265
+ *
266
+ * @see #charset(java.nio.charset.Charset)
267
+ */
268
+ public void updateMetaCharsetElement (boolean update ) {
269
+ this .updateMetaCharset = true ;
270
+ }
271
+
272
+ /**
273
+ * Returns whether the element with charset information in this document is
274
+ * updated on changes through {@link #charset(java.nio.charset.Charset)
275
+ * Document.charset(Charset)} or not.
276
+ *
277
+ * @return Returns <tt>true</tt> if the element is updated on charset
278
+ * changes, <tt>false</tt> if not
279
+ */
280
+ public boolean updateMetaCharsetElement () {
281
+ return updateMetaCharset ;
282
+ }
209
283
210
284
@ Override
211
285
public Document clone () {
212
286
Document clone = (Document ) super .clone ();
213
287
clone .outputSettings = this .outputSettings .clone ();
214
288
return clone ;
215
289
}
290
+
291
+ /**
292
+ * Ensures a meta charset (html) or xml declaration (xml) with the current
293
+ * encoding used. This only applies with
294
+ * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
295
+ * <tt>true</tt>, otherwise this method does nothing.
296
+ *
297
+ * <ul>
298
+ * <li>An exsiting element gets updated with the current charset</li>
299
+ * <li>If there's no element yet it will be inserted</li>
300
+ * <li>Obsolete elements are removed</li>
301
+ * </ul>
302
+ *
303
+ * <p><b>Elements used:</b></p>
304
+ *
305
+ * <ul>
306
+ * <li><b>Html:</b> <i><meta charset="CHARSET"></i></li>
307
+ * <li><b>Xml:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li>
308
+ * </ul>
309
+ */
310
+ private void ensureMetaCharsetElement () {
311
+ if (updateMetaCharset == true ) {
312
+ OutputSettings .Syntax syntax = outputSettings ().syntax ();
313
+
314
+ if (syntax == OutputSettings .Syntax .html ) {
315
+ Element metaCharset = select ("meta[charset]" ).first ();
316
+
317
+ if (metaCharset != null ) {
318
+ metaCharset .attr ("charset" , charset ().displayName ());
319
+ } else {
320
+ Element head = head ();
321
+
322
+ if (head != null ) {
323
+ head .appendElement ("meta" ).attr ("charset" , charset ().displayName ());
324
+ }
325
+ }
326
+
327
+ // Remove obsolete elements
328
+ select ("meta[name=charset]" ).remove ();
329
+ } else if (syntax == OutputSettings .Syntax .xml ) {
330
+ Node node = childNodes ().get (0 );
331
+
332
+ if (node instanceof XmlDeclaration ) {
333
+ XmlDeclaration decl = (XmlDeclaration ) node ;
334
+
335
+ if (decl .attr (XmlDeclaration .DECL_KEY ).equals ("xml" )) {
336
+ decl .attr ("encoding" , charset ().displayName ());
337
+
338
+ final String version = decl .attr ("version" );
339
+
340
+ if (version != null ) {
341
+ decl .attr ("version" , "1.0" );
342
+ }
343
+ } else {
344
+ decl = new XmlDeclaration ("xml" , baseUri , false );
345
+ decl .attr ("version" , "1.0" );
346
+ decl .attr ("encoding" , charset ().displayName ());
347
+
348
+ prependChild (decl );
349
+ }
350
+ } else {
351
+ XmlDeclaration decl = new XmlDeclaration ("xml" , baseUri , false );
352
+ decl .attr ("version" , "1.0" );
353
+ decl .attr ("encoding" , charset ().displayName ());
354
+
355
+ prependChild (decl );
356
+ }
357
+ } else {
358
+ // Unsupported syntax - nothing to do yet
359
+ }
360
+ }
361
+ }
362
+
216
363
217
364
/**
218
365
* A Document's output settings control the form of the text() and html() methods.
@@ -232,7 +379,7 @@ public enum Syntax {html, xml}
232
379
private Syntax syntax = Syntax .html ;
233
380
234
381
/** Creates output settings; the constructor body is empty, so fields keep their declared initial values. */
public OutputSettings () {}
235
-
382
+
236
383
/**
237
384
* Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
238
385
* entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
@@ -274,7 +421,6 @@ public Charset charset() {
274
421
* @return the document's output settings, for chaining
275
422
*/
276
423
public OutputSettings charset (Charset charset ) {
277
- // todo: this should probably update the doc's meta charset
278
424
this .charset = charset ;
279
425
charsetEncoder = charset .newEncoder ();
280
426
return this ;
0 commit comments