Skip to content

Commit 6c4f16f

Browse files
committed
Support for HTML output syntax, enabled by default
Introduced the ability to choose between HTML and XML output, and made HTML the default. This means img tags are output as <img>, not <img />. XML is the default when using the XmlTreeBuilder. Control this with the Document.OutputSettings.syntax() method. Tightened the scope of what characters are escaped in attributes and textnodes, to align with the spec. Also, when using the extended escape entities map, only escape a character if the current output charset does not support it. This produces smaller, more legible HTML, with greater control over the output (by setting charset and escape mode). Fixes jhy#322 Fixes jhy#373 Fixes jhy#346 Fixes jhy#150
1 parent d651855 commit 6c4f16f

File tree

15 files changed

+227
-71
lines changed

15 files changed

+227
-71
lines changed

CHANGES

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
jsoup changelog
22

3-
*** Release 1.7.4 [PENDING]
4-
* Improved the performance of Element.text() by 2.2x
3+
*** Release 1.8.1 [PENDING]
4+
* Introduced the ability to choose between HTML and XML output, and made HTML the default. This means img tags are
5+
output as <img>, not <img />. XML is the default when using the XmlTreeBuilder. Control this with the
6+
Document.OutputSettings.syntax() method.
7+
8+
* Improved the performance of Element.text() by 3.2x
9+
10+
* Improved the performance of Element.html() by 1.7x
11+
12+
* Tightened the scope of what characters are escaped in attributes and textnodes, to align with the spec. Also, when
13+
using the extended escape entities map, only escape a character if the current output charset does not support it.
14+
This produces smaller, more legible HTML, with greater control over the output (by setting charset and escape mode).
515

616
* If pretty-print is disabled, don't trim outer whitespace in Element.html()
717
<https://github.com/jhy/jsoup/issues/368>

src/main/java/org/jsoup/nodes/Attribute.java

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,21 @@
22

33
import org.jsoup.helper.Validate;
44

5+
import java.util.Arrays;
56
import java.util.Map;
67

78
/**
89
A single key + value attribute. Keys are trimmed and normalised to lower-case.
910
1011
@author Jonathan Hedley, [email protected] */
1112
public class Attribute implements Map.Entry<String, String>, Cloneable {
13+
private static final String[] booleanAttributes = {
14+
"allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled",
15+
"formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize",
16+
"noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected",
17+
"sortable", "truespeed", "typemustmatch"
18+
};
19+
1220
private String key;
1321
private String value;
1422

@@ -66,15 +74,18 @@ public String setValue(String value) {
6674
@return HTML
6775
*/
6876
public String html() {
69-
return key + "=\"" + Entities.escape(value, (new Document("")).outputSettings()) + "\"";
77+
StringBuilder accum = new StringBuilder();
78+
html(accum, (new Document("")).outputSettings());
79+
return accum.toString();
7080
}
7181

7282
protected void html(StringBuilder accum, Document.OutputSettings out) {
73-
accum
74-
.append(key)
75-
.append("=\"")
76-
.append(Entities.escape(value, out))
77-
.append("\"");
83+
accum.append(key);
84+
if (!shouldCollapseAttribute(out)) {
85+
accum.append("=\"");
86+
Entities.escape(accum, value, out, true, false, false);
87+
accum.append('"');
88+
}
7889
}
7990

8091
/**
@@ -100,6 +111,15 @@ protected boolean isDataAttribute() {
100111
return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
101112
}
102113

114+
/**
115+
* Collapsible if it's a boolean attribute and value is empty or same as name
116+
*/
117+
protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
118+
return ("".equals(value) || value.equalsIgnoreCase(key))
119+
&& out.syntax() == Document.OutputSettings.Syntax.html
120+
&& Arrays.binarySearch(booleanAttributes, key) >= 0;
121+
}
122+
103123
@Override
104124
public boolean equals(Object o) {
105125
if (this == o) return true;

src/main/java/org/jsoup/nodes/Document.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,12 +218,18 @@ public Document clone() {
218218
* A Document's output settings control the form of the text() and html() methods.
219219
*/
220220
public static class OutputSettings implements Cloneable {
221+
/**
222+
* The output serialization syntax.
223+
*/
224+
public enum Syntax {html, xml}
225+
221226
private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
222227
private Charset charset = Charset.forName("UTF-8");
223228
private CharsetEncoder charsetEncoder = charset.newEncoder();
224229
private boolean prettyPrint = true;
225230
private boolean outline = false;
226231
private int indentAmount = 1;
232+
private Syntax syntax = Syntax.html;
227233

228234
public OutputSettings() {}
229235

@@ -240,7 +246,8 @@ public Entities.EscapeMode escapeMode() {
240246
}
241247

242248
/**
243-
* Set the document's escape mode
249+
* Set the document's escape mode, which determines how characters are escaped when the output character set
250+
* does not support a given character:- using either a named or a numbered escape.
244251
* @param escapeMode the new escape mode to use
245252
* @return the document's output settings, for chaining
246253
*/
@@ -287,6 +294,25 @@ CharsetEncoder encoder() {
287294
return charsetEncoder;
288295
}
289296

297+
/**
298+
* Get the document's current output syntax.
299+
* @return current syntax
300+
*/
301+
public Syntax syntax() {
302+
return syntax;
303+
}
304+
305+
/**
306+
* Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
307+
* {@code xml}, with self-closing tags.
308+
* @param syntax serialization syntax
309+
* @return the document's output settings, for chaining
310+
*/
311+
public OutputSettings syntax(Syntax syntax) {
312+
this.syntax = syntax;
313+
return this;
314+
}
315+
290316
/**
291317
* Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
292318
* the output, and the output will generally look like the input.

src/main/java/org/jsoup/nodes/DocumentType.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ public String nodeName() {
3434
void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
3535
accum.append("<!DOCTYPE ").append(attr("name"));
3636
if (!StringUtil.isBlank(attr("publicId")))
37-
accum.append(" PUBLIC \"").append(attr("publicId")).append("\"");
37+
accum.append(" PUBLIC \"").append(attr("publicId")).append('"');
3838
if (!StringUtil.isBlank(attr("systemId")))
39-
accum.append(" \"").append(attr("systemId")).append("\"");
39+
accum.append(" \"").append(attr("systemId")).append('"');
4040
accum.append('>');
4141
}
4242

src/main/java/org/jsoup/nodes/Element.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,8 +1072,13 @@ void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out)
10721072
.append(tagName());
10731073
attributes.html(accum, out);
10741074

1075-
if (childNodes.isEmpty() && tag.isSelfClosing())
1076-
accum.append(" />");
1075+
// selfclosing includes unknown tags, isEmpty defines tags that are always empty
1076+
if (childNodes.isEmpty() && tag.isSelfClosing()) {
1077+
if (out.syntax() == Document.OutputSettings.Syntax.html && tag.isEmpty())
1078+
accum.append('>');
1079+
else
1080+
accum.append(" />"); // <img> in html, <img /> in xml
1081+
}
10771082
else
10781083
accum.append(">");
10791084
}

src/main/java/org/jsoup/nodes/Entities.java

Lines changed: 67 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jsoup.nodes;
22

3+
import org.jsoup.helper.StringUtil;
34
import org.jsoup.parser.Parser;
45

56
import java.io.IOException;
@@ -16,7 +17,7 @@
1617
*/
1718
public class Entities {
1819
public enum EscapeMode {
19-
/** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */
20+
/** Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. */
2021
xhtml(xhtmlByVal),
2122
/** Default HTML output entities. */
2223
base(baseByVal),
@@ -71,38 +72,86 @@ public static Character getCharacterByName(String name) {
7172
}
7273

7374
static String escape(String string, Document.OutputSettings out) {
74-
return escape(string, out.encoder(), out.escapeMode());
75+
StringBuilder accum = new StringBuilder(string.length() * 2);
76+
escape(accum, string, out, false, false, false);
77+
return accum.toString();
7578
}
7679

77-
static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) {
78-
StringBuilder accum = new StringBuilder(string.length() * 2);
79-
Map<Character, String> map = escapeMode.getMap();
80+
// this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
81+
static void escape(StringBuilder accum, String string, Document.OutputSettings out,
82+
boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) {
8083

84+
boolean lastWasWhite = false;
85+
boolean reachedNonWhite = false;
86+
EscapeMode escapeMode = out.escapeMode();
87+
CharsetEncoder encoder = out.encoder();
88+
Map<Character, String> map = escapeMode.getMap();
8189
final int length = string.length();
82-
for (int offset = 0; offset < length; ) {
83-
final int codePoint = string.codePointAt(offset);
8490

91+
int codePoint;
92+
for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
93+
codePoint = string.codePointAt(offset);
94+
95+
if (normaliseWhite) {
96+
if (StringUtil.isWhitespace(codePoint)) {
97+
if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
98+
continue;
99+
accum.append(' ');
100+
lastWasWhite = true;
101+
continue;
102+
} else {
103+
lastWasWhite = false;
104+
reachedNonWhite = true;
105+
}
106+
}
85107
// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
86108
if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
87109
final char c = (char) codePoint;
88-
if (map.containsKey(c))
89-
accum.append('&').append(map.get(c)).append(';');
90-
else if (encoder.canEncode(c))
91-
accum.append(c);
92-
else
93-
accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
110+
// html specific and required escapes:
111+
switch (c) {
112+
case '&':
113+
accum.append("&amp;");
114+
break;
115+
case 0xA0:
116+
if (escapeMode != EscapeMode.xhtml)
117+
accum.append("&nbsp;");
118+
else
119+
accum.append(c);
120+
break;
121+
case '<':
122+
if (!inAttribute)
123+
accum.append("&lt;");
124+
else
125+
accum.append(c);
126+
break;
127+
case '>':
128+
if (!inAttribute)
129+
accum.append("&gt;");
130+
else
131+
accum.append(c);
132+
break;
133+
case '"':
134+
if (inAttribute)
135+
accum.append("&quot;");
136+
else
137+
accum.append(c);
138+
break;
139+
default:
140+
if (encoder.canEncode(c))
141+
accum.append(c);
142+
else if (map.containsKey(c))
143+
accum.append('&').append(map.get(c)).append(';');
144+
else
145+
accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
146+
}
94147
} else {
95148
final String c = new String(Character.toChars(codePoint));
96149
if (encoder.canEncode(c))
97150
accum.append(c);
98151
else
99152
accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
100153
}
101-
102-
offset += Character.charCount(codePoint);
103154
}
104-
105-
return accum.toString();
106155
}
107156

108157
static String unescape(String string) {
@@ -119,11 +168,11 @@ static String unescape(String string, boolean strict) {
119168
return Parser.unescapeEntities(string, strict);
120169
}
121170

171+
122172
// xhtml has restricted entities
123173
private static final Object[][] xhtmlArray = {
124174
{"quot", 0x00022},
125175
{"amp", 0x00026},
126-
{"apos", 0x00027},
127176
{"lt", 0x0003C},
128177
{"gt", 0x0003E}
129178
};

src/main/java/org/jsoup/nodes/TextNode.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,9 @@ void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out)
9393
if (out.prettyPrint() && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
9494
indent(accum, depth, out);
9595

96-
String html = Entities.escape(getWholeText(), out);
97-
if (out.prettyPrint() && parent() instanceof Element && !Element.preserveWhitespace((Element) parent()))
98-
StringUtil.appendNormalisedWhitespace(accum, html, false);
99-
else
100-
accum.append(html);
96+
boolean normaliseWhite = out.prettyPrint() && parent() instanceof Element
97+
&& !Element.preserveWhitespace((Element) parent());
98+
Entities.escape(accum, getWholeText(), out, false, normaliseWhite, false);
10199
}
102100

103101
void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}

src/main/java/org/jsoup/parser/Tag.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,11 +230,11 @@ public String toString() {
230230
"var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q",
231231
"sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
232232
"option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
233-
"summary", "command", "device"
233+
"summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
234234
};
235235
private static final String[] emptyTags = {
236236
"meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
237-
"device"
237+
"device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
238238
};
239239
private static final String[] formatAsInlineTags = {
240240
"title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",

src/main/java/org/jsoup/parser/XmlTreeBuilder.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,18 @@
77
import java.util.List;
88

99
/**
10+
* Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
11+
* document.
12+
* <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
13+
*
1014
* @author Jonathan Hedley
1115
*/
1216
public class XmlTreeBuilder extends TreeBuilder {
1317
@Override
1418
protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
1519
super.initialiseParse(input, baseUri, errors);
1620
stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack)
21+
doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
1722
}
1823

1924
@Override

0 commit comments

Comments
 (0)