Skip to content

Add a parse method to encode illegal tag in html body #1359

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/main/java/org/jsoup/Jsoup.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ public static Document parse(String html) {
return Parser.parse(html, "");
}

/**
Parse HTML into a Document using the parser's tag-legalizing mode, in which tags the parser does not recognise are
encoded as text instead of being kept as elements.
As no base URI is specified, absolute URL detection relies on the HTML including a {@code <base href>} tag.

@param html HTML to parse
@return the parsed Document

@see #parse(String, String)
@see Parser#parseLegalTag(String, String)
*/
public static Document parseLegalTag(String html) {
return Parser.parseLegalTag(html, "");
}

/**
* Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
* <p>
Expand Down
47 changes: 47 additions & 0 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.lang.reflect.Field;
import java.util.ArrayList;

import static org.jsoup.internal.StringUtil.inSorted;
Expand Down Expand Up @@ -253,6 +254,20 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
},
InBody {
boolean process(Token t, HtmlTreeBuilder tb) {
if (tb.legalize) {
if (t.type == Token.TokenType.StartTag && !islegal(t.asStartTag().normalName)) {
Token.StartTag st = t.asStartTag();
Token.Character newT = new Token.Character();
newT.data("<" + st.tagName + ">");
t = newT;
} else if (t.type == Token.TokenType.EndTag && !islegal(t.asEndTag().normalName)) {
Token.EndTag et = t.asEndTag();
Token.Character newT = new Token.Character();
newT.data("</" + et.tagName + ">");
t = newT;
}
}

switch (t.type) {
case Character: {
Token.Character c = t.asCharacter();
Expand Down Expand Up @@ -868,6 +883,38 @@ else if (!tb.onStack(formatEl)) {
}
return true;
}

/**
 * Tests whether a (normalised) tag name appears in any of the tag lists known to the parser: the
 * {@code String[]} lists declared in {@code Constants}, and a fixed set of tag lists declared in {@code Tag}.
 * <p>
 * The lists are read reflectively. NOTE(review): this assumes the tag lists are {@code static} fields;
 * non-static fields and fields that are not {@code String[]} are ignored -- confirm against Constants and Tag.
 *
 * @param tagName the tag name to look up
 * @return {@code true} if the name is found in one of the lists, {@code false} otherwise
 * @throws IllegalStateException if one of the expected tag lists cannot be read reflectively
 */
private boolean islegal(String tagName) {
    try {
        // Tag lists used by the tree builder states.
        for (Field field : Constants.class.getDeclaredFields()) {
            if (inSorted(tagName, readStaticStringArray(field)))
                return true;
        }

        // Tag lists declared in Tag. They are looked up by name, so a rename there surfaces here as an error.
        String[] tagListNames = {"blockTags", "inlineTags", "emptyTags", "formatAsInlineTags",
            "preserveWhitespaceTags", "formListedTags", "formSubmitTags"};
        for (String listName : tagListNames) {
            // Each lookup returns a fresh Field object, so there is no accessibility flag to restore afterwards.
            Field field = Tag.class.getDeclaredField(listName);
            field.setAccessible(true);
            // Nothing here guarantees these arrays are sorted, so scan them rather than binary-searching.
            if (java.util.Arrays.asList(readStaticStringArray(field)).contains(tagName))
                return true;
        }
        return false;
    } catch (IllegalAccessException e) {
        // Answering "false" here would silently turn every tag into text; fail loudly instead.
        throw new IllegalStateException("Unable to read the parser's tag lists for tag <" + tagName + ">", e);
    } catch (NoSuchFieldException e) {
        throw new IllegalStateException("Expected tag list is missing from Tag: " + e.getMessage(), e);
    }
}

/**
 * Reads the value of a static {@code String[]} field.
 *
 * @param field the field to read
 * @return the field's value, or an empty array if the field is not static or is not a {@code String[]}
 *         (for example a synthetic field added by instrumentation)
 * @throws IllegalAccessException if the field is not accessible
 */
private String[] readStaticStringArray(Field field) throws IllegalAccessException {
    boolean isStatic = java.lang.reflect.Modifier.isStatic(field.getModifiers());
    // Check the declared type before touching the value, so unrelated fields are never accessed or cast.
    if (!isStatic || field.getType() != String[].class)
        return new String[0];
    String[] value = (String[]) field.get(null); // static field: no instance is needed
    return value != null ? value : new String[0];
}
},
Text {
// in script, style etc. normally treated as data tags
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ public static Document parse(String html, String baseUri) {
return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
}

/**
 * Parse HTML into a Document, encoding illegal tags as text instead of keeping them as elements.
 * <p>
 * A new {@link HtmlTreeBuilder} is created for the parse and run through its {@code parseLegalTag} entry point.
 *
 * @param html HTML to parse
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return parsed Document
 */
public static Document parseLegalTag(String html, String baseUri) {
TreeBuilder treeBuilder = new HtmlTreeBuilder();
return treeBuilder.parseLegalTag(new StringReader(html), baseUri, new Parser(treeBuilder));
}

/**
* Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
*
Expand Down
15 changes: 15 additions & 0 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ abstract class TreeBuilder {
protected String baseUri; // current base uri, for creating new elements
protected Token currentToken; // currentToken is used only for error tracking.
protected ParseSettings settings;
protected boolean legalize; // if encoding illegal tags

private Token.StartTag start = new Token.StartTag(); // start tag to process
private Token.EndTag end = new Token.EndTag();
Expand All @@ -40,6 +41,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
tokeniser = new Tokeniser(reader, parser.getErrors());
stack = new ArrayList<>(32);
this.baseUri = baseUri;
legalize = false;
}

Document parse(Reader input, String baseUri, Parser parser) {
Expand All @@ -55,6 +57,19 @@ Document parse(Reader input, String baseUri, Parser parser) {
return doc;
}

/**
 * Runs a parse with the {@code legalize} flag switched on, then releases the per-parse state.
 *
 * @param input the reader supplying the HTML
 * @param baseUri base URI of the document, used when creating new elements
 * @param parser the parser driving this builder; supplies the error list for the tokeniser
 * @return the document built by the parse
 */
Document parseLegalTag(Reader input, String baseUri, Parser parser) {
    initialiseParse(input, baseUri, parser);
    // initialiseParse resets the flag to false, so it has to be switched on after that call.
    this.legalize = true;

    try {
        runParser();
        return doc;
    } finally {
        // Release the input and per-parse state even if the parse fails part-way through.
        reader.close();
        reader = null;
        tokeniser = null;
        stack = null;
    }
}

abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser);

protected void runParser() {
Expand Down
7 changes: 7 additions & 0 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1362,4 +1362,11 @@ public void testUNewlines() {
doc.outputSettings().prettyPrint(false);
assertEquals("<html><head></head><body>One <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
}

// Unknown tags (<aaa>, <hello>) must come out as escaped text, while known tags (<p>, <div>) stay elements.
@Test public void testParseLegalTag() {
    Document parsed = Jsoup.parseLegalTag("<p>Try <aaa>jsoup</aaa></p><div><hello></div>");
    parsed.outputSettings().prettyPrint(false);

    String expected = "<html><head></head><body><p>Try &lt;aaa&gt;jsoup&lt;/aaa&gt;</p><div>&lt;hello&gt;</div></body></html>";
    assertEquals(expected, parsed.outerHtml());
}
}