-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHTMLParser.py
More file actions
184 lines (169 loc) · 6.55 KB
/
HTMLParser.py
File metadata and controls
184 lines (169 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
from Helper.tokens import Text, Element
class HTMLParser:
    """Build a tree of ``Element`` / ``Text`` nodes from an HTML string.

    The parser walks the document one character at a time, keeps a stack of
    still-open elements in ``self.unfinished`` and inserts omitted
    ``<html>``, ``<head>``, ``<body>`` and ``</head>`` tags where needed.
    """

    # Elements that never have a closing tag; they are attached to their
    # parent immediately instead of being pushed on the open-element stack.
    SELF_CLOSING_TAGS = [
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ]
    # Elements that belong inside <head>; used to decide whether an implicit
    # <head> or an implicit <body> has to be opened.
    HEAD_TAGS = [
        "base", "basefont", "bgsound", "noscript",
        "link", "meta", "title", "style", "script",
    ]
    # Character references decoded while text is accumulated.
    _ENTITIES = {
        "&lt;": "<",
        "&gt;": ">",
        "&shy;": "\N{SOFT HYPHEN}",
        "&amp;": "&",
        "&quot;": "\"",
    }

    def __init__(self, body):
        """Store the HTML source ``body`` and start with no open elements."""
        self.body = body
        self.unfinished = []

    @staticmethod
    def _decode_trailing_entity(text):
        """Return ``text`` with a known entity at its very end decoded.

        Only the entities listed in ``_ENTITIES`` are recognised; any other
        text is returned unchanged.
        """
        # Every supported entity ends in ";", so anything else is a fast exit.
        if not text.endswith(";"):
            return text
        for entity, char in HTMLParser._ENTITIES.items():
            if text.endswith(entity):
                return text[:-len(entity)] + char
        return text

    def parse(self):
        """Parse ``self.body`` and return the root node of the finished tree.

        Comments (``<!-- ... -->``) are dropped, the content of a
        ``<script>`` element is kept verbatim as a single text node, and a
        ``>`` inside a double-quoted attribute value does not end the tag.
        """
        # Sliding window over the most recent characters; used to spot the
        # multi-character markers "<!--", "-->" and "</script>".
        last_ten_chars = " "
        text = ""
        in_tag = False
        in_quotes = False
        in_script = False
        read_text = True        # False while inside a comment
        last_known_tag = False  # value of in_tag before the latest "<" / ">"

        for c in self.body:
            last_ten_chars = last_ten_chars[-9:] + c

            if read_text and last_ten_chars[-4:] == "<!--":
                # A comment starts: undo the tag state set by its "<" and
                # drop the marker characters that were already buffered
                # ("!-" normally, "<!-" inside a script where "<" is text).
                read_text = False
                in_tag = last_known_tag
                last_ten_chars = " "
                text = text[:-3] if text.endswith("<!-") else text[:-2]
                continue
            if not read_text:
                # Inside a comment: ignore everything up to the closing "-->".
                if last_ten_chars[-3:] == "-->":
                    read_text = True
                continue

            if in_script:
                # Script content is opaque text until the closing tag.
                if last_ten_chars[-9:].lower() == "</script>":
                    # "</script" (8 chars) is already buffered; c is the ">".
                    text = text[:-8]
                    if text:
                        self.add_text(text)
                        text = ""
                    last_known_tag = False
                    in_tag = False
                    in_script = False
                    self.add_tag("/script")
                else:
                    text += c
                continue

            if c == "<":
                last_known_tag = in_tag
                in_tag = True
                in_quotes = False  # a new tag always starts unquoted
                if text:
                    self.add_text(text)
                    text = ""
            elif c == ">" and in_tag and not in_quotes:
                last_known_tag = in_tag
                in_tag = False
                # "<>" carries no tag name; skip it instead of crashing.
                if text.strip():
                    self.add_tag(text)
                    in_script = check_in_script(text)
                text = ""
            else:
                # Quotes only matter inside a tag; a quote in running text
                # must not stop later ">" characters from closing tags.
                if c == "\"" and in_tag:
                    in_quotes = not in_quotes
                text += c
                # NOTE: decoding is incremental, so "&amp;lt;" ends up as "<".
                text = self._decode_trailing_entity(text)

        if not in_tag and text:
            self.add_text(text)
        return self.finish()

    def add_text(self, text):
        """Append ``text`` as a ``Text`` child of the innermost open element.

        Whitespace-only text is ignored.
        """
        if text.isspace():
            return
        self.implicit_tags()
        parent = self.unfinished[-1]
        node = Text(text, parent)
        parent.children.append(node)

    def add_tag(self, tag):
        """Process the content of one tag (the text between ``<`` and ``>``).

        A closing tag finishes the innermost open element, a self-closing
        tag is attached to its parent right away, and any other tag opens a
        new element. Doctype declarations and blank tags are ignored.
        """
        if not tag.strip():
            return
        tag, attributes = get_attributes(tag)
        # ignore doctype declarations and comments
        if tag.startswith("!"):
            return
        self.implicit_tags(tag)

        if tag.startswith("/"):
            # The root element stays open until finish().
            if len(self.unfinished) == 1:
                return
            # Closing: move the innermost open element into its parent.
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        elif tag in self.SELF_CLOSING_TAGS:
            parent = self.unfinished[-1]
            node = Element(tag, attributes, parent)
            parent.children.append(node)
        else:
            # Opening: the very first element has no parent.
            parent = self.unfinished[-1] if self.unfinished else None
            node = Element(tag, attributes, parent)
            self.unfinished.append(node)

    def finish(self):
        """Close every element that is still open and return the root node."""
        if not self.unfinished:
            self.add_tag("html")
        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        return self.unfinished.pop()

    def print_tree(self, node, indent=0):
        """Print ``node`` and its descendants, indenting each level further."""
        print(" " * indent, node)
        for child in node.children:
            self.print_tree(child, indent + 2)

    def tree_str(self, node, indent=0):
        """Return the subtree below ``node`` as an indented multi-line string."""
        lines = [" " * indent + str(node)]
        lines.extend(self.tree_str(child, indent + 2) for child in node.children)
        return "\n".join(lines)

    def implicit_tags(self, tag=None):
        """Open or close the tags that the document omitted before ``tag``.

        ``tag`` is the tag about to be added, or ``None`` when text is about
        to be added. Several tags can be missing at once, hence the loop.
        """
        while True:
            open_tags = [node.tag for node in self.unfinished]
            if open_tags == [] and tag != "html":
                # The document does not start with <html>.
                self.add_tag("html")
            elif open_tags == ["html"] and tag not in ["head", "body", "/html"]:
                # <head> and <body> may both be omitted.
                if tag in self.HEAD_TAGS:
                    self.add_tag("head")
                else:
                    self.add_tag("body")
            elif open_tags == ["html", "head"] and tag not in ["/head"] + self.HEAD_TAGS:
                # </head> may be omitted as well.
                self.add_tag("/head")
            else:
                break
def get_attributes(text):
    """Split the content of a tag into its name and its attributes.

    ``text`` is everything between ``<`` and ``>``. Returns a tuple
    ``(tag, attributes)`` where ``tag`` is the lower-cased tag name and
    ``attributes`` maps lower-cased attribute names to their values.
    Attributes without a value map to ``""``. Blank ``text`` yields
    ``("", {})``.
    """
    # Tokenise on whitespace, but keep a quoted attribute value (a quote
    # directly after "=") together even when it contains spaces.
    tokens = []
    current = []
    quote = None
    for ch in text:
        if quote is not None:
            current.append(ch)
            if ch == quote:
                quote = None
        elif ch in ("'", "\"") and current and current[-1] == "=":
            quote = ch
            current.append(ch)
        elif ch.isspace():
            if current:
                tokens.append("".join(current))
                current = []
        else:
            current.append(ch)
    if current:
        tokens.append("".join(current))

    if not tokens:
        return "", {}

    tag = tokens[0].lower()
    attributes = {}
    for attrpair in tokens[1:]:
        if "=" in attrpair:
            key, value = attrpair.split("=", 1)
            # Strip the quotes only when the value is properly enclosed, so
            # an unterminated quote does not eat a real character.
            if len(value) >= 2 and value[0] in ("'", "\"") and value[-1] == value[0]:
                value = value[1:-1]
            attributes[key.lower()] = value
        else:
            # attribute without a value
            attributes[attrpair.lower()] = ""
    return tag, attributes
def check_in_script(text):
    """Return True when ``text`` is the content of an opening ``script`` tag.

    ``text`` is everything between ``<`` and ``>``; the tag name is compared
    case-insensitively. Blank ``text`` returns False.
    """
    parts = text.split()
    # An empty tag has no name to compare.
    return bool(parts) and parts[0].lower() == "script"