-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtable_parser.py
More file actions
53 lines (44 loc) · 1.47 KB
/
table_parser.py
File metadata and controls
53 lines (44 loc) · 1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from html.parser import HTMLParser
class HTMLTableParser(HTMLParser):
    """Collect the cell contents of every ``<table>`` fed to the parser.

    After ``feed()`` has processed the markup, ``tables`` holds one entry
    per closed ``</table>`` tag.  Each table is a list of rows, each row is
    a list of cells, and each cell is a list of fragments (stripped text
    runs and link targets) stored in reverse order of appearance, so the
    text of a link comes before the link target that was recorded when the
    ``<a>`` tag opened.

    Cells (``<td>`` / ``<th>``) that carry a ``rowspan`` attribute are
    skipped entirely and do not appear in their row.
    """

    def __init__(
        self
    ):
        super().__init__()
        # True while the parser is between <td> and </td>.
        self._in_td = False
        # True while the parser is between <th> and </th>.
        self._in_th = False
        # Rows collected for the table currently being parsed.
        self._current_table = []
        # Cells collected for the row currently being parsed.
        self._current_row = []
        # Fragments collected for the cell currently being parsed.
        self._current_cell = []
        # Finished tables, appended on every </table>.
        self.tables = []
        # True while inside a cell that must be dropped (it has a rowspan).
        self.ignore = False
        # Set to True once a <td> has been seen.  Nothing in this class reads
        # it; it is initialised here so reading it before the first <td> does
        # not raise AttributeError.
        self.extracting = False

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Track cell boundaries, rowspan cells and link targets.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; it is empty when the tag has no attributes.
        """
        if tag == 'td':
            self._in_td = True
            self.extracting = True
        elif tag == 'th':
            self._in_th = True

        # Look for rowspan among all attributes of a cell tag, not only the
        # first one, so the result does not depend on attribute order.
        if tag in ('td', 'th') and any(name == 'rowspan' for name, _ in attrs):
            self.ignore = True

        if tag == 'a' and self._in_td and attrs and not self.ignore:
            attr_map = dict(attrs)
            # Prefer the real link target.  Fall back to the first attribute's
            # value when there is no href, which is what earlier versions
            # recorded.  An anchor without attributes contributes nothing.
            if 'href' in attr_map:
                self._current_cell.append(attr_map['href'])
            else:
                self._current_cell.append(attrs[0][1])

    def handle_data(self, data: str) -> None:
        """Store stripped text found inside a cell that is not ignored."""
        if (self._in_td or self._in_th) and not self.ignore:
            self._current_cell.append(data.strip())

    def handle_endtag(self, tag: str) -> None:
        """Close the current cell, row or table when its end tag arrives."""
        if tag == 'td':
            self._in_td = False
        elif tag == 'th':
            self._in_th = False

        if tag in ('td', 'th'):
            if not self.ignore:
                # Reversed so that link text precedes the link target that
                # was appended when the <a> tag opened.
                self._current_row.append(self._current_cell[::-1])
            # Always reset per-cell state, including after an ignored cell,
            # so the next cell starts clean.
            self._current_cell = []
            self.ignore = False
        elif tag == 'tr':
            self._current_table.append(self._current_row)
            self._current_row = []
        elif tag == 'table':
            self.tables.append(self._current_table)
            self._current_table = []