-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmutcd-getter.py
More file actions
executable file
·156 lines (124 loc) · 5.78 KB
/
mutcd-getter.py
File metadata and controls
executable file
·156 lines (124 loc) · 5.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
"""Get MUTCD SVGs from Wikipedia."""
import urllib
import argparse
import os
import json
from lxml import html
import wikipedia
from pyquery import PyQuery
class MUTCDGetterError(Exception):
    """Error raised by this script when the page lacks what was requested.

    For example, it is raised when the requested column header cannot be
    found in a table's first row.
    """
# Setup arguments.
def _column_identifier(value):
    """Return ``value`` as an int when it is purely numeric, else unchanged.

    The table-processing code only searches the header row for the column
    when the identifier is a string; an integer is used directly as the index
    into each row's ``td`` cells. argparse hands every value over as a string,
    so without this conversion a numeric identifier such as ``5`` would be
    treated as a header prefix to search for instead of an index.
    """
    return int(value) if value.isdigit() else value


parser = argparse.ArgumentParser()
parser.add_argument('--title', default='Comparison of MUTCD-Influenced Traffic Signs',
                    help='The title of the Wikipedia entry containing the tables of MUTCD-inspired signs.')
parser.add_argument('--tables', default='Warning,Regulatory,Mandatory or permitted actions,Other (indication)',
                    help='The headers of the tables whose signs you wish to retrieve.')
# argparse also runs ``type`` over string defaults, so 'USA' stays a string.
parser.add_argument('--column', default='USA', type=_column_identifier,
                    help='The identifier representing the column you wish to pull from each table (e.g. 5, or "USA").')
parser.add_argument('--output_folder', default=os.getcwd(),
                    help='The path to the folder into which the signage and common names file should be downloaded.')
parser.add_argument('--common_names_filename', default='mutcd_common_names.json',
                    help='The filename for the JSON-formatted file mapping common sign names to SVG file names.')
# Parsed once at start-up; the rest of the script reads its options from here.
parsed_args = parser.parse_args()
# Accumulators filled in while the tables are processed: the nested sign
# structure, and a flat map keyed by downloaded file name.
signs = []
raw_filename_map = {}

# Retrieve the article named on the command line.
print('Fetching the wikipedia page...')
wiki_page = wikipedia.page(parsed_args.title)

# Turn the article's HTML into an lxml tree, then wrap that tree in pyquery.
article_tree = html.fromstring(wiki_page.html())
d = PyQuery(article_tree)

# Every header in the document; narrowed down per table further on.
all_headers = d(':header')

print('Finding the SVGs. Hang tight. This may take a couple of minutes.\n')
# Python 3 moved urlretrieve into urllib.request; fall back to the Python 2
# location so the downloads work on either interpreter.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# For each table header, get the associated table.
# NOTE: the pyquery callbacks below are zero-argument lambdas that rely on
# pyquery injecting the current element as the global name ``this``.
table_headers = parsed_args.tables.split(',')
for table_header in table_headers:
    table_signs = {
        'type': 'category',
        'text': table_header,
        'data': []
    }

    # The table is the first <table> sibling following the matching header.
    matching_tables = all_headers.filter(
        lambda: this.text_content() == table_header + '[edit]'
    ).next_all('table')
    if not matching_tables:
        # Fail with a message naming the header rather than a bare IndexError.
        raise MUTCDGetterError(
            'No table could be found under the header "{}".'.format(table_header)
        )
    table = PyQuery(matching_tables[0])
    first_row = table('tr:nth-child(1)')

    column = parsed_args.column
    if isinstance(column, str):
        # If the provided column is a string, find the column index. While this is probably the same for each table,
        # we'll still check every time. It's cheap and this is an infrequent script.
        for i, header_cell in enumerate(first_row.find('th')):
            if header_cell.text_content().strip().startswith(column):
                column = i
                break
        else:
            # We couldn't find it.
            raise MUTCDGetterError('The specified column header could not be found.')

    # Walk all rows following the first row.
    for row in first_row.next_all('tr'):
        row = PyQuery(row)
        cells = row.find('td')
        if not cells:
            # No data cells at all, so there is nothing to read from this row.
            continue

        # Due to a current pyquery bug, nth-child does not work right, so cells are picked by index.
        row_header = PyQuery(cells[0]).text()
        if not row_header:
            # This is an intermediary header row to remind readers which column is which.
            continue
        if column >= len(cells):
            # The row is too short to contain the requested column.
            continue

        target_cell_imgs = PyQuery(cells[column]).find('img[alt$=svg]')
        if not target_cell_imgs:
            continue

        # The image alt text, with spaces turned into underscores, is used as the SVG file name.
        row_sign_filenames = [
            {
                'type': 'sign filename',
                'text': PyQuery(img).attr('alt').replace(' ', '_'),
            }
            for img in target_cell_imgs
        ]

        # Only files that were actually downloaded are recorded for this row.
        downloaded_filenames = []
        for filename in row_sign_filenames:
            # Download the SVG(s).
            try:
                image_page = PyQuery('http://en.wikipedia.org/wiki/File:{}'.format(filename['text']))
                svg_link = image_page.find('a.internal').filter(
                    lambda: this.text_content().strip() == 'Original file'
                )
                svg_url = svg_link.attr('href')
                if not svg_url:
                    print(
                        'Could not find URL for {table_header}: {row_header}: {filename}\n'.format(
                            table_header=table_header,
                            row_header=row_header,
                            filename=filename['text']
                        )
                    )
                    # Nothing to download; skip this file instead of failing on a missing URL.
                    continue
                if svg_url.startswith('//'):
                    # De-relativize to an absolute URL.
                    svg_url = 'http:' + svg_url
                urlretrieve(svg_url, os.path.join(parsed_args.output_folder, filename['text']))
                raw_filename_map[filename['text']] = {
                    'category': table_header,
                    'commonName': row_header,
                }
            except Exception as error:
                # Best effort: report the failure and move on to the next file.
                # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still propagates.
                print(
                    'An unknown error occurred while attempting to retrieve '
                    '{table_header}: {row_header}: {filename} ({error!r})\n'.format(
                        table_header=table_header,
                        row_header=row_header,
                        filename=filename['text'],
                        error=error
                    )
                )
                continue
            downloaded_filenames.append(filename)

        if downloaded_filenames:
            table_signs['data'].append({
                'type': 'common name',
                'text': row_header,
                'data': downloaded_filenames,
            })

    if table_signs['data']:
        signs.append(table_signs)

# Write the sign hierarchy and the per-file lookup map as one JSON document.
with open(os.path.join(parsed_args.output_folder, parsed_args.common_names_filename), 'w') as json_file:
    json.dump({'signs': signs, 'rawFilenameMap': raw_filename_map}, json_file, indent=4)
print('\nAll files written. Enjoy!')