-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmain.py
More file actions
121 lines (98 loc) · 3.84 KB
/
Copy pathmain.py
File metadata and controls
121 lines (98 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""The main module
"""
from threading import Thread
from time import strftime, gmtime
import timeit
from src.crawler_bot import config, custom_logging, extractor, monitoring, retriever, storage
# Load the seed URLs, one per line of config.SEED_FILE.
# Each line is stripped of its line break (and surrounding whitespace) BEFORE
# the emptiness check: lines read from a file still carry their "\n", so
# comparing the raw line against "" never filtered anything and blank lines
# ended up in the seed as empty strings.
with open(config.SEED_FILE, encoding="utf-8") as f:
    seed = [line.strip() for line in f]
seed = [url for url in seed if url]
# Set up the logger that is shared by every component created below.
# It is constructed with LogLevel.DEBUG and the string "logfile"
# (presumably the log file name/prefix -- confirm in custom_logging.Logger).
logger = custom_logging.Logger(custom_logging.LogLevel.DEBUG, "logfile")
logger.log_info("MAIN", "START")
# Set up the storage objects. Each one receives the shared logger; they are
# later handed to the retriever and extractor workers and written out at the
# end of the run.
html_database = storage.HTMLDatabase(logger)
# Additionally receives config.CRAWLING_LIMIT.
crawled_urls = storage.CrawledURLs(logger, config.CRAWLING_LIMIT)
domain_timers = storage.DomainTimers(logger)
robots_txt_database = storage.RobotsTXTDatabase(logger)
# The URL queue is initialised with the seed URLs loaded above.
url_queue = storage.URLQueue(logger, seed)
unprocessed_html_database = storage.UnprocessedHTMLDatabase(logger)
url_map = storage.URLMap(logger)
# Set up the global monitor; it is passed to every retriever and extractor.
monitor = monitoring.GlobalMonitor(logger)
# Create one retriever per configured retriever thread. Each gets its index
# as first argument, followed by the shared logger, storages and monitor.
retrievers = [
    retriever.Retriever(retriever_id, logger, url_queue, crawled_urls,
                        unprocessed_html_database, domain_timers,
                        robots_txt_database, monitor)
    for retriever_id in range(config.NUM_RETRIEVER_THREADS)
]
# Create one extractor per configured extractor thread in the same way.
extractors = [
    extractor.Extractor(extractor_id, logger, html_database,
                        unprocessed_html_database, url_queue,
                        crawled_urls, url_map, monitor)
    for extractor_id in range(config.NUM_EXTRACTOR_THREADS)
]
# Take the start time before any worker is launched.
start = timeit.default_timer()
# Run every retriever and every extractor in its own thread: retrievers are
# started first, then extractors, each in list order.
entry_points = [worker.start_retriever for worker in retrievers]
entry_points += [worker.start_extractor for worker in extractors]
threads = []
for entry_point in entry_points:
    worker_thread = Thread(target=entry_point)
    threads.append(worker_thread)
    worker_thread.start()
try:
    # Wait for all worker threads to end.
    for t in threads:
        t.join()
finally:
    # This part also runs when the wait above is interrupted by an exception,
    # so the state collected so far is still reported and written out.

    # Sort the HTML database by relevance and fetch the relevant URLs.
    html_database.sort_after_relevance()
    relevant_urls = html_database.get_list_of_relevant_urls()

    # Measure the runtime in whole seconds.
    stop = timeit.default_timer()
    runtime = round(stop - start)
    logger.log_info("MAIN", "DONE")
    logger.log_info("MAIN", "Runtime: " + str(runtime) + "s")
    print("Runtime: " + str(runtime) + "s")

    # Print the sizes of the collected data.
    print("len crawled urls: ", str(len(crawled_urls.crawled_urls)))
    print("len unprocessed html database: ",
          str(len(unprocessed_html_database.database)))
    print("len html database: ", str(len(html_database.database)))
    print("len relevant urls: ", str(len(relevant_urls)))

    # Save the results.
    # NOTE(review): `filename` is not used below -- the output names are built
    # from logger.file_prefix. Kept as-is; confirm and remove it together with
    # the `time` import if it is really dead.
    filename = strftime("%Y%m%d_%H%M%S", gmtime())

    # Every store is dumped via its to_json() into its own file under
    # "assets/". The files are written in exactly this order.
    json_outputs = (
        ("_html_database.json", html_database),
        ("_unprocessed_html_database.json", unprocessed_html_database),
        ("_crawled_urls.json", crawled_urls),
        ("_url_map.json", url_map),
        ("_robotstxt.json", robots_txt_database),
    )
    for suffix, store in json_outputs:
        # Mode "x" creates the file and fails if it already exists, so an
        # earlier result file is never overwritten.
        with open("assets/" + logger.file_prefix + suffix,
                  "x",
                  encoding="utf-8") as a:
            a.write(store.to_json())

    # The relevant URLs go into a one-column CSV, one URL per line.
    # writelines() expects an iterable of lines; passing a single string (as
    # before) made it iterate over the individual characters.
    with open("assets/" + logger.file_prefix + "_relevant_urls.csv",
              "x",
              encoding="utf-8") as a:
        a.writelines(url + "\n" for url in relevant_urls)
# create url map, only works with very few fetched urls!!
#url_map.draw_map("logs/" + logger.file_prefix + "_url_map")