-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgenerate_ground_truth.py
More file actions
102 lines (89 loc) · 3.63 KB
/
Copy pathgenerate_ground_truth.py
File metadata and controls
102 lines (89 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""Example on how to create new ground truth vectors
"""
from src.crawler_bot.custom_logging import Logger, LogLevel
from src.crawler_bot.classification import Classifier
from src.crawler_bot.tools import load_dataset
import timeit
import json
# ------------------------------ configuration ------------------------------
# Dataset file handed to load_dataset() further down.
dataset_filename = "assets/20221211_033449_dataset.json"

# Forwarded unchanged to the classifier calls further down.
ignore_categories = False

# Sentence limit passed to generate_ground_truth_vectors().
max_amount_of_sentences = 50

# If True, max_amount_of_sentences is overwritten with the value returned by
# classifier.calculate_ideal_amount_of_sentences().
use_adaptive_amount_of_sentences = False

# If True, the most important sentences are requested from the classifier and
# written to their own output file.
get_most_important_sentences = False

# Use the average or the maximum for the distance to the ground truth vectors
# (presumably True = average, False = maximum; confirm in Classifier).
allowed_distance_average = True
# ---------------------------------------------------------------------------
# Take the start time before any work is done so the reported runtime covers
# setup, loading and vector generation.
start = timeit.default_timer()

# The logger is created first because the classifier receives it as its
# second argument.
logger = Logger(LogLevel.DEBUG, "generate_ground_truth")
classifier = Classifier(1, logger)

# Load the dataset from file. (It can also be generated directly from a url
# list by combining load_url_list and download_url_list.)
data = load_dataset(dataset_filename)
dataset = data["dataset"]

# Optional step: let the classifier compute the sentence limit. This replaces
# the configured max_amount_of_sentences and also yields sentence_gradients,
# which only exists when this branch runs.
if use_adaptive_amount_of_sentences:
    print("Calculating ideal max amount of sentences")
    ideal = classifier.calculate_ideal_amount_of_sentences(
        dataset, ignore_categories)
    max_amount_of_sentences, sentence_gradients = ideal

# Record the settings used for this run; they are stored next to every result
# file. The last three flags are stored as strings, not as booleans.
parameters = {
    "dataset": data["parameters"],
    "ignore_categories": ignore_categories,
    "max_amount_of_sentences": max_amount_of_sentences,
    "dataset_filename": dataset_filename,
    "use_adaptive_amount_of_sentences": str(use_adaptive_amount_of_sentences),
    "get_most_important_sentences": str(get_most_important_sentences),
    "allowed_distance_average": str(allowed_distance_average),
}

# Generate the ground truth vectors.
print("Generating ground truth vectors")
result = classifier.generate_ground_truth_vectors(
    dataset,
    ignore_categories,
    max_amount_of_sentences,
    allowed_distance_average,
    get_most_important_sentences,
)

# Report the elapsed time, rounded to whole seconds, on stdout and in the log.
stop = timeit.default_timer()
runtime = round(stop - start)
runtime_message = "Runtime: " + str(runtime) + "s"
print(runtime_message)
logger.log_info("MAIN", runtime_message)
def _save_with_parameters(name, payload):
    """Write ``payload`` to ``assets/<logger.file_prefix>_<name>.json``.

    The file contains a JSON object with two keys: ``"parameters"`` (the
    module-level ``parameters`` dict) and ``name`` (the given payload). The
    file-name suffix and the JSON key are derived from the same string, so
    they cannot drift apart.

    Args:
        name: Used both as the file-name suffix and as the JSON key.
        payload: JSON-serializable data stored under ``name``.
    """
    path = "assets/" + logger.file_prefix + "_" + name + ".json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump({"parameters": parameters, name: payload}, f)


# Always written: the ground truth vectors and their gradients.
_save_with_parameters("ground_truth_vectors", result["ground_truth_vectors"])
_save_with_parameters("ground_truth_gradients",
                      result["ground_truth_gradients"])

# sentence_gradients is only defined when the adaptive sentence count was
# computed, so it is saved under the same flag.
if use_adaptive_amount_of_sentences:
    _save_with_parameters("sentence_gradients", sentence_gradients)

# The most important sentences are only saved when they were requested from
# the classifier.
if get_most_important_sentences:
    _save_with_parameters("most_important_sentences",
                          result["most_important_sentences"])