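# common.py -- shared helpers for the AutoML benchmark scripts: reproducible
# seeding, dataset loading (from OpenML by id or from local CSVs by name), and
# collection/persistence of evaluation metrics.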
import json
import multiprocessing
import os
import sys
import time

import pandas as pd
from pytictoc import TicToc
from sklearn.datasets import fetch_openml
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import LabelPowerset

# pd.options.mode.chained_assignment = None

FORCE_DATASET_REF = None
EXEC_TIME_MINUTES = 5
EXEC_TIME_SECONDS = EXEC_TIME_MINUTES * 60
MAX_MEMORY_MB = 32 * 1024
NUM_CPUS = multiprocessing.cpu_count()
PRIME_NUMBERS = [
    2, 3, 5, 7, 11, 13, 17, 19, 23, 29,
    31, 37, 41, 43, 47, 53, 59, 61, 67, 71
]
LABEL_POWERSET = False  # use True when needed
TIMER = TicToc()

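# Best-effort reproducibility: each library is imported lazily and skipped with a
# message if it is not installed. Note that sklearn's check_random_state(seed)
# only constructs a RandomState here; estimators without an explicit random_state
# fall back to the global NumPy seed set just above it.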
def set_random_seed(seed):
    try:
        import random
        random.seed(seed)
    except ImportError as e:
        print(f"Error importing a necessary module; skipping seed: {e}")
    try:
        import numpy
        numpy.random.seed(seed)
    except ImportError as e:
        print(f"Error importing a necessary module; skipping seed: {e}")
    try:
        from sklearn.utils import check_random_state
        check_random_state(seed)
    except ImportError as e:
        print(f"Error importing a necessary module; skipping seed: {e}")
    try:
        import tensorflow as tf
        tf.random.set_seed(seed)
    except ImportError as e:
        print(f"Error importing a necessary module; skipping seed: {e}")
    try:
        import torch
        torch.manual_seed(seed)
    except ImportError as e:
        print(f"Error importing a necessary module; skipping seed: {e}")

def get_dataset_ref():
    if FORCE_DATASET_REF:
        return FORCE_DATASET_REF
    elif len(sys.argv) < 2:
        print('usage: python common.py dataset_ref')
        return None
    else:
        try:
            dataset_ref = int(sys.argv[1])
        except ValueError:
            dataset_ref = str(sys.argv[1])
        return dataset_ref

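# Binary vs. multiclass is inferred from the number of distinct labels in the
# test split; a single class signals a malformed dataset.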
def infer_task_type(y_test):
    num_classes = len(set(y_test))
    if num_classes == 1:
        raise Exception('Malformed data set; num_classes == 1')
    elif num_classes == 2:
        task_type = 'binary'
    else:
        task_type = 'multiclass'
    return task_type

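# OpenML ids of the multi-label datasets used here (285 is the "flags" dataset,
# which gets special handling in load_openml below).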
def is_multi_label():
    return get_dataset_ref() in [285, 41464, 41465, 41468, 41470, 41471, 41473]

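# Dispatch on the dataset reference type: OpenML id (int) or local CSV folder (str).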
def load_data_delegate(seed):
    if isinstance(get_dataset_ref(), int):
        return load_openml(seed)
    elif isinstance(get_dataset_ref(), str):
        return load_csv()
    else:
        raise Exception('dataset_ref must be int (OpenML) or str (local CSV)')

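# Local datasets are expected under datasets/<dataset_ref>/ as four pre-split
# files: X_train.csv, X_test.csv, y_train.csv and y_test.csv. Single-column
# targets are flattened to 1-D arrays.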
def load_csv():
    base_folder = os.path.join(os.path.dirname(__file__), 'datasets', get_dataset_ref())
    filenames = ['X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv']
    dfs = []
    for filename in filenames:
        full_filename = os.path.join(base_folder, filename)
        csv = pd.read_csv(filepath_or_buffer=full_filename).infer_objects().to_numpy()
        dfs.append(csv.ravel() if csv.shape[1] == 1 else csv)
    return dfs[0], dfs[1], dfs[2], dfs[3]

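# Fetch an OpenML dataset and return an 80/20 train/test split. The flags dataset
# (id 285) needs special handling because its twelve label columns arrive mixed in
# with the features. Categorical features are integer-encoded; multi-label targets
# are mapped to 0/1, or collapsed into a single class column via LabelPowerset
# when LABEL_POWERSET is enabled.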
def load_openml(seed):
    dataset = fetch_openml(data_id=get_dataset_ref(), return_X_y=False, parser='auto')
    X, y = dataset.data.copy(deep=True), dataset.target.copy(deep=True)
    # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9321731
    # https://github.com/mrapp-ke/Boomer-Datasets/raw/refs/heads/main/flags.arff
    if get_dataset_ref() == 285:  # flags
        df = pd.concat([X, y], axis='columns')
        label_columns = [
            'crescent', 'triangle', 'icon', 'animate', 'text', 'red',
            'green', 'blue', 'gold', 'white', 'black', 'orange'
        ]
        y = df[label_columns].astype(int)  # Select only label columns
        for col in y.columns.values:
            y[col] = y[col].map({0: 'FALSE', 1: 'TRUE'})
        X = df.drop(columns=label_columns).infer_objects()  # Drop label columns to get remaining ones
        for col in X.columns:
            if col not in ['mainhue', 'topleft', 'botright']:
                X[col] = X[col].astype(float)
        assert df.shape[0] == X.shape[0]  # rows
        assert df.shape[0] == y.shape[0]  # rows
        assert df.shape[1] == X.shape[1] + y.shape[1]  # columns
    # handle categorical features
    for col in X.columns.values:
        if X[col].dtype.name == 'category':
            X.loc[:, col] = pd.Series(pd.factorize(X[col])[0])
    # handle target(s) column(s)
    if is_multi_label():
        for col in y.columns.values:
            y[col] = y[col].map({'FALSE': 0, 'TRUE': 1}).to_numpy()
        if LABEL_POWERSET:
            y = pd.Series(LabelPowerset().transform(y), name="class")
    else:
        y = pd.Series(pd.factorize(y)[0], name="class")
    return train_test_split(X, y, test_size=0.2, random_state=seed)

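# Metrics that do not apply to the current task (e.g. ranking metrics on
# single-label predictions) raise, in which case -1.0 is stored as a sentinel.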
def calculate_score(metric, y_true, y_pred, **kwargs):
    try:
        return metric(y_true, y_pred, **kwargs)
    except Exception:
        return -1.0

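# Append one result entry (scores plus timings) to
# results/<dataset_ref>[ps]/automl_<framework>.json, creating folder and file on
# first use.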
def collect_and_persist_results(y_test, y_pred, training_time, test_time, framework="unknown", seed=None):
    suffix = 'ps' if LABEL_POWERSET else ''
    results_folder = f'./results/{get_dataset_ref()}{suffix}'
    results_filename = f'{results_folder}/automl_{framework}.json'
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    if os.path.exists(results_filename):
        with open(results_filename, 'r') as infile:
            all_results = json.load(infile)
    else:
        all_results = {
            'id': get_dataset_ref(),
            'framework': framework,
            'results': []
        }
    this_result = {
        "seed": str(seed),
        "accuracy_score": calculate_score(accuracy_score, y_test, y_pred),
        "average_precision_score": calculate_score(average_precision_score, y_test, y_pred),
        "balanced_accuracy_score": calculate_score(balanced_accuracy_score, y_test, y_pred),
        "cohen_kappa_score": calculate_score(cohen_kappa_score, y_test, y_pred),
        "f1_score_macro": calculate_score(f1_score, y_test, y_pred, average="macro"),
        "f1_score_micro": calculate_score(f1_score, y_test, y_pred, average="micro"),
        "f1_score_weighted": calculate_score(f1_score, y_test, y_pred, average="weighted"),
        "matthews_corrcoef": calculate_score(matthews_corrcoef, y_test, y_pred),
        "precision_score": calculate_score(precision_score, y_test, y_pred),
        "recall_score": calculate_score(recall_score, y_test, y_pred),
        "roc_auc_score": calculate_score(roc_auc_score, y_test, y_pred),
        "coverage_error": calculate_score(coverage_error, y_test, y_pred),
        "label_ranking_average_precision_score": calculate_score(label_ranking_average_precision_score, y_test, y_pred),
        "label_ranking_loss": calculate_score(label_ranking_loss, y_test, y_pred),
        "training_time": training_time,
        "test_time": test_time
    }
    print(f"\n\n{'*' * 80}\n{framework} => {this_result}\n{'*' * 80}\n\n")
    all_results['results'].append(this_result)
    with open(results_filename, 'w') as outfile:
        json.dump(all_results, outfile)
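

# Illustrative use from a benchmark runner (a sketch only; "some_model" and the
# "example" framework name are placeholders, not part of this module):
#
#   seed = PRIME_NUMBERS[0]
#   set_random_seed(seed)
#   X_train, X_test, y_train, y_test = load_data_delegate(seed)
#   TIMER.tic()
#   some_model.fit(X_train, y_train)
#   training_time = TIMER.tocvalue()
#   TIMER.tic()
#   y_pred = some_model.predict(X_test)
#   test_time = TIMER.tocvalue()
#   collect_and_persist_results(y_test, y_pred, training_time, test_time,
#                               framework="example", seed=seed)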