# utils_stats_openml.py
import json
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from skmultilearn.problem_transform import LabelPowerset
from ydata_profiling import ProfileReport

# OpenML data IDs of the benchmark datasets, grouped by classification task type
dataset_ids = {
    'binary': [31, 37, 44, 1462, 1479, 1510, 40945],
    'multiclass': [23, 36, 54, 181, 1466, 40691, 40975],
    'multilabel': [285, 41464, 41465, 41468, 41470, 41471, 41473]
}


def get_data_type(dataset_type, dataset_id, X):
    """Profile the features with ydata-profiling and classify the dataset as
    Quantitative (all numeric), Qualitative (all categorical) or Mixed."""
    profile = ProfileReport(X)
    profile.to_file(f"reports/{dataset_type}_{dataset_id}.html")
    data_types = {k: v['type'] for k, v in profile.description_set.variables.items()}
    numerical_cols = [col for col, dtype in data_types.items() if dtype == "Numeric"]
    categorical_cols = [col for col, dtype in data_types.items() if dtype == "Categorical"]
    if len(numerical_cols) == len(data_types):  # all columns are numerical (quantitative)
        data_type = "Quantitative"
    elif len(categorical_cols) == len(data_types):  # all columns are categorical (qualitative)
        data_type = "Qualitative"
    else:  # both numerical and categorical columns are present
        data_type = "Mixed"
    return data_type


def calculate_complexity_binary_multiclass(n_samples, n_features, n_classes):
    """Complexity heuristic: (n_features * n_classes) / n_samples."""
    complexity = (n_features * n_classes) / n_samples
    return round(complexity, 3)
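
# Worked example (illustrative numbers, not tied to any of the datasets above):
# 1000 samples, 20 features and 2 classes give
#   calculate_complexity_binary_multiclass(1000, 20, 2) == round((20 * 2) / 1000, 3) == 0.04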


def calculate_complexity_multilabel(n_samples, n_features, y):
    """Complexity heuristic for multilabel targets: the class count is replaced
    by label_cardinality * total_labels, an "effective" number of classes."""
    total_labels = y.columns.size
    total_labels_assigned = y.values.sum()
    num_samples = len(y)
    label_cardinality = total_labels_assigned / num_samples  # mean number of labels per sample
    effective_classes = label_cardinality * total_labels
    complexity = (n_features * effective_classes) / n_samples
    return round(complexity, 3)
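
# Worked example (hypothetical 4-sample, 3-label indicator frame):
#   y = pd.DataFrame({'a': [1, 0, 1, 0], 'b': [1, 1, 0, 0], 'c': [0, 1, 1, 0]})
#   label cardinality = 6 assignments / 4 samples = 1.5
#   effective classes = 1.5 * 3 labels = 4.5
#   calculate_complexity_multilabel(4, 10, y) == round((10 * 4.5) / 4, 3) == 11.25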


def preprocess_flags(X, y):
    """Rebuild the feature/label split of the 'flags' dataset (OpenML data id 285),
    using the label set of the Boomer-Datasets version of flags.arff:
    https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9321731
    https://github.com/mrapp-ke/Boomer-Datasets/raw/refs/heads/main/flags.arff
    """
    df = pd.concat([X, y], axis='columns')
    label_columns = [
        'crescent', 'triangle', 'icon', 'animate', 'text', 'red',
        'green', 'blue', 'gold', 'white', 'black', 'orange'
    ]
    y = df[label_columns].astype(int)  # select only the label columns
    for col in y.columns.values:
        y[col] = y[col].map({0: 'FALSE', 1: 'TRUE'})
    X = df.drop(columns=label_columns).infer_objects()  # drop the label columns to get the features
    for col in X.columns:
        if col not in ['mainhue', 'topleft', 'botright']:  # all but the string columns become numeric
            X[col] = X[col].astype(float)
    assert df.shape[0] == X.shape[0]  # rows preserved in X
    assert df.shape[0] == y.shape[0]  # rows preserved in y
    assert df.shape[1] == X.shape[1] + y.shape[1]  # columns partitioned into X and y
    return X, y


def load_openml(dataset_type, dataset_id):
    """Fetch an OpenML dataset and compute its summary statistics."""
    dataset = fetch_openml(data_id=dataset_id, return_X_y=False, parser='auto')
    X, y = dataset.data.copy(deep=True), dataset.target.copy(deep=True)
    if dataset_id == 285:  # flags needs a custom feature/label split
        X, y = preprocess_flags(X, y)
    ds_id, name = dataset.details['id'], dataset.details['name']  # avoid shadowing the builtin id
    data_type = get_data_type(dataset_type, dataset_id, X)
    n_samples, n_features = X.shape[0], X.shape[1]
    clf_type, n_classes, complexity = None, np.nan, np.nan
    if isinstance(y, pd.Series):  # binary or multiclass target
        n_classes = y.nunique()
        clf_type = 'binary' if n_classes == 2 else 'multiclass'
        complexity = calculate_complexity_binary_multiclass(n_samples, n_features, n_classes)
    elif isinstance(y, pd.DataFrame):  # multilabel target
        n_classes = y.columns.size
        clf_type = 'multilabel'
        for col in y.columns.values:  # convert TRUE/FALSE strings to a binary indicator matrix
            y[col] = y[col].map({'FALSE': 0, 'TRUE': 1}).to_numpy()
        complexity = calculate_complexity_multilabel(n_samples, n_features, y)
    latex_str = ' & '.join([str(x) for x in [ds_id, name, data_type, n_samples, n_features, n_classes, complexity]])
    result = {
        "type": clf_type,
        "id": ds_id,
        "name": name,
        "data_type": data_type,
        "n_samples": n_samples,
        "n_features": n_features,
        "n_classes": n_classes,
        "complexity": complexity,
        "latex_str": latex_str
    }
    pprint(result, width=200)
    return result
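
# Example call (hypothetical): collect the statistics for one dataset, e.g.
#   row = load_openml('binary', 31)  # data id 31 is credit-g on OpenML
# row['latex_str'] then holds the ' & '-joined fields for a LaTeX table row.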


def calculate_powerset(dataset_id):
    """Report how many distinct label combinations a multilabel dataset contains."""
    dataset = fetch_openml(data_id=dataset_id, return_X_y=False, parser='auto')
    X, y = dataset.data.copy(deep=True), dataset.target.copy(deep=True)
    if dataset_id == 285:  # flags needs a custom feature/label split
        X, y = preprocess_flags(X, y)
    for col in y.columns.values:  # convert TRUE/FALSE strings to a binary indicator matrix
        y[col] = y[col].map({'FALSE': 0, 'TRUE': 1}).to_numpy()
    # LabelPowerset assigns one class id per unique label combination
    y_ps = pd.Series(LabelPowerset().transform(y))
    print(dataset_id, y_ps.nunique(), 'unique label combinations')
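
# Worked example (hypothetical): for label rows [1, 0, 1], [1, 0, 1], [0, 1, 0],
# LabelPowerset().transform(...) maps the two identical rows to the same class id,
# so the count printed above would be 2 unique label combinations.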


def merge_jsons():
    """Merge the per-task-type JSON reports into a single reports/summary.json."""
    summary = {}
    for ds_type in dataset_ids.keys():
        with open(f'reports/{ds_type}.json', 'r', encoding='utf-8') as fp:
            summary[ds_type] = json.load(fp)
    with open('reports/summary.json', 'w', encoding='utf-8') as fp:
        json.dump(summary, fp, indent=4, ensure_ascii=True)


if __name__ == "__main__":
    # summary = {}
    # for ds_type, ds_ids in dataset_ids.items():
    #     summary[ds_type] = []
    #     print(ds_type)
    #     for ds_id in ds_ids:
    #         result = load_openml(ds_type, ds_id)
    #         summary[ds_type].append(result)
    #     with open(f'reports/{ds_type}.json', 'w', encoding='utf-8') as fp:
    #         json.dump(summary[ds_type], fp, indent=4, ensure_ascii=True)
    # with open('reports/summary.json', 'w', encoding='utf-8') as fp:
    #     json.dump(summary, fp, indent=4, ensure_ascii=True)
    merge_jsons()