generate_datasets.py
import logging
import math
import random
import time
from itertools import cycle
from typing import List, Dict, Tuple

import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification

import settings
import library_adaptations
from utils import calculate_label_skew, calculate_feature_skew, calculate_label_skew_from_counts

def generate_label_skewed_counts(num_clients: int = 2,
                                 num_datapoints_per_client: int = settings.NUM_DATAPOINTS_PER_CLIENT,
                                 num_classes: int = 2,
                                 target_skewness: float = 0.5):
    counts_per_partition = []
    random.seed(settings.RANDOM_SEED)
    # Start from a label skewness of 0.0 by dividing the datapoints equally over the clients.
    for client in range(num_clients):
        client_ = {}
        for label in range(num_classes):
            client_[label] = num_datapoints_per_client // num_classes
        counts_per_partition.append(client_)
    # Remember the previously swapped labels, so that we do not keep swapping the same labels.
    previous = [0, 1]
    # Calculate the label skewness of the generated counts and keep adjusting them until the skewness is
    # within the target range.
    while True:
        label_skewness = calculate_label_skew_from_counts(counts_per_partition)
        # If the calculated skewness is smaller than the target skewness, swap labels such that
        # both clients get more of each other's label.
        if label_skewness < (target_skewness - 0.025):
            client_1 = random.choice(range(num_clients))
            client_2 = random.choice([x for x in range(num_clients) if x != client_1])
            # Select the labels that are closest to the middle of the distribution to swap, so that both
            # clients end up with a more extreme count (e.g. 0 or num_datapoints_per_client // num_classes).
            label_1 = np.argmin([abs(counts_per_partition[client_1][label] - (num_datapoints_per_client // num_classes))
                                 if label not in previous else np.inf for label in range(num_classes)])
            label_2 = np.argmin([abs(counts_per_partition[client_2][label] - (num_datapoints_per_client // num_classes))
                                 if label != label_1 else np.inf for label in range(num_classes)])
            # Swap at most 25 datapoints at once, or the minimum count of the two selected labels.
            to_swap = min(counts_per_partition[client_1][label_1], 25, counts_per_partition[client_2][label_2])
            counts_per_partition[client_1][label_1] -= to_swap
            counts_per_partition[client_2][label_1] += to_swap
            counts_per_partition[client_1][label_2] += to_swap
            counts_per_partition[client_2][label_2] -= to_swap
            previous = [label_1, label_2]
        # If the label skewness is higher than desired, swap labels from the extremes of two clients
        # back towards the average count each client should have.
        elif label_skewness > (target_skewness + 0.025):
            client_1 = random.choice(range(num_clients))
            client_2 = random.choice([x for x in range(num_clients) if x != client_1])
            # Now select the two labels that are furthest from the middle, i.e. closest to
            # 0 or num_datapoints_per_client // num_classes.
            label_1 = np.argmax([abs(counts_per_partition[client_1][label] - (num_datapoints_per_client // num_classes))
                                 if label not in previous else -np.inf for label in range(num_classes)])
            label_2 = np.argmax([abs(counts_per_partition[client_2][label] - (num_datapoints_per_client // num_classes))
                                 if label != label_1 else -np.inf for label in range(num_classes)])
            to_swap = min(counts_per_partition[client_2][label_1], 3, counts_per_partition[client_1][label_2])
            counts_per_partition[client_1][label_1] += to_swap
            counts_per_partition[client_2][label_1] -= to_swap
            counts_per_partition[client_1][label_2] -= to_swap
            counts_per_partition[client_2][label_2] += to_swap
        # Once the label skewness falls within the margin around the target, return the counts per partition
        # and the actual skewness.
        else:
            return counts_per_partition, label_skewness
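
# Illustrative sketch (not part of the original script): how the count generator above can be used on its
# own. The concrete numbers here are assumptions; any two-client, two-class configuration behaves the same way.
def _example_label_skewed_counts():
    counts, skew = generate_label_skewed_counts(num_clients=2,
                                                num_datapoints_per_client=1000,
                                                num_classes=2,
                                                target_skewness=0.5)
    # counts is a list with one {label: count} dict per client; skew is the achieved label skewness,
    # which lies within +/- 0.025 of the requested target.
    for client_idx, client_counts in enumerate(counts):
        print(f"client {client_idx}: {client_counts} (total {sum(client_counts.values())})")
    print(f"achieved label skewness: {skew}")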

def generate_samples(counts_per_partition: List[Dict[int, int]], num_classes: int = settings.NUM_CLASSES):
    x, y = make_classification(n_samples=settings.NUM_DATAPOINTS_PER_CLIENT * len(counts_per_partition),
                               n_features=settings.NUM_FEATURES,
                               n_informative=settings.NUM_FEATURES - settings.NUM_FEATURES_REDUNDANT,
                               n_redundant=settings.NUM_FEATURES_REDUNDANT,
                               n_clusters_per_class=1,
                               n_classes=num_classes,
                               flip_y=0,
                               random_state=settings.RANDOM_SEED)
    # Group the generated datapoints by label and shuffle them within each label.
    label_to_datapoints = dict()
    for x_, y_ in zip(x, y):
        if y_ not in label_to_datapoints:
            label_to_datapoints[y_] = []
        label_to_datapoints[y_].append(x_)
    for label in label_to_datapoints:
        random.shuffle(label_to_datapoints[label])
    # Draw datapoints per client according to the requested per-label counts.
    partitions = []
    for client in counts_per_partition:
        client_x = []
        client_y = []
        for label in client:
            for i in range(client[label]):
                client_x.append(label_to_datapoints[label].pop())
                client_y.append(label)
        partitions.append((np.array(client_x), np.array(client_y)))
    return partitions
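
# Illustrative sketch (not part of the original script): turning the label-skewed counts into actual
# feature/label arrays. The values are taken from settings.py; everything else follows the functions above.
def _example_label_skewed_dataset():
    counts, label_skew = generate_label_skewed_counts(num_clients=settings.NUM_CLIENTS,
                                                      num_datapoints_per_client=settings.NUM_DATAPOINTS_PER_CLIENT,
                                                      num_classes=settings.NUM_CLASSES,
                                                      target_skewness=settings.TARGET_SKEWNESS)
    partitions = generate_samples(counts, num_classes=settings.NUM_CLASSES)
    # Each partition is an (x, y) pair of numpy arrays for one client; the per-label counts in y
    # match the counts dict produced above.
    for client_idx, (x_client, y_client) in enumerate(partitions):
        print(f"client {client_idx}: x {x_client.shape}, y {y_client.shape}, label skew {label_skew}")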

def generate_feature_skewed_dataset(num_clients: int = settings.NUM_CLIENTS,
                                    target_skewness: float = settings.TARGET_FEATURE_SKEWNESS):
    intra_sep = 1
    while True:
        # Use the adapted make_classification from library_adaptations, which supports an intra_class_sep
        # parameter and additionally returns the cluster index z of every sample.
        x, y, z = library_adaptations.make_classification(
            n_samples=settings.NUM_DATAPOINTS_PER_CLIENT * settings.NUM_CLIENTS,
            n_features=settings.NUM_FEATURES, n_informative=settings.NUM_FEATURES,
            n_redundant=0, n_clusters_per_class=num_clients, class_sep=1,
            intra_class_sep=intra_sep, n_classes=settings.NUM_CLASSES,
            random_state=settings.RANDOM_SEED)
        # Assign each within-class cluster to one client.
        clients = []
        for client_idx in range(num_clients):
            mask = z == client_idx
            x_client = x[mask]
            y_client = y[mask]
            clients.append((x_client, y_client))
        skewness = calculate_feature_skew(clients, num_clients)
        print(f"Skewness: {skewness}, intra_sep: {intra_sep}")
        # Adjust the intra-class separation until the feature skewness is within 5% of the target.
        if skewness < target_skewness * 0.95:
            intra_sep *= 1.1
        elif skewness > target_skewness * 1.05:
            intra_sep *= 0.9
        else:
            break
    # plot_data(clients)
    print(f"Generated dataset with feature skewness {skewness}, "
          f"intra sep {intra_sep}")
    print(clients[0][0][:10], clients[0][1][:10])
    return clients, skewness
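
# Illustrative sketch (not part of the original script): the search above is a simple multiplicative
# feedback loop. This hypothetical helper isolates the pattern: measure, then scale the control value
# up or down by 10% until the measurement lands within +/-5% of the target. It assumes the measurement
# grows roughly monotonically with the control value, as feature skewness does with intra_class_sep here.
def _tune_until_within_tolerance(measure, initial_value=1.0, target=1.0, tolerance=0.05,
                                 up_factor=1.1, down_factor=0.9, max_iterations=1000):
    value = initial_value
    measured = measure(value)
    for _ in range(max_iterations):
        if measured < target * (1 - tolerance):
            value *= up_factor
        elif measured > target * (1 + tolerance):
            value *= down_factor
        else:
            break
        measured = measure(value)
    return value, measured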

def generate_combined_skewed_dataset(num_clients: int = settings.NUM_CLIENTS,
                                     target_label_skewness: float = settings.TARGET_SKEWNESS,
                                     target_feature_skewness: float = settings.TARGET_FEATURE_SKEWNESS):
    counts_per_partition, calculated_label_skew = generate_label_skewed_counts(num_clients,
                                                                               settings.NUM_DATAPOINTS_PER_CLIENT,
                                                                               settings.NUM_CLASSES,
                                                                               target_label_skewness)
    intra_sep = 1
    while True:
        # Generate 5% more data than strictly needed to compensate for the label imbalance between clients.
        x, y, z = library_adaptations.make_classification(
            n_samples=round((settings.NUM_DATAPOINTS_PER_CLIENT * num_clients) * 1.05),
            n_features=settings.NUM_FEATURES, n_informative=settings.NUM_FEATURES,
            n_redundant=0, n_clusters_per_class=num_clients, class_sep=30,
            intra_class_sep=intra_sep, n_classes=settings.NUM_CLASSES,
            random_state=settings.RANDOM_SEED,
            weights=list(np.ones(settings.NUM_CLASSES) / settings.NUM_CLASSES))
        # Group the generated datapoints by label and shuffle them within each label.
        label_to_datapoints = dict()
        for x_, y_ in zip(x, y):
            if y_ not in label_to_datapoints:
                label_to_datapoints[y_] = []
            label_to_datapoints[y_].append(x_)
        for label in label_to_datapoints:
            random.shuffle(label_to_datapoints[label])
        # Draw datapoints per client according to the label-skewed counts.
        partitions = []
        for client in counts_per_partition:
            client_x = []
            client_y = []
            for label in client:
                for i in range(client[label]):
                    client_x.append(label_to_datapoints[label].pop())
                    client_y.append(label)
            partitions.append((np.array(client_x), np.array(client_y)))
        calculated_feature_skew = calculate_feature_skew(partitions, num_clients)
        print(f"Skewness: {calculated_feature_skew}, intra_sep: {intra_sep}")
        # Adjust the intra-class separation until the feature skewness is within 5% of the target.
        if calculated_feature_skew < target_feature_skewness * 0.95:
            intra_sep *= 1.1
        elif calculated_feature_skew > target_feature_skewness * 1.05:
            intra_sep *= 0.9
        else:
            break
    return partitions, calculated_label_skew, calculated_feature_skew
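
# Illustrative sketch (not part of the original script): building the feature-skewed and combined datasets
# with the configuration from settings.py when the module is run directly. It only relies on the functions
# defined above.
if __name__ == "__main__":
    feature_partitions, achieved_feature_skew = generate_feature_skewed_dataset()
    combined_partitions, combined_label_skew, combined_feature_skew = generate_combined_skewed_dataset()
    print(f"feature-skewed dataset: {len(feature_partitions)} clients, feature skew {achieved_feature_skew}")
    print(f"combined dataset: {len(combined_partitions)} clients, "
          f"label skew {combined_label_skew}, feature skew {combined_feature_skew}")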