-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_data.py
More file actions
40 lines (29 loc) · 1.11 KB
/
preprocess_data.py
File metadata and controls
40 lines (29 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pandas as pd
import numpy as np
from multiprocessing import Pool
import tqdm
import multiprocessing as mp
"""Script for data preprocessing"""
# Location of the input CSV; passed to pd.read_csv when the script is run directly.
DATA_PATH = "https://raw.githubusercontent.com/mkmkl93/ml-ca/master/data/uniform_200k/dataset1_200.csv"
# File that main() writes the concatenated result to via DataFrame.to_csv.
OUTPUT_DATA_PATH = "data/data.csv"
def preprocess_row(idx):
    """Expand one wide row of the module-level ``data`` frame into a long table.

    The row is read positionally: position 0 is skipped (presumably an
    identifier/index column -- confirm against the dataset), the positions
    from 1 up to the last column alternate between a "time" value and a
    "dose" value, and the last column holds the target. One extra all-zero
    time/dose step is appended at the end to carry the target.

    Args:
        idx: Positional index of the row in ``data``; also stored in the
            ``series`` column of every output row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``time``, ``dose``,
        ``series``, ``time_idx``, ``is_target`` and ``target``. It has one
        row per time/dose pair plus the final appended row, which is the
        only one with ``is_target == 1`` and a non-zero-padded ``target``.
    """
    row = data.iloc[idx]

    # Always go through .iloc: the row is indexed by column labels, and
    # integer keys such as row[-1] only worked through a positional fallback
    # that pandas has deprecated and later removed.
    times = row.iloc[1:-2:2].tolist() + [0]
    doses = row.iloc[2:-1:2].tolist() + [0]

    n_rows = len(times)
    padding = [0] * (n_rows - 1)

    return pd.DataFrame(
        {
            "time": times,
            "dose": doses,
            "series": idx,
            "time_idx": range(n_rows),
            "is_target": padding + [1],
            "target": padding + [row.iloc[-1]],
        }
    )
def _init_worker(frame):
    """Bind ``frame`` to the module-level ``data`` name inside a worker process."""
    global data
    data = frame


def main():
    """Preprocess every row of ``data`` in parallel and write the result to CSV.

    Expects the module-level ``data`` frame to be set before the call. Each
    row index is handed to ``preprocess_row`` in a process pool, the
    per-row frames are concatenated, and the result is written to
    ``OUTPUT_DATA_PATH``. Rows are collected with ``imap_unordered``, so the
    order of the series in the output file is not guaranteed.
    """
    from pathlib import Path

    print(f"Availiable CPU cores number is {mp.cpu_count()}")
    k = data.shape[0]

    # Leave one core free, but never ask for zero workers: Pool rejects a
    # process count below 1, which is what a single-core machine would give.
    n_workers = max(1, mp.cpu_count() - 1)

    # Hand the frame to every worker explicitly. Workers started with the
    # "spawn" method re-import this module without running the __main__
    # guard, so they would otherwise not have ``data`` defined.
    with Pool(n_workers, initializer=_init_worker, initargs=(data,)) as p:
        results = list(tqdm.tqdm(p.imap_unordered(preprocess_row, range(k)), total=k))

    new_data = pd.concat(results)

    # Make sure the target directory exists so the write does not fail
    # after all rows have already been processed.
    Path(OUTPUT_DATA_PATH).parent.mkdir(parents=True, exist_ok=True)
    new_data.to_csv(OUTPUT_DATA_PATH)
# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # Bind the input table to the module-level name ``data`` first;
    # main() and preprocess_row() both read it as a global.
    data = pd.read_csv(DATA_PATH)
    main()