SciTransformer/prepare_data.py at main · thephet/SciTransformer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#########################################################################################
#
# This script reads the experiment files and converts their contents into Python
# data structures (lists).
#
# JSON files are related to the input motor pattern speeds.
# It has values between -10... 10. only integers.
# It has 1 entry per minute, and each entry has 25 values (one per motor)
#
# CSV files are related to the state of the BZ as returned by the SVM or blue channel
# Its values are either 0 (non oscillation) or 1 (oscillation) for binarized ones.
# Or it has the blue channel value, from 0 to 255.
# It has 1 entry per frame (total of 7200 frames), and 25 values (BZ cells)
#
# The objective will be to read these files and prepare a keras Generator.
#
#########################################################################################


import glob
import json
import os
import pickle
import random

import numpy as np
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator


def csv2data(pathToFolder="data/binary/*.csv"):
    ''' Read the binarized CSV files and their JSON pattern files into lists.

    The CSV must be as received from the SVM (0s or 1s). Each CSV is transposed
    after loading so that the first axis indexes frames, and the first 2000
    frames are dropped. The matching JSON file is looked up next to the CSV:
    its name is the CSV file name up to the first underscore, plus ".json".

    pathToFolder: glob expression selecting the CSV files to load.

    It returns a list with the oscillations, and a list with the patterns.
    Both lists have one entry per CSV file (files are processed in sorted
    order), and each entry holds one row per kept frame.'''

    # each experiment has 30 random patterns of 1 minute
    PATTERNS_PER_EXPERIMENT = 30
    CSV_ROWS = 7200 # videos have 7200 frames
    # the initial frames are removed because they are BZ noisy
    SKIP_FRAMES = 2000

    # here we will save the output oscillations generated by the BZ
    oscillations = []
    # here we will save the input patterns, which in this experiment were 1 a minute
    patterns = []

    # if each pattern flashes for 1 minute,
    # then we can calculate how many frames that pattern was flashed
    frames_per_pattern = CSV_ROWS // PATTERNS_PER_EXPERIMENT
    # the JSON file lists each pattern only once (1 per minute) but we need 1 pattern
    # per frame, so build the per-pattern repeat counts used by np.repeat below
    repeats = np.repeat(frames_per_pattern, PATTERNS_PER_EXPERIMENT)

    # the csv files contain the BZ oscillations, the json files contain the input pattern
    # (sorted so that the order of the returned experiments is reproducible)
    for csv_file in sorted(glob.glob(pathToFolder)):

        current_oscillations = np.genfromtxt(
            csv_file, delimiter=',').T.tolist()[SKIP_FRAMES:CSV_ROWS]
        oscillations.append(current_oscillations)

        # the pattern json file shares the experiment name: strip the extension and
        # whatever follows the first underscore of the *file name* only, so that
        # underscores in directory names do not break the lookup
        folder, file_name = os.path.split(csv_file)
        stem = os.path.splitext(file_name)[0]
        exp_name = os.path.join(folder, stem.split("_")[0])

        with open(exp_name + ".json") as json_file:
            pattern_json = json.load(json_file)

        # speed goes from -10 to 10; dividing by 10 maps it to -1 ... 1
        jsonpattern = [
            [speed / 10. for speed in motors.values()]
            for motors in pattern_json.values()
        ]

        # one pattern row per frame, then keep the same frames as the oscillations
        current_pattern = np.repeat(jsonpattern, repeats, axis=0).tolist()
        current_pattern = current_pattern[
            SKIP_FRAMES:SKIP_FRAMES + len(current_oscillations)]
        patterns.append(current_pattern)

    print(len(oscillations))
    print(len(patterns))

    return oscillations, patterns


def csvraw2data(pathToFolder="data/raw/*.csv"):
    ''' Read the raw CSV files and their JSON pattern files into lists.

    The CSV must be raw data (blue channel 0...255). Each CSV is transposed
    after loading so that the first axis indexes frames, the first 2000 frames
    are dropped, and the moving average is subtracted from the signal. Once
    all files are loaded, the oscillations are min-max normalized using the
    global minimum and maximum over every experiment. The matching JSON file
    is looked up next to the CSV: its name is the CSV file name up to the
    first underscore, plus ".json".

    pathToFolder: glob expression selecting the CSV files to load.

    It returns a list with the normalized oscillations, and a list with the
    patterns. Both lists have one entry per CSV file (files are processed in
    sorted order).'''

    # each experiment has 30 random patterns of 1 minute
    PATTERNS_PER_EXPERIMENT = 30
    CSV_ROWS = 7200 # videos have 7200 frames
    # the initial frames are removed because they are BZ noisy
    SKIP_FRAMES = 2000

    # here we will save the output oscillations generated by the BZ
    oscillations = []
    # here we will save the input patterns, which in this experiment were 1 a minute
    patterns = []

    # if each pattern flashes for 1 minute,
    # then we can calculate how many frames that pattern was flashed
    frames_per_pattern = CSV_ROWS // PATTERNS_PER_EXPERIMENT
    # the JSON file lists each pattern only once (1 per minute) but we need 1 pattern
    # per frame, so build the per-pattern repeat counts used by np.repeat below
    repeats = np.repeat(frames_per_pattern, PATTERNS_PER_EXPERIMENT)

    # the csv files contain the BZ oscillations, the json files contain the input pattern
    # (sorted so that the order of the returned experiments is reproducible)
    for csv_file in sorted(glob.glob(pathToFolder)):

        current_oscillations = np.genfromtxt(
            csv_file, delimiter=',').T[SKIP_FRAMES:CSV_ROWS]
        # subtract the moving average from the oscillation
        # to remove the fact that it gets reddish
        detrended = current_oscillations - moving_average(current_oscillations)
        oscillations.append(detrended.tolist())

        # the pattern json file shares the experiment name: strip the extension and
        # whatever follows the first underscore of the *file name* only, so that
        # underscores in directory names do not break the lookup
        folder, file_name = os.path.split(csv_file)
        stem = os.path.splitext(file_name)[0]
        exp_name = os.path.join(folder, stem.split("_")[0])

        with open(exp_name + ".json") as json_file:
            pattern_json = json.load(json_file)

        # speed goes from -10 to 10; dividing by 10 maps it to -1 ... 1
        # (deliberately kept in -1 ... 1 rather than rescaled to 0 ... 1)
        jsonpattern = [
            [speed / 10. for speed in motors.values()]
            for motors in pattern_json.values()
        ]

        # one pattern row per frame, then keep the same frames as the oscillations
        current_pattern = np.repeat(jsonpattern, repeats, axis=0).tolist()
        current_pattern = current_pattern[
            SKIP_FRAMES:SKIP_FRAMES + len(current_oscillations)]
        patterns.append(current_pattern)

    # get the min and max oscillation value, to normalize it between 0 and 1
    tmax, tmin = getMinMaxLists(oscillations)
    print(len(oscillations))
    print(len(patterns))
    print(f'min {tmin} max {tmax}')

    # do the normalization, and return
    return applyMinMax(oscillations, tmax, tmin), patterns


def moving_average(a, n=10):
    ''' Calculates a trailing moving average along the first axis.

    a: array-like (numpy array or nested list) of samples.
    n: window size, a positive integer.

    Returns an array with the same shape as a. From index n onwards each entry
    is the mean of the n most recent samples (the current one included); the
    first n entries are just copied from the input.

    Raises ValueError if n is smaller than 1.'''

    if n < 1:
        raise ValueError("n must be a positive integer")

    # coerce first: with a plain list, a[:n] * n would repeat the list
    # instead of scaling its values
    a = np.asarray(a)

    ret = np.cumsum(a, axis=0)
    # difference of cumulative sums n apart == sum over the last n samples
    ret[n:] = ret[n:] - ret[:-n]
    # edge values: pre-multiplied by n so the division below gives them back as-is
    ret[:n] = a[:n] * n
    return ret / n


def getMinMaxLists(lists):
    ''' Given a list of lists, it will go through all of them and get min and max.
    A single np call on the whole input is not used because the inner lists
    can have different sizes.

    lists: iterable of array-likes (possibly nested). Empty entries are skipped.

    Returns (total_max, total_min) over every value found. If no maximum
    (or minimum) could be determined, e.g. because lists is empty, the
    historical defaults -100 (max) and 999 (min) are returned instead.'''

    # start from +-infinity so that data entirely below -100 or above 999
    # is still reported correctly
    total_max, total_min = -np.inf, np.inf

    for l in lists:
        values = np.asarray(l)

        # nothing to compare in an empty entry (np.amax would raise on it)
        if values.size == 0:
            continue

        current_max = np.amax(values)
        current_min = np.amin(values)

        if current_max > total_max:
            total_max = current_max

        if current_min < total_min:
            total_min = current_min

    # keep the previous defaults when nothing was found
    if total_max == -np.inf:
        total_max = -100
    if total_min == np.inf:
        total_min = 999

    return total_max, total_min


def applyMinMax(lists, tmax, tmin):
    ''' Rescales every element of every list with (value - tmin) / (tmax - tmin).

    lists: iterable of array-likes, which may have different sizes.
    tmax, tmin: the values mapped to 1 and 0 respectively.

    Returns a new list holding the rescaled entries as plain Python lists.'''

    return [
        ((np.array(entry) - tmin) / (tmax - tmin)).tolist()
        for entry in lists
    ]


def getMinMaxFolder(pathToFolder="data/raw/*.csv"):
    '''Given a folder with experiments, with CSVs, it will return max and min CSV value.

    Each CSV is transposed after loading and only positions 2000 to 7200 of
    the first axis are inspected. Files left with no data after that cut are
    skipped. The name of every file whose minimum is below 1 is printed.

    pathToFolder: glob expression selecting the CSV files to inspect.

    Returns (total_max, total_min). If no maximum (or minimum) could be
    determined, e.g. because no file matched, the historical defaults
    -100 (max) and 999 (min) are returned instead.'''

    # start from +-infinity so that data entirely below -100 or above 999
    # is still reported correctly
    total_max, total_min = -np.inf, np.inf

    for csv_file in glob.glob(pathToFolder):
        # remove the initial 2000 positions because they are BZ noisy
        current_oscillations = np.genfromtxt(csv_file, delimiter=',').T[2000:7200]

        # nothing left to compare (np.amax would raise on an empty array)
        if current_oscillations.size == 0:
            continue

        file_max = np.amax(current_oscillations)
        file_min = np.amin(current_oscillations)

        # report files with suspiciously low values
        if file_min < 1:
            print(csv_file)

        if file_max > total_max:
            total_max = file_max

        if file_min < total_min:
            total_min = file_min

    # keep the previous defaults when nothing was found
    if total_max == -np.inf:
        total_max = -100
    if total_min == np.inf:
        total_min = 999

    return total_max, total_min


def data2networkIOGen(oscillations, patterns, sampling_rate=1):
    ''' this will take the lists previously generated, and process them
    so that they can be used as IO in the RNN to train.

    oscillations, patterns: one entry per experiment, each a sequence with
        one row per frame. Rows are joined with "+", so they are expected to
        be plain lists (25 pattern values followed by 25 oscillation values
        give the 50 features used in the reshape below).
    sampling_rate: keep one frame out of every sampling_rate frames inside
        each window. It must divide the window length (1200 frames).

    Every window of 1200 consecutive frames (sliding one frame at a time)
    becomes one input sequence; its target is the oscillation sequence shifted
    right by one sampled step.

    It returns a TimeseriesGenerator (length 1, batch size 64) built from the
    inputs and targets.

    Raises ValueError if sampling_rate does not divide the window length.'''

    sr = sampling_rate
    sequence_length = 240*5 # 4 frames per second, so 5 minute of real video

    # otherwise the sampled windows would hold more than sequence_length // sr
    # rows and the reshape at the end would fail with an obscure error
    if sequence_length % sr != 0:
        raise ValueError(
            f'sampling_rate must divide {sequence_length}, got {sampling_rate}')

    network_input = []
    # final length of the sequences considering sampling rate
    out_seq = sequence_length // sr
    # fill first position with a dummy because the generator pairs
    # input i with output i+1
    network_output = [ [0]*25 ]

    for o, osc in enumerate(oscillations):
        print(o)
        pat = patterns[o]

        # create input sequences and the corresponding outputs
        for i in range(len(osc) - sequence_length):
            pat_window = pat[i : i + sequence_length : sr]
            osc_window = osc[i : i + sequence_length : sr]
            # each input row is the pattern row followed by the oscillation row
            sequence_in = [p + q for p, q in zip(pat_window, osc_window)]
            # shifted right by one
            sequence_out = osc[i+sr : i+sequence_length+sr : sr]
            network_input.append(sequence_in)
            network_output.append(sequence_out)

    # add dummy entry at the end because we added a dummy at start of output,
    # so that inputs and outputs have the same length
    network_input.append([ [0]*50 ] * out_seq)

    n_patterns = len(network_input)
    network_input = np.reshape(network_input, (n_patterns, out_seq, 50))

    generator = TimeseriesGenerator(network_input, network_output, 1, batch_size=64)

    return generator


if __name__ == "__main__":

    # load the binarized experiments and turn them into a training generator,
    # keeping one frame out of every 8
    osc_data, pattern_data = csv2data()
    generator = data2networkIOGen(osc_data, pattern_data, sampling_rate=8)

    # cache the generator in a pickle file, because building it takes a while
    with open('/home/data/juanma/BZ/databinsr8_-1to1.p', 'wb') as out_file:
        pickle.dump(generator, out_file)