from .logger import LoggingHandler
-from .splits import SlidingWindowTimeSeriesSplit, ExpandingWindowTimeSeriesSplit
+from .splits import SlidingWindowTimeSeriesSplit
from .errors import RawResiduals

-import scipy.io as sio
+# Core dependencies
import pandas as pd
import xarray as xr
import numpy as np
import copy
import matplotlib.pyplot as plt
-from itertools import product
from sklearn.model_selection import KFold

+# Dependencies for data acquisition
+import requests
+from io import BytesIO
+from zipfile import ZipFile
+import tempfile
+import os
+import scipy.io as sio


def load_dummy_data_df(series_count = 10, timestamp_count = 5, time_feature_count = 3, series_feature_count = 2, vs_times_series_factor = 10000, vs_times_timestamps_factor = 100, vs_series_series_factor = 10000):
@@ -39,6 +45,85 @@ def load_dummy_data_df(series_count = 10, timestamp_count = 5, time_feature_coun
    return (dummy_vs_times_df, dummy_vs_series_df)


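+# Helper to fetch a zip archive over HTTP and open it in memory, without
+# writing the download to disk.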
+def download_zipfile(url):
+    content = requests.get(url)
+    zipped = ZipFile(BytesIO(content.content))
+    return zipped
+
+
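+# Weather data from Ramsay's FDA examples (fdaM Matlab distribution): daily
+# average temperature and precipitation per weather station, reshaped to long
+# format with one row per (weather_station, day_of_year).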
+def download_ramsay_weather_data_dfs():
+    zipped = download_zipfile('http://www.psych.mcgill.ca/misc/fda/downloads/FDAfuns/Matlab/fdaM.zip')
+    tempdir = tempfile.TemporaryDirectory()
+    zipped.extract(member = 'examples/weather/daily.mat', path = tempdir.name)
+    weather_data_dict = sio.loadmat(os.path.join(tempdir.name, 'examples/weather/daily.mat'))
+
+    weather_tempav_df = pd.DataFrame(weather_data_dict['tempav'])
+    weather_tempav_df['day_of_year'] = weather_tempav_df.index.values + 1
+    weather_tempav_df = pd.melt(weather_tempav_df, id_vars = ['day_of_year'])
+    weather_tempav_df.rename(columns = {'variable': 'weather_station', 'value': 'tempav'}, inplace = True)
+
+    weather_precav_df = pd.DataFrame(weather_data_dict['precav'])
+    weather_precav_df['day_of_year'] = weather_precav_df.index.values + 1
+    weather_precav_df = pd.melt(weather_precav_df, id_vars = ['day_of_year'])
+    weather_precav_df.rename(columns = {'variable': 'weather_station', 'value': 'precav'}, inplace = True)
+
+    weather_vs_times_df = pd.merge(weather_tempav_df, weather_precav_df)
+    weather_vs_series_df = None
+    return (weather_vs_times_df, weather_vs_series_df)
+
+
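+# Berkeley growth study data from the same fdaM distribution: heights of boys
+# ('hgtmmat') and girls ('hgtfmat') measured at a common set of ages.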
+def download_ramsay_growth_data_dfs():
+    zipped = download_zipfile('http://www.psych.mcgill.ca/misc/fda/downloads/FDAfuns/Matlab/fdaM.zip')
+    tempdir = tempfile.TemporaryDirectory()
+    zipped.extract(member = 'examples/growth/growth.mat', path = tempdir.name)
+    growth_data_dict = sio.loadmat(os.path.join(tempdir.name, 'examples/growth/growth.mat'))
+
+    ages_arr = growth_data_dict['age']
+
+    boys_df = pd.DataFrame(growth_data_dict['hgtmmat'])
+    boys_df['age'] = ages_arr
+    boys_df = pd.melt(boys_df, id_vars = ['age'])
+    boys_df.rename(columns = {'variable': 'cohort_id', 'value': 'height'}, inplace = True)
+    boys_df['gender'] = 'boy'
+
+    girls_df = pd.DataFrame(growth_data_dict['hgtfmat'])
+    girls_df['age'] = ages_arr
+    girls_df = pd.melt(girls_df, id_vars = ['age'])
+    girls_df.rename(columns = {'variable': 'cohort_id', 'value': 'height'}, inplace = True)
+    girls_df['gender'] = 'girl'
+
+    growth_df = pd.concat([boys_df, girls_df])
+
+    growth_vs_times_df = growth_df
+    growth_vs_series_df = growth_df.drop(['age', 'height'], axis = 1).drop_duplicates()
+
+    return (growth_vs_times_df, growth_vs_series_df)
+
+
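+# ECG200 dataset from timeseriesclassification.com: heartbeats of 96 samples
+# each, with class label +1 (normal) or -1 (abnormal).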
+def download_ecg_data_dfs():
+    zipped = download_zipfile('http://timeseriesclassification.com/Downloads/ECG200.zip')
+    tempdir = tempfile.TemporaryDirectory()
+    zipped.extract(member = 'ECG200/ECG200.csv', path = tempdir.name)
+    ecg_filepath = os.path.join(tempdir.name, 'ECG200/ECG200.csv')
+
+    raw_df = pd.read_csv(ecg_filepath, names = ([str(x) for x in range(96)] + ['class_label']), skiprows = 101)
+    raw_df['heartbeat'] = raw_df.index.values
+    ecg_vs_series = raw_df[['heartbeat', 'class_label']].copy()  # copy, so the assignments below do not hit a slice of raw_df
+
+    raw_df = raw_df.melt(id_vars = ['heartbeat', 'class_label'])
+    raw_df.rename(columns = {'variable': 'timestamp', 'value': 'potential_difference'}, inplace = True)
+    raw_df['timestamp'] = raw_df['timestamp'].astype(int)
+    ecg_vs_times = raw_df[['heartbeat', 'timestamp', 'potential_difference']]
+
+    ecg_vs_series['is_abnormal'] = (ecg_vs_series['class_label'] == -1)
+    ecg_vs_series.drop('class_label', axis = 1, inplace = True)
+    return (ecg_vs_times, ecg_vs_series)
+
+
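+# Example usage (a minimal sketch; requires network access, and MultiSeries is
+# defined below):
+#
+#     (ecg_vs_times_df, ecg_vs_series_df) = download_ecg_data_dfs()
+#     data_ecg = MultiSeries(data_vs_times_df = ecg_vs_times_df,
+#                            data_vs_series_df = ecg_vs_series_df,
+#                            time_colname = 'timestamp',
+#                            series_id_colnames = 'heartbeat')
+#     data_ecg.visualise(filter_value_colnames = 'potential_difference')
+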
# Design patterns used: Flyweight, Prototype.
class MultiSeries(LoggingHandler):
    """A collection of time series over a common time index, with per-timestamp and per-series features.
@@ -1167,187 +1252,4 @@ def __repr__(self):
        return (self.__class__.__name__ + '(filtering ' + str(rows_filtered) + '/' + str(rows_total) + ' observations, over ' + str(self._count_series_indices) + ' series x ' + str(self._count_time_indices) + ' timestamps x ' + str(self.count_features) + ' features)')


-
-
-
-##################################################
-# For testing
-##################################################
-
-
-if False:
-
-    (dummy_vs_times_df, dummy_vs_series_df) = load_dummy_data_df()
-    print(dummy_vs_times_df)
-    print(dummy_vs_series_df)
-    data_dummy = MultiSeries(data_vs_times_df = dummy_vs_times_df, data_vs_series_df = dummy_vs_series_df, time_colname = 'timestamp', series_id_colnames = 'series')
-    data_dummy.visualise()
-
-if False:
-
-    (boston_vs_times_df, boston_vs_series_df) = load_boston_housing_data_df()
-    print(boston_vs_times_df)
-    print(boston_vs_series_df)
-    data_boston = MultiSeries(data_vs_times_df = boston_vs_times_df, data_vs_series_df = boston_vs_series_df, time_colname = 'feature', series_id_colnames = 'sample')
-    data_boston.visualise()
-
-if False:
-
-    (weather_vs_times_df, weather_vs_series_df) = load_ramsay_weather_data_dfs()
-    data_weather = MultiSeries(data_vs_times_df = weather_vs_times_df, data_vs_series_df = weather_vs_series_df, time_colname = 'day_of_year', series_id_colnames = 'weather_station')
-
-    (growth_vs_times_df, growth_vs_series_df) = load_ramsay_growth_data_dfs()
-    growth_vs_series_df['gender'] = growth_vs_series_df['gender'].astype('category')
-    growth_vs_series_df = pd.concat([growth_vs_series_df, pd.get_dummies(growth_vs_series_df['gender'])], axis = 1)
-    data_growth = MultiSeries(data_vs_times_df = growth_vs_times_df, data_vs_series_df = growth_vs_series_df, time_colname = 'age', series_id_colnames = ['gender', 'cohort_id'])
-
-    #data_growth.get_backward_time_window(5, 18).visualise(filter_value_colnames='height')
-
-    data_growth.get_backward_time_window(5, 12)._time_uniques_all
-
-    data_growth.get_forward_time_window(5, 15.5)._time_uniques_all
-
-
-
-
-    data_weather_v2 = data_weather.get_backward_time_window(5).new_mutable_instance()
-    data_weather_v2._data_vs_times_df
-
-    data_weather_v2._data_vs_times_df.loc[(33, [366]), ['precav', 'tempav']] = [1, 2]
-    data_weather_v2._data_vs_times_df
-
-    data_weather_v2.set_time_labelled_values(prediction_series = [34], prediction_features = ['precav', 'tempav'], prediction_times = [366], values = [1, 2])
-    data_weather_v2._data_vs_times_df
-
-    # new_data_vs_times_df.loc[(series_id, times_list), value_colnames_vs_times]
-    #data_weather_v2._data_vs_times_df.loc[(33,366),:] = [1,2]
-    #data_weather_v2._data_vs_times_df.sort_index()
-
-    data_growth_v2 = data_growth.get_backward_time_window(5).new_mutable_instance()
-
-
-
-
-if False:
-
-    data_weather.visualise(title = 'Weather data')
-    data_weather.visualise_means(title = 'Weather data')
-    data_weather.visualise_arrays(include_time_as_feature = True)
-
-    data_growth.visualise(title = 'Growth data')
-    data_growth.visualise_means(title = 'Growth data')
-    data_growth.visualise_arrays(include_time_as_feature = True)
-
-
-    # Ready for cross-validation
-    for (ot, ov) in data_weather.generate_series_folds(series_splitter = KFold(n_splits = 5)):
-        print('Outer Loop. Training = ' + str(ot._series_id_uniques) + ' / Validation = ' + str(ov._series_id_uniques))
-        for (it, iv) in ot.generate_series_folds(series_splitter = KFold(n_splits = 5)):
-            print('Inner Loop. Training = ' + str(it._series_id_uniques) + ' / Validation = ' + str(iv._series_id_uniques))
-            for (st, sv) in it.generate_time_windows(time_splitter = SlidingWindowTimeSeriesSplit(count_timestamps = len(it._time_uniques_all), training_set_size = 100, validation_set_size = 50, step = 50)):
-                print('Timeseries Loop. Training = ' + str(st._time_uniques_all) + ' / Validation = ' + str(sv._time_uniques_all))
-
-
-    # Ready for cross-validation
-    for (ot, ov) in data_growth.generate_series_folds(series_splitter = KFold(n_splits = 5)):
-        print('Outer Loop. Training = ' + str(ot._series_id_uniques) + ' / Validation = ' + str(ov._series_id_uniques))
-        for (it, iv) in ot.generate_series_folds(series_splitter = KFold(n_splits = 5)):
-            print('Inner Loop. Training = ' + str(it._series_id_uniques) + ' / Validation = ' + str(iv._series_id_uniques))
-            for (st, sv) in it.generate_time_windows(time_splitter = SlidingWindowTimeSeriesSplit(count_timestamps = len(it._time_uniques_all), training_set_size = 2, validation_set_size = 5, step = 5)):
-                print('Timeseries Loop. Training = ' + str(st._time_uniques_all) + ' / Validation = ' + str(sv._time_uniques_all))
-
-
-
-
-if False:
-
-    # Data: weather
-    (weather_vs_times_df, weather_vs_series_df) = load_ramsay_weather_data_dfs()
-    data_weather = MultiSeries(data_vs_times_df = weather_vs_times_df, data_vs_series_df = weather_vs_series_df, time_colname = 'day_of_year', series_id_colnames = 'weather_station')
-    #data_weather.visualise()
-
-    # Data: growth
-    (growth_vs_times_df, growth_vs_series_df) = load_ramsay_growth_data_dfs()
-    growth_vs_series_df['gender'] = growth_vs_series_df['gender'].astype('category')
-    growth_vs_series_df = pd.concat([growth_vs_series_df, pd.get_dummies(growth_vs_series_df['gender'])], axis = 1)
-    data_growth = MultiSeries(data_vs_times_df = growth_vs_times_df, data_vs_series_df = growth_vs_series_df, time_colname = 'age', series_id_colnames = ['gender', 'cohort_id'])
-    #data_growth.visualise()
-
-    input_sliding_window_size = 10
-    output_sliding_window_size = 5
-
-    (a4d_vs_times_windowed_input, a4d_vs_times_windowed_output) = data_growth.select_paired_tabular_windowed_4d_arrays(input_sliding_window_size = input_sliding_window_size, output_sliding_window_size = output_sliding_window_size)
-    print('a4d_vs_times_windowed_input.shape = ' + str(a4d_vs_times_windowed_input.shape))
-    print('a4d_vs_times_windowed_output.shape = ' + str(a4d_vs_times_windowed_output.shape))
-
-    (a4d_vs_times_windowed_input, a4d_vs_times_windowed_output) = data_weather.select_paired_tabular_windowed_4d_arrays(input_sliding_window_size = input_sliding_window_size, output_sliding_window_size = output_sliding_window_size)
-    print('a4d_vs_times_windowed_input.shape = ' + str(a4d_vs_times_windowed_input.shape))
-    print('a4d_vs_times_windowed_output.shape = ' + str(a4d_vs_times_windowed_output.shape))
-
-
-if False:
-    # Data: weather
-    (weather_vs_times_df, weather_vs_series_df) = load_ramsay_weather_data_dfs()
-    data_weather = MultiSeries(data_vs_times_df = weather_vs_times_df, data_vs_series_df = weather_vs_series_df, time_colname = 'day_of_year', series_id_colnames = 'weather_station')
-
-    data_weather.visualise()
-
-    weather_vs_times_df[weather_vs_times_df.weather_station == 28].set_index('day_of_year')['tempav'].plot()
-    weather_vs_times_df[weather_vs_times_df.weather_station == 28].set_index('day_of_year')['precav'].plot()
-
-
-
-if False:
-    # Data: ECG
-    (ecg_vs_times_df, ecg_vs_series_df) = load_ecg_data_dfs()
-    data_ecg = MultiSeries(data_vs_times_df = ecg_vs_times_df, data_vs_series_df = ecg_vs_series_df, time_colname = 'timestamp', series_id_colnames = 'heartbeat')
-
-    # 133 are normal, the remaining 67 are abnormal.
-    ecg_vs_series_df.groupby('is_abnormal').count()
-    ecg_vs_times_df['timestamp'].max()  # divide into first half (0...47) and second half (48...95)
-
-    data_ecg.visualise(filter_value_colnames = 'potential_difference')
-    data_ecg.visualise_moments(filter_value_colnames = 'potential_difference')
-
-
-
-
-if False:
-    # Data: Starlight
-    (starlight_vs_times_df, starlight_vs_series_df) = load_starlight_data_dfs()
-    data_starlight = MultiSeries(data_vs_times_df = starlight_vs_times_df, data_vs_series_df = starlight_vs_series_df, time_colname = 'folded_time', series_id_colnames = 'starlight_curve')
-
-    starlight_vs_times_df['folded_time'].max()
-
-    data_starlight.visualise(filter_value_colnames = 'magnitude')
-    data_starlight.visualise_moments(filter_value_colnames = 'magnitude')
-
-
-
-if False:
-
-    # Data: Power: Multiple locations & days
-    #(power_vs_times_df, power_vs_series_df) = load_power_data_dfs(power_filename='multiple_locations_multiple_days.csv')
-    (power_vs_times_df, power_vs_series_df) = load_power_data_multiple_locations_multiple_days_dfs()
-    data_power_multiple_locations_multiple_days = MultiSeries(data_vs_times_df = power_vs_times_df, data_vs_series_df = power_vs_series_df, time_colname = 'half_hour', series_id_colnames = 'series_id')
-    data_power_multiple_locations_multiple_days.visualise(title = 'Multiple locations & days')
-    data_power_multiple_locations_multiple_days.visualise_moments(title = 'Multiple locations & days')
-
-    # Data: Power: One day, multiple locations
-    #(power_vs_times_df, power_vs_series_df) = load_power_data_dfs(power_filename='one_day_multiple_locations.csv')
-    (power_vs_times_df, power_vs_series_df) = load_power_data_one_day_multiple_locations_dfs()
-    data_power_one_day_multiple_locations = MultiSeries(data_vs_times_df = power_vs_times_df, data_vs_series_df = power_vs_series_df, time_colname = 'half_hour', series_id_colnames = 'series_id')
-    data_power_one_day_multiple_locations.visualise(title = 'One day, multiple locations')
-    data_power_one_day_multiple_locations.visualise_moments(title = 'One day, multiple locations')
-
-    # Data: Power: One location, multiple days
-    #(power_vs_times_df, power_vs_series_df) = load_power_data_dfs(power_filename='one_location_multiple_days.csv')
-    (power_vs_times_df, power_vs_series_df) = load_power_data_one_location_multiple_days_dfs()
-    data_power_one_location_multiple_days = MultiSeries(data_vs_times_df = power_vs_times_df, data_vs_series_df = power_vs_series_df, time_colname = 'half_hour', series_id_colnames = 'series_id')
-    data_power_one_location_multiple_days.visualise(title = 'One location, multiple days')
-    data_power_one_location_multiple_days.visualise_moments(title = 'One location, multiple days')
-
-
-
-
-
+