Skip to content

Commit cf0673e

Browse files
author
ahmedgc
committed
Code
Some cleanup, download data, and now builds as 0.0.2.
1 parent 220e88c commit cf0673e

35 files changed

+327
-2761
lines changed

build/lib/pysf/data.py

Lines changed: 89 additions & 187 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
11

22

33
from .logger import LoggingHandler
4-
from .splits import SlidingWindowTimeSeriesSplit, ExpandingWindowTimeSeriesSplit
4+
from .splits import SlidingWindowTimeSeriesSplit
55
from .errors import RawResiduals
66

7-
import scipy.io as sio
7+
# Core dependencies
88
import pandas as pd
99
import xarray as xr
1010
import numpy as np
1111
import copy
1212
import matplotlib.pyplot as plt
13-
from itertools import product
1413
from sklearn.model_selection import KFold
1514

15+
# Dependencies for data acquisition
16+
import requests
17+
from io import BytesIO
18+
from zipfile import ZipFile
19+
import tempfile
20+
import os
21+
import scipy.io as sio
1622

1723

1824
def load_dummy_data_df(series_count = 10, timestamp_count = 5, time_feature_count = 3, series_feature_count = 2, vs_times_series_factor = 10000, vs_times_timestamps_factor = 100, vs_series_series_factor = 10000):
@@ -39,6 +45,85 @@ def load_dummy_data_df(series_count = 10, timestamp_count = 5, time_feature_coun
3945
return (dummy_vs_times_df, dummy_vs_series_df)
4046

4147

48+
def download_zipfile(url):
    """Download the file at *url* and return it as an in-memory ZipFile.

    Parameters
    ----------
    url : str
        HTTP URL of a zip archive.

    Returns
    -------
    zipfile.ZipFile
        Archive wrapped around the downloaded bytes, ready for member
        extraction.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    """
    response = requests.get(url)
    # Fail fast on HTTP errors; otherwise a 404/500 HTML payload would be
    # handed to ZipFile and surface as a confusing BadZipFile error.
    response.raise_for_status()
    return ZipFile(BytesIO(response.content))
52+
53+
54+
def download_ramsay_weather_data_dfs():
    """Download Ramsay's FDA daily weather data and return long-format DataFrames.

    Fetches the Matlab ``fdaM`` examples archive, extracts
    ``examples/weather/daily.mat`` and reshapes its ``tempav`` (average
    temperature) and ``precav`` (average precipitation) matrices into a single
    long-format DataFrame keyed by ``(day_of_year, weather_station)``.

    Returns
    -------
    tuple of (pandas.DataFrame, None)
        First element has columns ``['day_of_year', 'weather_station',
        'tempav', 'precav']``; the second (per-series data) is always None
        since this dataset has no per-series covariates.
    """
    zipped = download_zipfile('http://www.psych.mcgill.ca/misc/fda/downloads/FDAfuns/Matlab/fdaM.zip')
    # Context manager guarantees the extracted file is removed deterministically,
    # rather than whenever the TemporaryDirectory object happens to be GC'd.
    with tempfile.TemporaryDirectory() as tempdir_name:
        zipped.extract(member='examples/weather/daily.mat', path=tempdir_name)
        weather_data_dict = sio.loadmat(os.path.join(tempdir_name, 'examples/weather/daily.mat'))

    def _melt_matrix(matrix_key, value_colname):
        # Matrix rows are days of the year, columns are weather stations.
        df = pd.DataFrame(weather_data_dict[matrix_key])
        df['day_of_year'] = df.index.values + 1  # days are 1-based
        df = pd.melt(df, id_vars=['day_of_year'])
        df.rename(columns={'variable': 'weather_station', 'value': value_colname}, inplace=True)
        return df

    weather_tempav_df = _melt_matrix('tempav', 'tempav')
    weather_precav_df = _melt_matrix('precav', 'precav')

    # Inner-merge on the shared (day_of_year, weather_station) key columns.
    weather_vs_times_df = pd.merge(weather_tempav_df, weather_precav_df)
    weather_vs_series_df = None
    return (weather_vs_times_df, weather_vs_series_df)
75+
76+
77+
def download_ramsay_growth_data_dfs():
    """Download Ramsay's Berkeley growth data and return long-format DataFrames.

    Fetches the Matlab ``fdaM`` examples archive, extracts
    ``examples/growth/growth.mat`` and reshapes the boys' (``hgtmmat``) and
    girls' (``hgtfmat``) height matrices into one long-format DataFrame keyed
    by ``(age, gender, cohort_id)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First element has columns ``['age', 'cohort_id', 'height', 'gender']``;
        the second holds one row per series with the ``['cohort_id', 'gender']``
        identifiers only.
    """
    zipped = download_zipfile('http://www.psych.mcgill.ca/misc/fda/downloads/FDAfuns/Matlab/fdaM.zip')
    # Context manager guarantees the extracted file is removed deterministically,
    # rather than whenever the TemporaryDirectory object happens to be GC'd.
    with tempfile.TemporaryDirectory() as tempdir_name:
        zipped.extract(member='examples/growth/growth.mat', path=tempdir_name)
        growth_data_dict = sio.loadmat(os.path.join(tempdir_name, 'examples/growth/growth.mat'))

    ages_arr = growth_data_dict['age']

    def _melt_heights(matrix_key, gender_label):
        # Matrix rows are measurement ages, columns are children (cohort ids).
        df = pd.DataFrame(growth_data_dict[matrix_key])
        df['age'] = ages_arr
        df = pd.melt(df, id_vars=['age'])
        df.rename(columns={'variable': 'cohort_id', 'value': 'height'}, inplace=True)
        df['gender'] = gender_label
        return df

    growth_df = pd.concat([_melt_heights('hgtmmat', 'boy'),
                           _melt_heights('hgtfmat', 'girl')])

    growth_vs_times_df = growth_df
    # One row per (gender, cohort_id) series; drop the per-timestamp columns.
    growth_vs_series_df = growth_df.drop(['age', 'height'], axis=1).drop_duplicates()

    return (growth_vs_times_df, growth_vs_series_df)
105+
106+
107+
def download_ecg_data_dfs():
    """Download the ECG200 dataset and return long-format DataFrames.

    Fetches the ECG200 archive, reads its CSV (96 potential-difference
    readings per heartbeat plus a class label) and reshapes it into a
    per-timestamp DataFrame and a per-series DataFrame.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First element has columns ``['heartbeat', 'timestamp',
        'potential_difference']``; the second has ``['heartbeat',
        'is_abnormal']`` where ``is_abnormal`` is True when the original
        class label is -1.
    """
    zipped = download_zipfile('http://timeseriesclassification.com/Downloads/ECG200.zip')
    # Context manager guarantees the extracted file is removed deterministically,
    # rather than whenever the TemporaryDirectory object happens to be GC'd.
    with tempfile.TemporaryDirectory() as tempdir_name:
        zipped.extract(member='ECG200/ECG200.csv', path=tempdir_name)
        ecg_filepath = os.path.join(tempdir_name, 'ECG200/ECG200.csv')
        # 96 readings named '0'..'95' plus the class label; skiprows drops the
        # file's preamble.
        raw_df = pd.read_csv(ecg_filepath,
                             names=([str(x) for x in range(96)] + ['class_label']),
                             skiprows=101)

    raw_df['heartbeat'] = raw_df.index.values
    # .copy() so the column assignment and drop below mutate an independent
    # frame, not a view of raw_df (avoids SettingWithCopyWarning and
    # chained-assignment bugs).
    ecg_vs_series = raw_df[['heartbeat', 'class_label']].copy()

    raw_df = raw_df.melt(id_vars=['heartbeat', 'class_label'])
    raw_df.rename(columns={'variable': 'timestamp', 'value': 'potential_difference'}, inplace=True)
    raw_df['timestamp'] = raw_df['timestamp'].astype(int)
    ecg_vs_times = raw_df[['heartbeat', 'timestamp', 'potential_difference']]

    # Class label -1 marks abnormal heartbeats; expose it as a boolean flag.
    ecg_vs_series['is_abnormal'] = (ecg_vs_series['class_label'] == -1)
    ecg_vs_series.drop('class_label', axis=1, inplace=True)
    return (ecg_vs_times, ecg_vs_series)
125+
126+
42127
# Design patterns used: Flyweight, Prototype.
43128
class MultiSeries(LoggingHandler):
44129
"""The summary line for a class docstring should fit on one line.
@@ -1167,187 +1252,4 @@ def __repr__(self):
11671252
return (self.__class__.__name__ + '(filtering ' + str(rows_filtered) + '/' + str(rows_total) + ' observations, over ' + str(self._count_series_indices) + ' series x ' + str(self._count_time_indices) + ' timestamps x ' + str(self.count_features) + ' features)')
11681253

11691254

1170-
1171-
1172-
1173-
##################################################
1174-
# For testing
1175-
##################################################
1176-
1177-
1178-
if False:
1179-
1180-
(dummy_vs_times_df, dummy_vs_series_df) = load_dummy_data_df()
1181-
print(dummy_vs_times_df)
1182-
print(dummy_vs_series_df)
1183-
data_dummy = MultiSeries(data_vs_times_df=dummy_vs_times_df, data_vs_series_df=dummy_vs_series_df, time_colname='timestamp', series_id_colnames='series')
1184-
data_dummy.visualise()
1185-
1186-
if False:
1187-
1188-
(boston_vs_times_df, boston_vs_series_df) = load_boston_housing_data_df()
1189-
print(boston_vs_times_df)
1190-
print(boston_vs_series_df)
1191-
data_boston = MultiSeries(data_vs_times_df=boston_vs_times_df, data_vs_series_df=boston_vs_series_df, time_colname='feature', series_id_colnames='sample')
1192-
data_boston.visualise()
1193-
1194-
if False:
1195-
1196-
(weather_vs_times_df, weather_vs_series_df) = load_ramsay_weather_data_dfs()
1197-
data_weather = MultiSeries(data_vs_times_df=weather_vs_times_df, data_vs_series_df=weather_vs_series_df, time_colname='day_of_year', series_id_colnames='weather_station')
1198-
1199-
(growth_vs_times_df, growth_vs_series_df) = load_ramsay_growth_data_dfs()
1200-
growth_vs_series_df['gender'] = growth_vs_series_df['gender'].astype('category')
1201-
growth_vs_series_df = pd.concat([growth_vs_series_df, pd.get_dummies(growth_vs_series_df['gender'])], axis=1)
1202-
data_growth = MultiSeries(data_vs_times_df=growth_vs_times_df, data_vs_series_df=growth_vs_series_df, time_colname='age', series_id_colnames=['gender', 'cohort_id'])
1203-
1204-
#data_growth.get_backward_time_window(5, 18).visualise(filter_value_colnames='height')
1205-
1206-
data_growth.get_backward_time_window(5, 12)._time_uniques_all
1207-
1208-
data_growth.get_forward_time_window(5, 15.5)._time_uniques_all
1209-
1210-
1211-
1212-
1213-
data_weather_v2 = data_weather.get_backward_time_window(5).new_mutable_instance()
1214-
data_weather_v2._data_vs_times_df
1215-
1216-
data_weather_v2._data_vs_times_df.loc[(33,[366]), ['precav', 'tempav']] = [1,2]
1217-
data_weather_v2._data_vs_times_df
1218-
1219-
data_weather_v2.set_time_labelled_values(prediction_series=[34], prediction_features=['precav', 'tempav'], prediction_times=[366], values=[1,2])
1220-
data_weather_v2._data_vs_times_df
1221-
1222-
# new_data_vs_times_df.loc[(series_id, times_list), value_colnames_vs_times]
1223-
#data_weather_v2._data_vs_times_df.loc[(33,366),:] = [1,2]
1224-
#data_weather_v2._data_vs_times_df.sort_index()
1225-
1226-
data_growth_v2 = data_growth.get_backward_time_window(5).new_mutable_instance()
1227-
1228-
1229-
1230-
1231-
if False:
1232-
1233-
data_weather.visualise(title='Weather data')
1234-
data_weather.visualise_means(title='Weather data')
1235-
data_weather.visualise_arrays(include_time_as_feature=True)
1236-
1237-
data_growth.visualise(title='Growth data')
1238-
data_growth.visualise_means(title='Growth data')
1239-
data_growth.visualise_arrays(include_time_as_feature=True)
1240-
1241-
1242-
# Ready for cross-validation
1243-
for (ot, ov) in data_weather.generate_series_folds(series_splitter = KFold(n_splits=5)):
1244-
print('Outer Loop. Training = ' + str(ot._series_id_uniques) + ' / Validation = ' + str(ov._series_id_uniques))
1245-
for (it, iv) in ot.generate_series_folds(series_splitter = KFold(n_splits=5)):
1246-
print('Inner Loop. Training = ' + str(it._series_id_uniques) + ' / Validation = ' + str(iv._series_id_uniques))
1247-
for (st, sv) in it.generate_time_windows(time_splitter = SlidingWindowTimeSeriesSplit(count_timestamps=len(it._time_uniques_all), training_set_size=100, validation_set_size=50, step=50)):
1248-
print('Timeseries Loop. Training = ' + str(st._time_uniques_all) + ' / Validation = ' + str(sv._time_uniques_all))
1249-
1250-
1251-
# Ready for cross-validation
1252-
for (ot, ov) in data_growth.generate_series_folds(series_splitter = KFold(n_splits=5)):
1253-
print('Outer Loop. Training = ' + str(ot._series_id_uniques) + ' / Validation = ' + str(ov._series_id_uniques))
1254-
for (it, iv) in ot.generate_series_folds(series_splitter = KFold(n_splits=5)):
1255-
print('Inner Loop. Training = ' + str(it._series_id_uniques) + ' / Validation = ' + str(iv._series_id_uniques))
1256-
for (st, sv) in it.generate_time_windows(time_splitter = SlidingWindowTimeSeriesSplit(count_timestamps=len(it._time_uniques_all), training_set_size=2, validation_set_size=5, step=5)):
1257-
print('Timeseries Loop. Training = ' + str(st._time_uniques_all) + ' / Validation = ' + str(sv._time_uniques_all))
1258-
1259-
1260-
1261-
1262-
if False:
1263-
1264-
# Data: weather
1265-
(weather_vs_times_df, weather_vs_series_df) = load_ramsay_weather_data_dfs()
1266-
data_weather = MultiSeries(data_vs_times_df=weather_vs_times_df, data_vs_series_df=weather_vs_series_df, time_colname='day_of_year', series_id_colnames='weather_station')
1267-
#data_weather.visualise()
1268-
1269-
# Data: growth
1270-
(growth_vs_times_df, growth_vs_series_df) = load_ramsay_growth_data_dfs()
1271-
growth_vs_series_df['gender'] = growth_vs_series_df['gender'].astype('category')
1272-
growth_vs_series_df = pd.concat([growth_vs_series_df, pd.get_dummies(growth_vs_series_df['gender'])], axis=1)
1273-
data_growth = MultiSeries(data_vs_times_df=growth_vs_times_df, data_vs_series_df=growth_vs_series_df, time_colname='age', series_id_colnames=['gender', 'cohort_id'])
1274-
#data_growth.visualise()
1275-
1276-
input_sliding_window_size = 10
1277-
output_sliding_window_size = 5
1278-
1279-
(a4d_vs_times_windowed_input, a4d_vs_times_windowed_output) = data_growth.select_paired_tabular_windowed_4d_arrays(input_sliding_window_size=input_sliding_window_size, output_sliding_window_size=output_sliding_window_size)
1280-
print('a4d_vs_times_windowed_input.shape = ' + str(a4d_vs_times_windowed_input.shape))
1281-
print('a4d_vs_times_windowed_output.shape = ' + str(a4d_vs_times_windowed_output.shape))
1282-
1283-
(a4d_vs_times_windowed_input, a4d_vs_times_windowed_output) = data_weather.select_paired_tabular_windowed_4d_arrays(input_sliding_window_size=input_sliding_window_size, output_sliding_window_size=output_sliding_window_size)
1284-
print('a4d_vs_times_windowed_input.shape = ' + str(a4d_vs_times_windowed_input.shape))
1285-
print('a4d_vs_times_windowed_output.shape = ' + str(a4d_vs_times_windowed_output.shape))
1286-
1287-
1288-
if False:
1289-
# Data: weather
1290-
(weather_vs_times_df, weather_vs_series_df) = load_ramsay_weather_data_dfs()
1291-
data_weather = MultiSeries(data_vs_times_df=weather_vs_times_df, data_vs_series_df=weather_vs_series_df, time_colname='day_of_year', series_id_colnames='weather_station')
1292-
1293-
data_weather.visualise()
1294-
1295-
weather_vs_times_df[weather_vs_times_df.weather_station == 28].set_index('day_of_year')['tempav'].plot()
1296-
weather_vs_times_df[weather_vs_times_df.weather_station == 28].set_index('day_of_year')['precav'].plot()
1297-
1298-
1299-
1300-
if False:
1301-
# Data: ECG
1302-
(ecg_vs_times_df, ecg_vs_series_df) = load_ecg_data_dfs()
1303-
data_ecg = MultiSeries(data_vs_times_df=ecg_vs_times_df, data_vs_series_df=ecg_vs_series_df, time_colname='timestamp', series_id_colnames='heartbeat')
1304-
1305-
# 133 are normal, the remaining 67 are abnormal.
1306-
ecg_vs_series_df.groupby('is_abnormal').count()
1307-
ecg_vs_times_df['timestamp'].max() # divide into first half (0...47) and second half (48...95)
1308-
1309-
data_ecg.visualise(filter_value_colnames='potential_difference')
1310-
data_ecg.visualise_moments(filter_value_colnames='potential_difference')
1311-
1312-
1313-
1314-
1315-
if False:
1316-
# Data: Starlight
1317-
(starlight_vs_times_df, starlight_vs_series_df) = load_starlight_data_dfs()
1318-
data_starlight = MultiSeries(data_vs_times_df=starlight_vs_times_df, data_vs_series_df=starlight_vs_series_df, time_colname='folded_time', series_id_colnames='starlight_curve')
1319-
1320-
starlight_vs_times_df['folded_time'].max()
1321-
1322-
data_starlight.visualise(filter_value_colnames='magnitude')
1323-
data_starlight.visualise_moments(filter_value_colnames='magnitude')
1324-
1325-
1326-
1327-
if False:
1328-
1329-
# Data: Power: Multiple locations & days
1330-
#(power_vs_times_df, power_vs_series_df) = load_power_data_dfs(power_filename='multiple_locations_multiple_days.csv' )
1331-
(power_vs_times_df, power_vs_series_df) = load_power_data_multiple_locations_multiple_days_dfs()
1332-
data_power_multiple_locations_multiple_days = MultiSeries(data_vs_times_df=power_vs_times_df, data_vs_series_df=power_vs_series_df, time_colname='half_hour', series_id_colnames='series_id')
1333-
data_power_multiple_locations_multiple_days.visualise(title='Multiple locations & days')
1334-
data_power_multiple_locations_multiple_days.visualise_moments(title='Multiple locations & days')
1335-
1336-
# Data: Power: One day, multiple locations
1337-
#(power_vs_times_df, power_vs_series_df) = load_power_data_dfs(power_filename='one_day_multiple_locations.csv' )
1338-
(power_vs_times_df, power_vs_series_df) = load_power_data_one_day_multiple_locations_dfs()
1339-
data_power_one_day_multiple_locations = MultiSeries(data_vs_times_df=power_vs_times_df, data_vs_series_df=power_vs_series_df, time_colname='half_hour', series_id_colnames='series_id')
1340-
data_power_one_day_multiple_locations.visualise(title='One day, multiple locations')
1341-
data_power_one_day_multiple_locations.visualise_moments(title='One day, multiple locations')
1342-
1343-
# Data: Power: One location, multiple days
1344-
#(power_vs_times_df, power_vs_series_df) = load_power_data_dfs(power_filename='one_location_multiple_days.csv' )
1345-
(power_vs_times_df, power_vs_series_df) = load_power_data_one_location_multiple_days_dfs()
1346-
data_power_one_location_multiple_days = MultiSeries(data_vs_times_df=power_vs_times_df, data_vs_series_df=power_vs_series_df, time_colname='half_hour', series_id_colnames='series_id')
1347-
data_power_one_location_multiple_days.visualise(title='One location, multiple days')
1348-
data_power_one_location_multiple_days.visualise_moments(title='One location, multiple days')
1349-
1350-
1351-
1352-
1353-
1255+

build/lib/pysf/errors.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -343,12 +343,4 @@ def __setstate__(self, state):
343343
self.initLogger()
344344

345345

346-
##################################################
347-
# For testing
348-
##################################################
349-
350-
351-
# Old testing code removed.
352-
353-
354-
346+

0 commit comments

Comments
 (0)