Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Installation

timeseriescv can be installed using pip:

>>> pip install timeseriescv
>>> pip install git+https://github.com/pythagorea1/timeseriescv

Content
~~~~~~~
Expand All @@ -29,15 +29,31 @@ For now the package contains two main classes handling cross-validation:
* ``PurgedWalkForwardCV``: Walk-forward cross-validation with purging.
* ``CombPurgedKFoldCV``: Combinatorial cross-validation with purging and embargoing.

Quick Start for CPCV
~~~~~~~~~~~~~~~~~~~~

>>> from timeseriescv.cross_validation import CombPurgedKFoldCV as CPCV
>>> cpcv = CPCV(n_splits=10, n_test_splits=2)
>>> for (train_set, test_set) in cpcv.split(df):
...     train_X = df.iloc[train_set]
...     test_X = df.iloc[test_set]

Quick Start for WalkForward
~~~~~~~~~~~~~~~~~~~~~~~~~~~
>>> from timeseriescv.walkforward import PurgedWalkForwardCV as WF
>>> wf = WF(n_splits=10, n_test_splits=2, purge_count=10)
>>> for (train_set, test_set) in wf.split(df):
...     train_X = df.iloc[train_set]
...     test_X = df.iloc[test_set]



Remarks concerning the API
~~~~~~~~~~~~~~~~~~~~~~~~~~

The API is as similar to the scikit-learn API as possible. Like the scikit-learn cross-validation classes, the ``split``
method is a generator that yields a pair of numpy arrays containing the positional indices of the samples in the train
and validation set, respectively. The main differences with the scikit-learn API are:

* The ``split`` method takes as arguments not only the predictor values ``X``, but also the prediction times ``pred_times`` and the evaluation times ``eval_times`` of each sample.
* To stay as close to the scikit-learn API as possible, this data is passed as separate parameters. But in order to ensure that they are properly aligned, ``X``, ``pred_times`` and ``eval_times`` are required to be pandas DataFrames/Series sharing the same index.

Check the docstrings of the cross-validation classes for more information.

6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ def readme():
'Topic :: Scientific/Engineering',
],
keywords='machine-learning cross-validation scikit-learn time-series',
url='https://github.com/sam31415/timeseriescv',
author='Samuel Monnier',
author_email='samuel.monnier@gmail.com',
url='https://github.com/pythagorea1/timeseriescv',
author='Kakyo Okina',
author_email='kakyo@alpacadb.com',
license='MIT',
packages=['timeseriescv'],
install_requires=[
Expand Down
116 changes: 116 additions & 0 deletions timeseriescv/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import numpy as np
import pandas as pd

from abc import abstractmethod
from typing import Iterable, Tuple, List

# fork from https://github.com/sam31415/timeseriescv


class BaseTimeSeriesCrossValidator:
    """
    Abstract base class for time series cross-validation.

    Time series cross-validation requires each sample to have a prediction time pred_time, at which the features are
    used to predict the response, and an evaluation time eval_time, at which the response is known and the error can
    be computed.

    In this implementation both time series are derived from the index of the frame passed to ``split``: the index
    values are used as prediction times and as evaluation times alike. It is assumed that the samples are
    time-ordered with respect to the prediction time (i.e. the index of ``X`` is non-decreasing).

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds. Converted with ``int()``. Must be at least 2.

    Attributes
    ----------
    n_splits : int
        Number of folds.

    pred_times : pd.Series or None
        Prediction times of the samples; None until ``split`` has been called.

    eval_times : pd.Series or None
        Evaluation times of the samples; None until ``split`` has been called.

    indices : np.ndarray or None
        Positional indices ``0 .. n_samples - 1`` of the samples; None until ``split`` has been called.

    Raises
    ------
    ValueError
        If n_splits is smaller than 2 once converted to int.

    """

    def __init__(self, n_splits=10):
        n_splits = int(n_splits)
        # A single fold cannot be divided into a train part and a test part.
        if n_splits < 2:
            raise ValueError(f"n_splits must be at least 2, got n_splits={n_splits}.")
        self.n_splits = n_splits
        self.pred_times = None
        self.eval_times = None
        self.indices = None

    # NOTE: the class does not use abc.ABCMeta, so this decorator is only a marker and does not prevent
    # instantiation of the base class.
    @abstractmethod
    def split(self, X: pd.DataFrame):
        """
        Record the bookkeeping data shared by all splitters.

        Stores the positional indices of the rows of X and uses the index of X both as prediction times and as
        evaluation times. Nothing is returned.

        Parameters
        ----------
        X : pd.DataFrame
            Samples; only its number of rows and its index are read here.

        """
        self.indices = np.arange(X.shape[0])
        # Two separate Series objects are built so that the attributes do not alias each other.
        self.eval_times = pd.Series(X.index)
        self.pred_times = pd.Series(X.index)


def embargo(cv: "BaseTimeSeriesCrossValidator", train_indices: np.ndarray,
            test_indices: np.ndarray, test_fold_end: int) -> np.ndarray:
    """
    Apply the embargo procedure to part of the train set.

    This amounts to dropping the train set samples whose prediction time occurs within cv.embargo_td of the test
    set sample evaluation times. This function applies the embargo only to the part of the training set immediately
    following the end of the test set block determined by test_fold_end.

    Parameters
    ----------
    cv: Cross-validation class
        Needs to have the attributes cv.pred_times, cv.eval_times, cv.embargo_td and cv.indices.
        cv.embargo_td is added to an evaluation time, so it must be compatible with the values of cv.eval_times
        (presumably a pandas Timedelta when the times are timestamps).

    train_indices: np.ndarray
        A numpy array containing all the indices of the samples currently included in the train set.

    test_indices : np.ndarray
        A numpy array containing all the indices of the samples in the test set (possibly several blocks).

    test_fold_end : int
        Index corresponding to the end (exclusive) of a test set block.

    Returns
    -------
    train_indices: np.ndarray
        The train indices with the samples subject to embargo removed. When no sample lies after the embargo
        period, the input array is returned unchanged.

    Raises
    ------
    ValueError
        If cv has no attribute embargo_td.

    """
    if not hasattr(cv, 'embargo_td'):
        raise ValueError("The passed cross-validation object should have a member cv.embargo_td defining the "
                         "embargo time.")
    test_indices = np.asarray(test_indices)
    # Select the test samples by VALUE, not by position: test_fold_end is an index into the whole dataset, so a
    # positional slice test_indices[:test_fold_end] would also pick up samples of later test blocks and push the
    # embargo far past this block.
    block_test_indices = test_indices[test_indices < test_fold_end]
    last_test_eval_time = cv.eval_times.iloc[block_test_indices].max()
    # pred_times is assumed non-decreasing, so counting the samples predicted up to the end of the embargo period
    # yields the position of the first sample that may be used for training again.
    min_train_index = len(cv.pred_times[cv.pred_times <= last_test_eval_time + cv.embargo_td])
    if min_train_index < cv.indices.shape[0]:
        # Keep everything up to the end of the test block and everything after the embargo period.
        allowed_indices = np.concatenate((cv.indices[:test_fold_end], cv.indices[min_train_index:]))
        train_indices = np.intersect1d(train_indices, allowed_indices)
    return train_indices


def purge(cv: BaseTimeSeriesCrossValidator, train_indices: np.ndarray,
          test_fold_start: int, test_fold_end: int) -> np.ndarray:
    """
    Purge the train set around one test set block.

    A train sample survives if it satisfies one of two conditions: its evaluation time is strictly earlier than
    the prediction time of the sample at position test_fold_start, or its position is test_fold_end or later.
    All other train samples are dropped.

    Parameters
    ----------
    cv: Cross-validation class
        Needs to have the attributes cv.pred_times, cv.eval_times and cv.indices.

    train_indices: np.ndarray
        A numpy array containing all the indices of the samples currently included in the train set.

    test_fold_start : int
        Index corresponding to the start of a test set block.

    test_fold_end : int
        Index corresponding to the end of the same test set block.

    Returns
    -------
    train_indices: np.ndarray
        The surviving train indices: first those evaluated before the block starts, then those located at or
        after the end of the block.

    """
    block_start_time = cv.pred_times.iloc[test_fold_start]
    candidate_pools = (
        # Samples whose outcome is already known when the test block starts.
        cv.indices[cv.eval_times < block_start_time],
        # Samples positioned at or after the end of the test block.
        cv.indices[test_fold_end:],
    )
    surviving_parts = [np.intersect1d(train_indices, pool) for pool in candidate_pools]
    return np.concatenate(surviving_parts)
Loading