sam31415 · pythagorea1 · Mar 17, 2020 · Mar 17, 2020 · Mar 19, 2020 · Mar 19, 2020
diff --git a/README.rst b/README.rst
@@ -19,7 +19,7 @@ Installation
 
 timeseriescv can be installed using pip:
 
-    >>> pip install timeseriescv
+    >>> pip install git+https://github.com/pythagorea1/timeseriescv
 
 Content
 ~~~~~~~
@@ -29,15 +29,31 @@ For now the package contains two main classes handling cross-validation:
 * ``PurgedWalkForwardCV``: Walk-forward cross-validation with purging.
 * ``CombPurgedKFoldCV``: Combinatorial cross-validation with purging and embargoing.
 
+Quick Start for CPCV
+~~~~~~~~~~~~
+
+    >>> from timeseriescv.cross_validation import CombPurgedKFoldCV as CPCV
+    >>> cpcv = CPCV(n_splits=10, n_test_splits=2)
+    >>> for (train_set, test_set) in cpcv.split(df):
+    >>>     train_X = df.iloc[train_set]
+    >>>     test_X = df.iloc[test_set]
+
+Quick Start for WalkForward
+~~~~~~~~~~~~
+    >>> from timeseriescv.walkforward import PurgedWalkForwardCV as WF
+    >>> wf = WF(n_splits=10, n_test_splits=2, purge_count=10)
+    >>> for (train_set, test_set) in wf.split(df):
+    >>>     train_X = df.iloc[train_set]
+    >>>     test_X = df.iloc[test_set] 
+
+
+
 Remarks concerning the API
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The API is as similar to the scikit-learn API as possible. Like the scikit-learn cross-validation classes, the ``split``
 method is a generator that yields a pair of numpy arrays containing the positional indices of the samples in the train
 and validation set, respectively. The main differences with the scikit-learn API are:
 
-* The ``split`` method takes as arguments not only the predictor values ``X``, but also the prediction times ``pred_times`` and the evaluation times ``eval_times`` of each sample.
-* To stay as close to the scikit-learn API as possible, this data is passed as separate parameters. But in order to ensure that they are properly aligned, ``X``, ``pred_times`` and ``eval_times`` are required to be pandas DataFrames/Series sharing the same index.
-
 Check the docstrings of the cross-validation classes for more information.
 
diff --git a/setup.py b/setup.py
@@ -15,9 +15,9 @@ def readme():
           'Topic :: Scientific/Engineering',
       ],
       keywords='machine-learning cross-validation scikit-learn time-series',
-      url='https://github.com/sam31415/timeseriescv',
-      author='Samuel Monnier',
-      author_email='samuel.monnier@gmail.com',
+      url='https://github.com/pythagorea1/timeseriescv',
+      author='Kakyo Okina',
+      author_email='kakyo@alpacadb.com',
       license='MIT',
       packages=['timeseriescv'],
       install_requires=[

diff --git a/timeseriescv/core.py b/timeseriescv/core.py
@@ -0,0 +1,116 @@
+import numpy as np
+import pandas as pd
+
+from abc import abstractmethod
+from typing import Iterable, Tuple, List
+
+# fork from https://github.com/sam31415/timeseriescv
+
+
+class BaseTimeSeriesCrossValidator:
+    """
+    Abstract class for time series cross-validation.
+    Time series cross-validation requires each sample has a prediction time pred_time,
+    at which the features are used to predict the response,
+    and an evaluation time eval_time, at which the response is known and the error can be computed.
+    Importantly, it means that unlike in standard sklearn cross-validation,
+    the samples X, response y,
+    pred_times and eval_times must all be pandas dataframe/series having the same index.
+    It is also assumed that the
+    samples are time-ordered with respect to the prediction time (i.e. pred_times is non-decreasing).
+
+    Parameters
+    ----------
+    n_splits : int, default=10
+        Number of folds. Must be at least 2.
+
+    """
+
+    def __init__(self, n_splits=10):
+        n_splits = int(n_splits)
+        self.n_splits = n_splits
+        self.pred_times = None
+        self.eval_times = None
+        self.indices = None
+
+    @abstractmethod
+    def split(self, X: pd.DataFrame):
+        self.indices = np.arange(X.shape[0])
+        self.eval_times = pd.Series(X.index)
+        self.pred_times = pd.Series(X.index)
+
+
+def embargo(cv: BaseTimeSeriesCrossValidator, train_indices: np.ndarray,
+            test_indices: np.ndarray, test_fold_end: int) -> np.ndarray:
+    """
+    Apply the embargo procedure to part of the train set.
+
+    This amounts to dropping the train set samples whose prediction time occurs within self.embargo_dt of the test
+    set sample evaluation times. This method applies the embargo only to the part of the training set immediately
+    following the end of the test set determined by test_fold_end.
+
+    Parameters
+    ----------
+    cv: Cross-validation class
+        Needs to have the attributes cv.pred_times, cv.eval_times, cv.embargo_dt and cv.indices.
+
+    train_indices: np.ndarray
+        A numpy array containing all the indices of the samples currently included in the train set.
+
+    test_indices : np.ndarray
+        A numpy array containing all the indices of the samples in the test set.
+
+    test_fold_end : int
+        Index corresponding to the end of a test set block.
+
+    Returns
+    -------
+    train_indices: np.ndarray
+        The same array, with the indices subject to embargo removed.
+
+    """
+    if not hasattr(cv, 'embargo_td'):
+        raise ValueError("The passed cross-validation object should have a member cv.embargo_td defining the embargo"
+                         "time.")
+    last_test_eval_time = cv.eval_times.iloc[test_indices[:test_fold_end]].max()
+    min_train_index = len(cv.pred_times[cv.pred_times <= last_test_eval_time + cv.embargo_td])
+    if min_train_index < cv.indices.shape[0]:
+        allowed_indices = np.concatenate((cv.indices[:test_fold_end], cv.indices[min_train_index:]))
+        train_indices = np.intersect1d(train_indices, allowed_indices)
+    return train_indices
+
+
+def purge(cv: BaseTimeSeriesCrossValidator, train_indices: np.ndarray,
+          test_fold_start: int, test_fold_end: int) -> np.ndarray:
+    """
+    Purge part of the train set.
+
+    Given a left boundary index test_fold_start of the test set, this method removes from the train set all the
+    samples whose evaluation time is posterior to the prediction time of the first test sample after the boundary.
+
+    Parameters
+    ----------
+    cv: Cross-validation class
+        Needs to have the attributes cv.pred_times, cv.eval_times and cv.indices.
+
+    train_indices: np.ndarray
+        A numpy array containing all the indices of the samples currently included in the train set.
+
+    test_fold_start : int
+        Index corresponding to the start of a test set block.
+
+    test_fold_end : int
+        Index corresponding to the end of the same test set block.
+
+    Returns
+    -------
+    train_indices: np.ndarray
+        A numpy array containing the train indices purged at test_fold_start.
+
+    """
+    time_test_fold_start = cv.pred_times.iloc[test_fold_start]
+    # The train indices before the start of the test fold, purged.
+    train_indices_1 = np.intersect1d(train_indices, cv.indices[cv.eval_times < time_test_fold_start])
+    # The train indices after the end of the test fold.
+    train_indices_2 = np.intersect1d(train_indices, cv.indices[test_fold_end:])
+    return np.concatenate((train_indices_1, train_indices_2))