LAMDA-NJU · xuyxu · May 27, 2021 · May 27, 2021 · May 30, 2021 · May 31, 2021
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -30,6 +30,7 @@ Version 0.1.*
 .. |Fix| replace:: :raw-html:`<span class="badge badge-danger">Fix</span>` :raw-latex:`{\small\sc [Fix]}`
 .. |API| replace:: :raw-html:`<span class="badge badge-warning">API Change</span>` :raw-latex:`{\small\sc [API Change]}`
 
+- |Feature| |API| add :obj:`TimeSeriesCascadeForestClassifier` for time series classification (`#83 <https://github.com/LAMDA-NJU/Deep-Forest/pull/83>`__) @xuyxu
 - |Fix| fix missing functionality of :meth:`_set_n_trees` @xuyxu
 - |Fix| |API| add docstrings for parameter ``bin_type`` (`#74 <https://github.com/LAMDA-NJU/Deep-Forest/pull/74>`__) @xuyxu
 - |Feature| |API| recover the parameter ``min_samples_split`` (`#73 <https://github.com/LAMDA-NJU/Deep-Forest/pull/73>`__) @xuyxu

diff --git a/build_tools/requirements.txt b/build_tools/requirements.txt
@@ -7,3 +7,4 @@ pytest-cov
 lightgbm
 xgboost
 cython>=0.28.5
+pandas>=0.25.0
diff --git a/deepforest/__init__.py b/deepforest/__init__.py
@@ -1,4 +1,8 @@
-from .cascade import CascadeForestClassifier, CascadeForestRegressor
+from .cascade import (
+    CascadeForestClassifier,
+    CascadeForestRegressor,
+    TimeSeriesCascadeForestClassifier,
+)
 from .forest import RandomForestClassifier, RandomForestRegressor
 from .forest import ExtraTreesClassifier, ExtraTreesRegressor
 from .tree import DecisionTreeClassifier, DecisionTreeRegressor
@@ -8,6 +12,7 @@
 __all__ = [
     "CascadeForestClassifier",
     "CascadeForestRegressor",
+    "TimeSeriesCascadeForestClassifier",
     "RandomForestClassifier",
     "RandomForestRegressor",
     "ExtraTreesClassifier",

diff --git a/deepforest/cascade.py b/deepforest/cascade.py
@@ -6,6 +6,7 @@
 import time
 import numbers
 import numpy as np
+import pandas as pd
 from abc import ABCMeta, abstractmethod
 from sklearn.preprocessing import LabelEncoder
 from sklearn.utils.multiclass import type_of_target
@@ -202,6 +203,34 @@
     return predictor
 
 
+def _build_time_series_feature_transformer():
+    """Build the time series feature transformer from tsfresh.
+
+    Raises ``NotImplementedError`` on Windows and ``ModuleNotFoundError``
+    if tsfresh cannot be imported.
+    """
+    import platform
+    if platform.system() == "Windows":
+        msg = (
+            "TimeSeriesCascadeForestClassifier currently is not available"
+            " on Windows due to the parallelization issue of tsfresh."
+        )
+        raise NotImplementedError(msg)
+
+    try:
+        __import__("tsfresh")
+    except ModuleNotFoundError as err:
+        msg = (
+            "Cannot load the module tsfresh when building the feature"
+            " transformer. Please make sure that tsfresh is installed."
+        )
+        raise ModuleNotFoundError(msg) from err
+
+    from tsfresh.transformers import RelevantFeatureAugmenter
+
+    return RelevantFeatureAugmenter(column_id="id", column_sort="time")
+
+
 __classifier_model_doc = """
     Parameters
     ----------
@@ -1699,3 +1728,169 @@
                 _y = _utils.merge_proba(X_aug_test_, self.n_outputs_)
 
         return _y
+
+
+class TimeSeriesCascadeForestClassifier(ClassifierMixin):
+    """Deep forest classifier for time series stored in a flat DataFrame.
+
+    Extracts tsfresh features, then fits a :class:`CascadeForestClassifier`.
+    """
+
+    def __init__(
+        self,
+        n_bins=255,
+        bin_subsample=200000,
+        bin_type="percentile",
+        max_layers=20,
+        criterion="gini",
+        n_estimators=2,
+        n_trees=100,
+        max_depth=None,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        use_predictor=False,
+        predictor="forest",
+        predictor_kwargs={},
+        backend="custom",
+        n_tolerant_rounds=2,
+        delta=1e-5,
+        partial_mode=False,
+        n_jobs=None,
+        random_state=None,
+        verbose=1,
+    ):
+        # Feature transformer
+        self.transformer = _build_time_series_feature_transformer()
+
+        # Classifier
+        self.classifier = CascadeForestClassifier(
+            n_bins=n_bins,
+            bin_subsample=bin_subsample,
+            bin_type=bin_type,
+            max_layers=max_layers,
+            criterion=criterion,
+            n_estimators=n_estimators,
+            n_trees=n_trees,
+            max_depth=max_depth,
+            min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf,
+            use_predictor=use_predictor,
+            predictor=predictor,
+            predictor_kwargs=predictor_kwargs,
+            backend=backend,
+            n_tolerant_rounds=n_tolerant_rounds,
+            delta=delta,
+            partial_mode=partial_mode,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose,
+        )
+
+        self.verbose = verbose
+
+    def _check_input(self, X, y=None):
+        """Validate X (and y when training) or raise ``ValueError``."""
+        if not isinstance(X, pd.DataFrame):
+            msg = "X should be a pandas DataFrame, but got {} instead."
+            raise ValueError(msg.format(type(X)))
+
+        for column in ("id", "time"):
+            if column not in X.columns:
+                msg = "X should have one column named: `{}`."
+                raise ValueError(msg.format(column))
+
+        # Reject empty input explicitly; `length[0]` would raise IndexError
+        length = X.groupby(["id"]).size().to_numpy()
+        if length.size == 0:
+            raise ValueError("X should contain at least one time series.")
+
+        # Check same time series length
+        if not (length == length[0]).all():
+            raise ValueError("All time series should have the same length.")
+
+        # Additional checks for training data
+        if y is not None:
+            if not isinstance(y, pd.Series):
+                msg = "y should be a pandas Series, but got {} instead."
+                raise ValueError(msg.format(type(y)))
+
+            # Check same time series id; sizes are compared first because
+            # pandas cannot compare an index with an array of another length
+            ids = X["id"].unique()
+            if len(y) != len(ids) or not (y.index == ids).all():
+                raise ValueError("Mismatch of time series IDs in X and y.")
+
+        self.length = length[0]
+
+    def fit(self, X, y, sample_weight=None):
+        """
+        Build a deep forest using the training time series for classification.
+
+        Parameters
+        ----------
+        X : :obj:`pandas.DataFrame` of shape (n_samples * length, n_series)
+            The input time series in a flat DataFrame. The columns ``"id"`` and
+            ``"time"`` are used to locate the `time`-th record of the `id`-th
+            time series. Internally, it will be transformed into non-ordinal
+            numerical features using :mod:`tsfresh`.
+        y : :obj:`pandas.Series` of shape (n_samples,)
+            The class labels of input time series.
+        sample_weight : :obj:`numpy.ndarray` of shape (n_samples,), default=None
+            Sample weights. If ``None``, then samples are equally weighted.
+        """
+        self._check_input(X, y)
+        dummy_X = pd.DataFrame(index=y.index)
+        self.transformer.set_timeseries_container(X)
+
+        if self.verbose > 0:
+            print("{} Transforming time series".format(_utils.ctime()))
+
+        X_with_features = self.transformer.fit_transform(dummy_X, y).to_numpy()
+        self.classifier.fit(X_with_features, y, sample_weight)
+
+    def predict_proba(self, X):
+        """
+        Predict class probabilities for time series X.
+
+        Parameters
+        ----------
+        X : :obj:`pandas.DataFrame` of shape (n_samples * length, n_series)
+            The input time series in a flat DataFrame. The columns ``"id"`` and
+            ``"time"`` are used to locate the `time`-th record of the `id`-th
+            time series. Internally, it will be transformed into non-ordinal
+            numerical features using :mod:`tsfresh`.
+
+        Returns
+        -------
+        proba : :obj:`numpy.ndarray` of shape (n_samples, n_classes)
+            The class probabilities of the input time series.
+        """
+        self._check_input(X)
+        # Index by series id as in `fit`; an empty frame selects no series
+        dummy_X = pd.DataFrame(index=X["id"].unique())
+        self.transformer.set_timeseries_container(X)
+
+        if self.verbose > 0:
+            print("{} Transforming time series".format(_utils.ctime()))
+
+        X_with_features = self.transformer.transform(dummy_X).to_numpy()
+        return self.classifier.predict_proba(X_with_features)
+
+    def predict(self, X):
+        """
+        Predict class for time series X.
+
+        Parameters
+        ----------
+        X : :obj:`pandas.DataFrame` of shape (n_samples * length, n_series)
+            The input time series, laid out as for :meth:`predict_proba`.
+
+        Returns
+        -------
+        y : :obj:`numpy.ndarray` of shape (n_samples,)
+            The predicted classes.
+        """
+        proba = self.predict_proba(X)
+        # NOTE(review): argmax yields column indices of `proba`; confirm these
+        # equal the class labels when labels are not 0, ..., n_classes - 1.
+        return np.argmax(proba, axis=1)