Merge pull request #163 from fraunhoferportugal/correlated_features

Option to remove correlated features
fraunhoferportugal · Sep 11, 2024 · 222cde2 · 222cde2
2 parents b4ddb42 + c001ee6
commit 222cde2
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 23 deletions.
diff --git a/.flake8 b/.flake8
@@ -29,13 +29,13 @@ statistics = True
 # SC = flake8-spellcheck
 ignore = E203, E211, E265, E501, E999, F401, F821, W503, W505, SC100, SC200, C400, C401, C402, B008, E800, E741, F403, F405, C901, B028, E226
 max-line-length = 120
-max-doc-length = 120
+max-doc-length = 80
 import-order-style = google
 docstring-convention = google
 inline-quotes = "
 strictness=short
 dictionaries=en_US,python,technical,pandas
-min-python-version = 3.7.0
+min-python-version = 3.8.0
 exclude = .git,.tox,.nox,venv,.venv,.venv-docs,.venv-dev,.venv-note,.venv-dempy,docs,test
 max-complexity = 10
 #spellcheck-targets=comments
diff --git a/notebooks/TSFEL_HAR_Example.ipynb b/notebooks/TSFEL_HAR_Example.ipynb
@@ -200,8 +200,7 @@
    "outputs": [],
    "source": [
     "# Highly correlated features are removed\n",
-    "corr_features = tsfel.correlated_features(X_train)\n",
-    "X_train.drop(corr_features, axis=1, inplace=True)\n",
+    "corr_features, X_train = tsfel.correlated_features(X_train, drop_correlated=True)\n",
     "X_test.drop(corr_features, axis=1, inplace=True)\n",
     "\n",
     "# Remove low variance features\n",
@@ -281,6 +280,11 @@
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "tsfel",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
@@ -291,7 +295,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/TSFEL_SMARTWATCH_HAR_Example.ipynb b/notebooks/TSFEL_SMARTWATCH_HAR_Example.ipynb
@@ -367,8 +367,7 @@
    "outputs": [],
    "source": [
     "# Highly correlated features are removed\n",
-    "corr_features = tsfel.correlated_features(x_train_feat)\n",
-    "x_train_feat.drop(corr_features, axis=1, inplace=True)\n",
+    "corr_features, x_train_feat = tsfel.correlated_features(x_train_feat, drop_correlated=True)\n",
     "x_test_feat.drop(corr_features, axis=1, inplace=True)\n",
     "\n",
     "# Remove low variance features\n",

diff --git a/notebooks/TSFEL_predicting_NormalVsPathologicalknee.ipynb b/notebooks/TSFEL_predicting_NormalVsPathologicalknee.ipynb
@@ -286,8 +286,7 @@
    "outputs": [],
    "source": [
     "# Highly correlated features are removed\n",
-    "corr_features = tsfel.correlated_features(X_train)\n",
-    "X_train.drop(corr_features, axis=1, inplace=True)\n",
+    "corr_features, X_train = tsfel.correlated_features(X_train, drop_correlated=True)\n",
     "X_test.drop(corr_features, axis=1, inplace=True)\n",
     "\n",
     "# Remove low variance features\n",

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
@@ -1,4 +1,3 @@
-pre-commit==3.7.0
-matplotlib==3.8.2
-seaborn==0.13.2
-neurokit==0.2.10
+ipython==8.27.0
+matplotlib==3.9.2
+pre-commit==3.8.0
diff --git a/tests/test_signal_processing.py b/tests/test_signal_processing.py
@@ -0,0 +1,77 @@
+"""A test suite for the signal processing methods.
+
+The name will likely change after the major refactor.
+"""
+
+import unittest
+
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_classification
+
+import tsfel
+
+
+class SignalProcessingTestCase(unittest.TestCase):
+    """Unit tests for signal processing methods."""
+
+    def test_univariate_correlated_features(self):
+        X = self.generate_univariate_correlated_dataset()
+        features_name_to_drop, filtered_X = tsfel.correlated_features(X, threshold=0.90, drop_correlated=True)
+        np.testing.assert_equal(
+            (features_name_to_drop, np.shape(filtered_X)),
+            (["Feature_4", "Feature_5", "Feature_7", "Feature_8", "Feature_9"], (1000, 5)),
+        )
+
+    def test_no_correlated_features(self):
+        X = self.generate_univariate_correlated_dataset(num_features=5, num_redundant=0)
+        features_name_to_drop = tsfel.correlated_features(X, threshold=0.90, drop_correlated=False)
+        self.assertEqual(features_name_to_drop, [])
+
+    def test_empty_dataframe(self):
+        X = pd.DataFrame()
+        features_name_to_drop = tsfel.correlated_features(X, threshold=0.90, drop_correlated=False)
+        self.assertEqual(features_name_to_drop, [])
+
+    def test_different_thresholds(self):
+        X = self.generate_univariate_correlated_dataset()
+        thresholds = [0.05, 0.5, 0.95]
+        expected_features_to_remove = [5, 5, 3]
+
+        for threshold, expected_n_features in zip(thresholds, expected_features_to_remove):
+            features_name_to_drop = tsfel.correlated_features(X, threshold=threshold, drop_correlated=False)
+            self.assertIsInstance(features_name_to_drop, list)
+            self.assertEqual(len(features_name_to_drop), expected_n_features)
+
+    @staticmethod
+    def generate_univariate_correlated_dataset(num_features: int = 10, num_redundant: int = 5) -> pd.DataFrame:
+        """Generate a synthetic dataset with correlated features.
+
+        Parameters
+        ----------
+            num_features: int
+                Number of features in the dataset.
+            num_redundant: int
+                Number of redundant features that are correlated.
+
+        Returns
+        -------
+            pd.DataFrame: DataFrame with the generated dataset.
+        """
+        # Generate synthetic data
+        X, _ = make_classification(
+            n_samples=1000,
+            n_features=num_features,
+            n_redundant=num_redundant,
+            random_state=42,
+        )
+
+        # Create a DataFrame with appropriate column names
+        column_names = [f"Feature_{i}" for i in range(1, num_features + 1)]
+        X_df = pd.DataFrame(X, columns=column_names)
+
+        return X_df
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tsfel/utils/signal_processing.py b/tsfel/utils/signal_processing.py
@@ -1,3 +1,5 @@
+from typing import List, Tuple, Union
+
 import numpy as np
 import pandas as pd
 from scipy.interpolate import interp1d
@@ -74,24 +76,55 @@ def merge_time_series(data, fs_resample, time_unit):
     return pd.DataFrame(data=data_new[:, 1:], columns=header_values[1:])
 
 
-def correlated_features(features, threshold=0.95):
-    """Compute pairwise correlation of features using pearson method.
+def correlated_features(
+    features: pd.DataFrame,
+    threshold: float = 0.95,
+    method: str = "pearson",
+    drop_correlated: bool = False,
+) -> Union[List[str], Tuple[List[str], pd.DataFrame]]:
+    """Identify and optionally remove highly correlated features from a
+    DataFrame.
+
+    This function computes the pairwise correlation of features using
+    pandas.DataFrame.corr() and identifies features whose absolute
+    correlation coefficient is higher than the specified threshold. The
+    correlation methods supported by pandas, such as 'pearson', 'spearman',
+    or 'kendall', can be selected through the `method` parameter.
+
+    .. deprecated:: 0.1.11
+
+        tsfel.correlated_features will be deprecated in tsfel 0.1.11 and will be
+        removed in a later release. It will be replaced by a future
+        DropCorrelated feature class using fit and transform logic.
 
     Parameters
     ----------
-    features : DataFrame
-        features
-    threshold :
-        correlation value for removing highly correlated features
+    features : pd.DataFrame
+        A DataFrame containing the feature data.
+    threshold : float
+        The correlation value for removing highly correlated features.
+    method : str
+        Method to compute correlation. Must be one of {'pearson', 'kendall', 'spearman'}.
+    drop_correlated : bool
+        If True, drop the highly correlated features from the DataFrame.
+
     Returns
     -------
-    DataFrame
-        correlated features names
+    Union[List[str], Tuple[List[str], pd.DataFrame]]
+        - If `drop_correlated` is False, a list of names of highly correlated features.
+        - If `drop_correlated` is True, a tuple containing the list of dropped feature names and the input DataFrame, modified in place, with those features removed.
     """
-    corr_matrix = features.corr().abs()
+
+    corr_matrix = features.corr(method=method).abs()
     # Select upper triangle of correlation matrix
     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
     # Find index and column name of features with correlation greater than 0.95
     to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
 
-    return to_drop
+    if drop_correlated:
+        features.drop(to_drop, axis=1, inplace=True)
+
+        return to_drop, features
+
+    else:
+        return to_drop