Improvements on the method to filter correlated features.

Ensure backward compatibility with previous implementations. Add support for correlation coefficients other than Pearson. Add a dedicated unit test script.
fraunhoferportugal · Sep 11, 2024 · c001ee6 · c001ee6
1 parent f3fd5e4
commit c001ee6
Show file tree

Hide file tree

Showing 4 changed files with 122 additions and 16 deletions.
diff --git a/.flake8 b/.flake8
@@ -29,13 +29,13 @@ statistics = True
 # SC = flake8-spellcheck
 ignore = E203, E211, E265, E501, E999, F401, F821, W503, W505, SC100, SC200, C400, C401, C402, B008, E800, E741, F403, F405, C901, B028, E226
 max-line-length = 120
-max-doc-length = 120
+max-doc-length = 80
 import-order-style = google
 docstring-convention = google
 inline-quotes = "
 strictness=short
 dictionaries=en_US,python,technical,pandas
-min-python-version = 3.7.0
+min-python-version = 3.8.0
 exclude = .git,.tox,.nox,venv,.venv,.venv-docs,.venv-dev,.venv-note,.venv-dempy,docs,test
 max-complexity = 10
 #spellcheck-targets=comments
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
@@ -1,4 +1,3 @@
-pre-commit==3.7.0
-matplotlib==3.8.2
-seaborn==0.13.2
-neurokit==0.2.10
+ipython==8.27.0
+matplotlib==3.9.2
+pre-commit==3.8.0
diff --git a/tests/test_signal_processing.py b/tests/test_signal_processing.py
@@ -0,0 +1,77 @@
+"""A test suite for the signal processing methods.
+
+The name will likely change after the major refactor.
+"""
+
+import unittest
+
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_classification
+
+import tsfel
+
+
+class SignalProcessingTestCase(unittest.TestCase):
+ """Unit tests for signal processing methods."""
+
+ def test_univariate_correlated_features(self):
+ X = self.generate_univariate_correlated_dataset()
+ features_name_to_drop, filtered_X = tsfel.correlated_features(X, threshold=0.90, drop_correlated=True)
+ np.testing.assert_equal(
+ (features_name_to_drop, np.shape(filtered_X)),
+ (["Feature_4", "Feature_5", "Feature_7", "Feature_8", "Feature_9"], (1000, 5)),
+ )
+
+ def test_no_correlated_features(self):
+ X = self.generate_univariate_correlated_dataset(num_features=5, num_redundant=0)
+ features_name_to_drop = tsfel.correlated_features(X, threshold=0.90, drop_correlated=False)
+ self.assertEqual(features_name_to_drop, [])
+
+ def test_empty_dataframe(self):
+ X = pd.DataFrame()
+ features_name_to_drop = tsfel.correlated_features(X, threshold=0.90, drop_correlated=False)
+ self.assertEqual(features_name_to_drop, [])
+
+ def test_different_thresholds(self):
+ X = self.generate_univariate_correlated_dataset()
+ thresholds = [0.05, 0.5, 0.95]
+ expected_features_to_remove = [5, 5, 3]
+
+ for threshold, expected_n_features in zip(thresholds, expected_features_to_remove):
+ features_name_to_drop = tsfel.correlated_features(X, threshold=threshold, drop_correlated=False)
+ self.assertIsInstance(features_name_to_drop, list)
+ self.assertEqual(len(features_name_to_drop), expected_n_features)
+
+ @staticmethod
+ def generate_univariate_correlated_dataset(num_features: int = 10, num_redundant: int = 5) -> pd.DataFrame:
+ """Generate a synthetic dataset with correlated features.
+
+ Parameters
+ ----------
+ num_features: int
+ Number of features in the dataset.
+ num_redundant: int
+ Number of redundant features that are correlated.
+
+ Returns
+ -------
+ pd.DataFrame: DataFrame with the generated dataset.
+ """
+ # Generate synthetic data
+ X, _ = make_classification(
+ n_samples=1000,
+ n_features=num_features,
+ n_redundant=num_redundant,
+ random_state=42,
+ )
+
+ # Create a DataFrame with appropriate column names
+ column_names = [f"Feature_{i}" for i in range(1, num_features + 1)]
+ X_df = pd.DataFrame(X, columns=column_names)
+
+ return X_df
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tsfel/utils/signal_processing.py b/tsfel/utils/signal_processing.py
@@ -1,3 +1,5 @@
+from typing import List, Tuple, Union
+
 import numpy as np
 import pandas as pd
 from scipy.interpolate import interp1d
@@ -74,21 +76,46 @@ def merge_time_series(data, fs_resample, time_unit):
  return pd.DataFrame(data=data_new[:, 1:], columns=header_values[1:])
 
 
-def correlated_features(features, threshold=0.95, drop_correlated=True):
- """Compute pairwise correlation of features using pearson method.
+def correlated_features(
+ features: pd.DataFrame,
+ threshold: float = 0.95,
+ method: str = "pearson",
+ drop_correlated: bool = False,
+) -> Union[List[str], Tuple[List[str], pd.DataFrame]]:
+ """Identify and optionally remove highly correlated features from a
+ DataFrame.
+
+ This function computes the pairwise correlation of features using
+ pandas.DataFrame.corr() and identifies features that have an absolute
+ value of the correlation coefficient higher than the specified threshold.
+ The correlation methods supported by pandas, such as 'pearson',
+ 'spearman', or 'kendall', can be selected with the `method` parameter.
+
+ .. deprecated:: 0.1.11
+
+ tsfel.correlated_features will be deprecated in tsfel 0.1.11 and will be
+ removed in an upcoming release. It will be replaced by a future
+ DropCorrelated feature class using fit and transform logic.
 
  Parameters
  ----------
- features : DataFrame
- features
- threshold :
- correlation value for removing highly correlated features
+ features : pd.DataFrame
+ A DataFrame containing the feature data.
+ threshold : float
+ The correlation value for removing highly correlated features.
+ method : str
+ Method to compute correlation. Must be one of {'pearson', 'kendall', 'spearman'}.
+ drop_correlated : bool
+ If True, drop the highly correlated features from the DataFrame.
+
  Returns
  -------
- DataFrame
- correlated features names
+ Union[List[str], Tuple[List[str], pd.DataFrame]]
+ - If `drop_correlated` is False, a list with the names of the highly correlated features.
+ - If `drop_correlated` is True, a tuple containing the list of dropped feature names and the updated DataFrame with those features removed.
  """
- corr_matrix = features.corr().abs()
+
+ corr_matrix = features.corr(method=method).abs()
  # Select upper triangle of correlation matrix
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
  # Find index and column name of features with correlation greater than 0.95
@@ -97,4 +124,7 @@ def correlated_features(features, threshold=0.95, drop_correlated=True):
  if drop_correlated:
  features.drop(to_drop, axis=1, inplace=True)
 
- return to_drop, features
+ return to_drop, features
+
+ else:
+ return to_drop