Skip to content

Commit

Permalink
Improvements on the method to filter correlated features.
Browse files Browse the repository at this point in the history
Ensures backward compatibility with previous implementations. Adds correlation coefficients other than Pearson. Adds a dedicated unit test script.
  • Loading branch information
dmfolgado committed Sep 11, 2024
1 parent f3fd5e4 commit c001ee6
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 16 deletions.
4 changes: 2 additions & 2 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ statistics = True
# SC = flake8-spellcheck
ignore = E203, E211, E265, E501, E999, F401, F821, W503, W505, SC100, SC200, C400, C401, C402, B008, E800, E741, F403, F405, C901, B028, E226
max-line-length = 120
max-doc-length = 120
max-doc-length = 80
import-order-style = google
docstring-convention = google
inline-quotes = "
strictness=short
dictionaries=en_US,python,technical,pandas
min-python-version = 3.7.0
min-python-version = 3.8.0
exclude = .git,.tox,.nox,venv,.venv,.venv-docs,.venv-dev,.venv-note,.venv-dempy,docs,test
max-complexity = 10
#spellcheck-targets=comments
7 changes: 3 additions & 4 deletions requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pre-commit==3.7.0
matplotlib==3.8.2
seaborn==0.13.2
neurokit==0.2.10
ipython==8.27.0
matplotlib==3.9.2
pre-commit==3.8.0
77 changes: 77 additions & 0 deletions tests/test_signal_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""A test suite for the signal processing methods.
The name will likely change after the major refactor.
"""

import unittest

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

import tsfel


class SignalProcessingTestCase(unittest.TestCase):
    """Checks for ``tsfel.correlated_features`` on synthetic tabular data."""

    def test_univariate_correlated_features(self):
        """With dropping enabled, five names and a (1000, 5) frame come back."""
        dataset = self.generate_univariate_correlated_dataset()
        dropped_names, reduced = tsfel.correlated_features(dataset, threshold=0.90, drop_correlated=True)
        expected_names = ["Feature_4", "Feature_5", "Feature_7", "Feature_8", "Feature_9"]
        np.testing.assert_equal(
            (dropped_names, np.shape(reduced)),
            (expected_names, (1000, 5)),
        )

    def test_no_correlated_features(self):
        """A dataset generated without redundant columns yields an empty list."""
        dataset = self.generate_univariate_correlated_dataset(num_features=5, num_redundant=0)
        dropped_names = tsfel.correlated_features(dataset, threshold=0.90, drop_correlated=False)
        self.assertEqual(dropped_names, [])

    def test_empty_dataframe(self):
        """An empty DataFrame yields an empty list."""
        empty_frame = pd.DataFrame()
        dropped_names = tsfel.correlated_features(empty_frame, threshold=0.90, drop_correlated=False)
        self.assertEqual(dropped_names, [])

    def test_different_thresholds(self):
        """The number of flagged features matches the expectation per threshold."""
        dataset = self.generate_univariate_correlated_dataset()
        # Each pair is (threshold, expected number of flagged features).
        cases = [(0.05, 5), (0.5, 5), (0.95, 3)]

        for threshold, expected_count in cases:
            dropped_names = tsfel.correlated_features(dataset, threshold=threshold, drop_correlated=False)
            self.assertIsInstance(dropped_names, list)
            self.assertEqual(len(dropped_names), expected_count)

    @staticmethod
    def generate_univariate_correlated_dataset(num_features: int = 10, num_redundant: int = 5) -> pd.DataFrame:
        """Build a reproducible synthetic dataset containing redundant features.

        Parameters
        ----------
        num_features: int
            Total number of feature columns to generate.
        num_redundant: int
            How many of those columns are redundant, as passed to
            ``make_classification(n_redundant=...)``.

        Returns
        -------
        pd.DataFrame
            A 1000-row frame whose columns are named ``Feature_1`` through
            ``Feature_<num_features>``.
        """
        # The class labels are discarded; only the feature matrix is needed.
        samples, _ = make_classification(
            n_samples=1000,
            n_features=num_features,
            n_redundant=num_redundant,
            random_state=42,
        )

        # One-based, human-readable column labels.
        labels = [f"Feature_{position}" for position in range(1, num_features + 1)]
        return pd.DataFrame(samples, columns=labels)


# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
50 changes: 40 additions & 10 deletions tsfel/utils/signal_processing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
Expand Down Expand Up @@ -74,21 +76,46 @@ def merge_time_series(data, fs_resample, time_unit):
return pd.DataFrame(data=data_new[:, 1:], columns=header_values[1:])


def correlated_features(features, threshold=0.95, drop_correlated=True):
"""Compute pairwise correlation of features using pearson method.
def correlated_features(
features: pd.DataFrame,
threshold: float = 0.95,
method: str = "pearson",
drop_correlated: bool = False,
) -> Union[List[str], Tuple[List[str], pd.DataFrame]]:
"""Identify and optionally remove highly correlated features from a
DataFrame.
This function computes the pairwise correlation of features using
pandas.DataFrame.corr() and identifies features that have an absolute value
of the correlation coefficient higher than the specified threshold. The
correlation method can be any of those supported by pandas, such as
'pearson' (the default), 'spearman', or 'kendall'.
.. deprecated:: 0.1.11
tsfel.correlated_features will be deprecated in tsfel 0.1.11 and will be
removed in a later release. It will be replaced by a future
DropCorrelated feature class using fit and transform logic.
Parameters
----------
features : DataFrame
features
threshold :
correlation value for removing highly correlated features
features : pd.DataFrame
A DataFrame containing the feature data.
threshold : float
The correlation value for removing highly correlated features.
method : str
Method to compute correlation. Must be one of {'pearson', 'kendall', 'spearman'}
drop_correlated : bool
If True, drop the highly correlated features from the DataFrame.
Returns
-------
DataFrame
correlated features names
Union[List[str], Tuple[List[str], pd.DataFrame]]:
- A list of names of highly correlated features.
- If `drop_correlated` is True, a tuple containing the list of dropped feature names and the updated DataFrame with those features removed.
"""
corr_matrix = features.corr().abs()

corr_matrix = features.corr(method=method).abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find index and column name of features with correlation greater than the threshold
Expand All @@ -97,4 +124,7 @@ def correlated_features(features, threshold=0.95, drop_correlated=True):
if drop_correlated:
features.drop(to_drop, axis=1, inplace=True)

return to_drop, features
return to_drop, features

else:
return to_drop

0 comments on commit c001ee6

Please sign in to comment.