Skip to content

Commit

Permalink
Merge pull request #163 from fraunhoferportugal/correlated_features
Browse files Browse the repository at this point in the history
Option to remove correlated features
  • Loading branch information
mbarandas authored Sep 11, 2024
2 parents b4ddb42 + c001ee6 commit 222cde2
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 23 deletions.
4 changes: 2 additions & 2 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ statistics = True
# SC = flake8-spellcheck
ignore = E203, E211, E265, E501, E999, F401, F821, W503, W505, SC100, SC200, C400, C401, C402, B008, E800, E741, F403, F405, C901, B028, E226
max-line-length = 120
max-doc-length = 120
max-doc-length = 80
import-order-style = google
docstring-convention = google
inline-quotes = "
strictness=short
dictionaries=en_US,python,technical,pandas
min-python-version = 3.7.0
min-python-version = 3.8.0
exclude = .git,.tox,.nox,venv,.venv,.venv-docs,.venv-dev,.venv-note,.venv-dempy,docs,test
max-complexity = 10
#spellcheck-targets=comments
10 changes: 7 additions & 3 deletions notebooks/TSFEL_HAR_Example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,7 @@
"outputs": [],
"source": [
"# Highly correlated features are removed\n",
"corr_features = tsfel.correlated_features(X_train)\n",
"X_train.drop(corr_features, axis=1, inplace=True)\n",
"corr_features, X_train = tsfel.correlated_features(X_train, drop_correlated=True)\n",
"X_test.drop(corr_features, axis=1, inplace=True)\n",
"\n",
"# Remove low variance features\n",
Expand Down Expand Up @@ -281,6 +280,11 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "tsfel",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
Expand All @@ -291,7 +295,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
3 changes: 1 addition & 2 deletions notebooks/TSFEL_SMARTWATCH_HAR_Example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -367,8 +367,7 @@
"outputs": [],
"source": [
"# Highly correlated features are removed\n",
"corr_features = tsfel.correlated_features(x_train_feat)\n",
"x_train_feat.drop(corr_features, axis=1, inplace=True)\n",
"corr_features, x_train_feat = tsfel.correlated_features(x_train_feat, drop_correlated=True)\n",
"x_test_feat.drop(corr_features, axis=1, inplace=True)\n",
"\n",
"# Remove low variance features\n",
Expand Down
3 changes: 1 addition & 2 deletions notebooks/TSFEL_predicting_NormalVsPathologicalknee.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,7 @@
"outputs": [],
"source": [
"# Highly correlated features are removed\n",
"corr_features = tsfel.correlated_features(X_train)\n",
"X_train.drop(corr_features, axis=1, inplace=True)\n",
"corr_features, X_train = tsfel.correlated_features(X_train, drop_correlated=True)\n",
"X_test.drop(corr_features, axis=1, inplace=True)\n",
"\n",
"# Remove low variance features\n",
Expand Down
7 changes: 3 additions & 4 deletions requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pre-commit==3.7.0
matplotlib==3.8.2
seaborn==0.13.2
neurokit==0.2.10
ipython==8.27.0
matplotlib==3.9.2
pre-commit==3.8.0
77 changes: 77 additions & 0 deletions tests/test_signal_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""A test suite for the signal processing methods.
The name will likely change after the major refactor.
"""

import unittest

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

import tsfel


class SignalProcessingTestCase(unittest.TestCase):
    """Unit tests for signal processing methods."""

    def test_univariate_correlated_features(self):
        """Correlated features are reported and dropped when requested."""
        X = self.generate_univariate_correlated_dataset()
        features_name_to_drop, filtered_X = tsfel.correlated_features(X, threshold=0.90, drop_correlated=True)
        np.testing.assert_equal(
            (features_name_to_drop, np.shape(filtered_X)),
            (["Feature_4", "Feature_5", "Feature_7", "Feature_8", "Feature_9"], (1000, 5)),
        )

    def test_no_correlated_features(self):
        """A dataset generated without redundant features yields no names."""
        X = self.generate_univariate_correlated_dataset(num_features=5, num_redundant=0)
        features_name_to_drop = tsfel.correlated_features(X, threshold=0.90, drop_correlated=False)
        self.assertEqual(features_name_to_drop, [])

    def test_empty_dataframe(self):
        """An empty DataFrame yields an empty list of names."""
        X = pd.DataFrame()
        features_name_to_drop = tsfel.correlated_features(X, threshold=0.90, drop_correlated=False)
        self.assertEqual(features_name_to_drop, [])

    def test_different_thresholds(self):
        """The number of reported features follows the chosen threshold."""
        X = self.generate_univariate_correlated_dataset()
        thresholds = [0.05, 0.5, 0.95]
        expected_features_to_remove = [5, 5, 3]

        for threshold, expected_n_features in zip(thresholds, expected_features_to_remove):
            # subTest keeps the remaining thresholds running after a failure
            # and labels each failure with the threshold that triggered it.
            with self.subTest(threshold=threshold):
                features_name_to_drop = tsfel.correlated_features(X, threshold=threshold, drop_correlated=False)
                self.assertIsInstance(features_name_to_drop, list)
                self.assertEqual(len(features_name_to_drop), expected_n_features)

    @staticmethod
    def generate_univariate_correlated_dataset(num_features: int = 10, num_redundant: int = 5) -> pd.DataFrame:
        """Generate a synthetic dataset with correlated features.

        Parameters
        ----------
        num_features : int
            Number of features in the dataset.
        num_redundant : int
            Number of redundant features that are correlated.

        Returns
        -------
        pd.DataFrame
            DataFrame with the generated dataset. It has 1000 rows and its
            columns are named ``Feature_1`` ... ``Feature_<num_features>``.
        """
        # Fixed random_state keeps the dataset, and therefore the expected
        # feature names asserted by the tests, reproducible.
        X, _ = make_classification(
            n_samples=1000,
            n_features=num_features,
            n_redundant=num_redundant,
            random_state=42,
        )

        # Create a DataFrame with appropriate column names
        column_names = [f"Feature_{i}" for i in range(1, num_features + 1)]
        X_df = pd.DataFrame(X, columns=column_names)

        return X_df


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
53 changes: 43 additions & 10 deletions tsfel/utils/signal_processing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
Expand Down Expand Up @@ -74,24 +76,55 @@ def merge_time_series(data, fs_resample, time_unit):
return pd.DataFrame(data=data_new[:, 1:], columns=header_values[1:])


def correlated_features(
    features: pd.DataFrame,
    threshold: float = 0.95,
    method: str = "pearson",
    drop_correlated: bool = False,
) -> Union[List[str], Tuple[List[str], pd.DataFrame]]:
    """Identify and optionally remove highly correlated features from a
    DataFrame.

    The pairwise correlation of the features is computed with
    ``pandas.DataFrame.corr`` using the requested ``method``. A feature is
    flagged when the absolute value of its correlation coefficient with any
    feature placed before it in the column order is higher than
    ``threshold``.

    .. deprecated:: 0.1.11
        tsfel.correlated_features will be deprecated in tsfel 0.1.11 and will
        be removed in other upcoming releases. It will be replaced by a future
        DropCorrelated feature class using fit and transform logic.

    Parameters
    ----------
    features : pd.DataFrame
        A DataFrame containing the feature data.
    threshold : float
        The correlation value for removing highly correlated features.
    method : str
        Method to compute correlation, forwarded to
        ``pandas.DataFrame.corr``. Must be one of
        {'pearson', 'kendall', 'spearman'}.
    drop_correlated : bool
        If True, drop the highly correlated features from the DataFrame.
        The drop is performed in place: the DataFrame passed in is modified
        and that same object is returned.

    Returns
    -------
    Union[List[str], Tuple[List[str], pd.DataFrame]]
        - If `drop_correlated` is False, a list of names of highly
          correlated features.
        - If `drop_correlated` is True, a tuple containing the list of
          dropped feature names and the updated DataFrame with those
          features removed.
    """
    corr_matrix = features.corr(method=method).abs()
    corr_values = corr_matrix.to_numpy(dtype=float)

    # Keep only the strict upper triangle so each pair of features is
    # considered once and the diagonal (self-correlation) is ignored.
    upper_triangle = np.triu(np.ones(corr_values.shape, dtype=bool), k=1)

    # Work positionally on the array rather than looking columns up by label,
    # so duplicated column names are evaluated column by column. NaN
    # correlations compare as False and are therefore never flagged.
    above_threshold = (corr_values > threshold) & upper_triangle
    to_drop = corr_matrix.columns[above_threshold.any(axis=0)].tolist()

    if drop_correlated:
        # In-place drop is kept for backward compatibility with callers that
        # rely on the input DataFrame being updated.
        features.drop(to_drop, axis=1, inplace=True)
        return to_drop, features

    return to_drop

0 comments on commit 222cde2

Please sign in to comment.