Skip to content

Commit

Permalink
Merge pull request #34 from WenjieDu/dev
Browse files Browse the repository at this point in the history
Update seq_missing and block_missing
  • Loading branch information
WenjieDu committed Jun 3, 2024
2 parents fe79111 + 41886b7 commit fabfd30
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 49 deletions.
33 changes: 20 additions & 13 deletions pygrinder/block_missing/block_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,30 @@

import numpy as np
import torch
from tsdb.utils.logging import logger


def random_select_start_indices(
block_width,
feature_idx,
step_idx,
p,
hit_rate,
n_samples,
n_steps,
n_features,
) -> np.ndarray:
all_feature_indices = [
i * n_features + j for i in range(n_samples) for j in feature_idx
]

if hit_rate > 1:
logger.warning(f"hit_rate={hit_rate} > 1")

all_feature_start_indices = [i * n_steps for i in all_feature_indices]
selected_feature_start_indices = np.random.choice(
all_feature_start_indices,
math.ceil(len(all_feature_start_indices) * p),
replace=False,
math.ceil(len(all_feature_start_indices) * hit_rate),
replace=hit_rate > 1,
)
selected_feature_start_indices = np.asarray(selected_feature_start_indices)

Expand All @@ -47,7 +52,7 @@ def random_select_start_indices(

def _block_missing_numpy(
X: np.ndarray,
p: float,
factor: float,
block_len: int,
block_width: int,
feature_idx: list = None,
Expand All @@ -57,8 +62,9 @@ def _block_missing_numpy(
X = np.copy(X)

n_samples, n_steps, n_features = X.shape
hit_rate = factor * n_steps * n_features / (block_len * block_width)
start_indices = random_select_start_indices(
block_width, feature_idx, step_idx, p, n_samples, n_steps, n_features
block_width, feature_idx, step_idx, hit_rate, n_samples, n_steps, n_features
)

X = X.transpose(0, 2, 1)
Expand All @@ -73,7 +79,7 @@ def _block_missing_numpy(

def _block_missing_torch(
X: torch.Tensor,
p: float,
factor: float,
block_len: int,
block_width: int,
feature_idx: list = None,
Expand All @@ -83,8 +89,9 @@ def _block_missing_torch(
X = torch.clone(X)

n_samples, n_steps, n_features = X.shape
hit_rate = factor * n_steps * n_features / (block_len * block_width)
start_indices = random_select_start_indices(
block_width, feature_idx, step_idx, p, n_samples, n_steps, n_features
block_width, feature_idx, step_idx, hit_rate, n_samples, n_steps, n_features
)

X = X.transpose(1, 2)
Expand All @@ -99,7 +106,7 @@ def _block_missing_torch(

def block_missing(
X: Union[np.ndarray, torch.Tensor],
p: float,
factor: float,
block_len: int,
block_width: int,
feature_idx: list = None,
Expand All @@ -109,7 +116,7 @@ def block_missing(
X = np.asarray(X)
n_samples, n_steps, n_features = X.shape

assert 0 < p <= 1, f"p must be in range (0, 1), but got {p}"
# assert 0 < p <= 1, f"p must be in range (0, 1), but got {p}"

assert isinstance(
block_len, int
Expand All @@ -132,7 +139,7 @@ def block_missing(
max(feature_idx) <= n_features
), f"values in `feature_idx` must be <= {n_features}, but got {max(feature_idx)}"
else:
feature_idx = list(range(n_features - block_width))
feature_idx = list(range(n_features - block_width + 1))

if step_idx is not None:
assert isinstance(
Expand All @@ -146,12 +153,12 @@ def block_missing(
n_steps - max(step_idx) >= block_len
), f"n_steps - max(step_idx) must be >= block_len, but got {n_steps - max(step_idx)}"
else:
step_idx = list(range(n_steps - block_len))
step_idx = list(range(n_steps - block_len + 1))

if isinstance(X, np.ndarray):
corrupted_X = _block_missing_numpy(
X,
p,
factor,
block_len,
block_width,
feature_idx,
Expand All @@ -160,7 +167,7 @@ def block_missing(
elif isinstance(X, torch.Tensor):
corrupted_X = _block_missing_torch(
X,
p,
factor,
block_len,
block_width,
feature_idx,
Expand Down
16 changes: 11 additions & 5 deletions pygrinder/sequential_missing/seq_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@

import numpy as np
import torch
from tsdb.utils.logging import logger


def random_select_start_indices(
feature_idx,
step_idx,
p,
hit_rate,
n_samples,
n_steps,
n_features,
Expand All @@ -29,10 +30,13 @@ def random_select_start_indices(
]
all_feature_start_indices = [i * n_steps for i in all_feature_indices]

if hit_rate > 1:
logger.warning(f"hit_rate={hit_rate} > 1")

selected_feature_start_indices = np.random.choice(
all_feature_start_indices,
math.ceil(len(all_feature_start_indices) * p),
replace=False,
math.ceil(len(all_feature_start_indices) * hit_rate),
replace=hit_rate > 1,
)
selected_feature_start_indices = np.asarray(selected_feature_start_indices)

Expand All @@ -57,8 +61,9 @@ def _seq_missing_numpy(
X = np.copy(X)

n_samples, n_steps, n_features = X.shape
hit_rate = p * n_steps / seq_len
start_indices = random_select_start_indices(
feature_idx, step_idx, p, n_samples, n_steps, n_features
feature_idx, step_idx, hit_rate, n_samples, n_steps, n_features
)

X = X.transpose(0, 2, 1)
Expand All @@ -82,8 +87,9 @@ def _seq_missing_torch(
X = torch.clone(X)

n_samples, n_steps, n_features = X.shape
hit_rate = p * n_steps / seq_len
start_indices = random_select_start_indices(
feature_idx, step_idx, p, n_samples, n_steps, n_features
feature_idx, step_idx, hit_rate, n_samples, n_steps, n_features
)

X = X.transpose(1, 2)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ numpy
scikit-learn
pandas
torch
tsdb
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"scipy",
"pandas",
"torch",
"tsdb",
],
setup_requires=["setuptools>=38.6.0"],
classifiers=[
Expand Down
50 changes: 19 additions & 31 deletions tests/test_pygrinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,53 +145,41 @@ def test_4_seq_missing(self):
n_samples, n_steps, n_features = 128, 24, 10
X = np.random.randn(n_samples, n_steps, n_features)

p, seq_len = 1, 3
p, seq_len = 0.1, 3
X_with_seq_missing = seq_missing(X, p, seq_len)
actual_missing_rate = calc_missing_rate(X_with_seq_missing)
assert round(actual_missing_rate, 5) == seq_len / n_steps * p
print(f"sequence {p} actual_missing_rate: {actual_missing_rate}")
# assert round(actual_missing_rate, 1) == p

p, seq_len = 0.6, 3
p, seq_len = 0.5, 3
X_with_seq_missing = seq_missing(X, p, seq_len)
actual_missing_rate = calc_missing_rate(X_with_seq_missing)
assert round(actual_missing_rate, 5) == seq_len / n_steps * p
print(f"sequence {p} actual_missing_rate: {actual_missing_rate}")
# assert round(actual_missing_rate, 1) == p

X = torch.from_numpy(X)
p, seq_len = 0.6, 3
p, seq_len = 0.9, 3
X_with_seq_missing = seq_missing(X, p, seq_len)
actual_missing_rate = calc_missing_rate(X_with_seq_missing)
assert round(actual_missing_rate, 5) == seq_len / n_steps * p
print(f"sequence {p} actual_missing_rate: {actual_missing_rate}")
# assert round(actual_missing_rate, 1) == p

def test_4_block_missing(self):
n_samples, n_steps, n_features = 128, 24, 10
X = np.random.randn(n_samples, n_steps, n_features)

p, block_len, block_width = 1, 3, 2
X_with_block_missing = block_missing(X, p, block_len, block_width)
factor, block_len, block_width = 0.1, 5, 5
X_with_block_missing = block_missing(X, factor, block_len, block_width)
actual_missing_rate = calc_missing_rate(X_with_block_missing)
print(f"actual_missing_rate: {actual_missing_rate}")
# # assertion may not hold because missing blocks may overlap
# assert (
# round(actual_missing_rate, 5)
# == (block_len * block_width) / (n_steps * n_features) * p
# )

p, block_len, block_width = 0.6, 3, 2
X_with_block_missing = block_missing(X, p, block_len, block_width)
print(f"block {factor} actual_missing_rate: {actual_missing_rate}")

factor, block_len, block_width = 0.5, 5, 5
X_with_block_missing = block_missing(X, factor, block_len, block_width)
actual_missing_rate = calc_missing_rate(X_with_block_missing)
print(f"actual_missing_rate: {actual_missing_rate}")
# # assertion may not hold because missing blocks may overlap
# assert (
# round(actual_missing_rate, 5)
# == (block_len * block_width) / (n_steps * n_features) * p
# )
print(f"block {factor} actual_missing_rate: {actual_missing_rate}")

X = torch.from_numpy(X)
p, block_len, block_width = 0.6, 3, 2
X_with_block_missing = block_missing(X, p, block_len, block_width)
factor, block_len, block_width = 0.9, 5, 5
X_with_block_missing = block_missing(X, factor, block_len, block_width)
actual_missing_rate = calc_missing_rate(X_with_block_missing)
print(f"actual_missing_rate: {actual_missing_rate}")
# # assertion may not hold because missing blocks may overlap
# assert (
# round(actual_missing_rate, 5)
# == (block_len * block_width) / (n_steps * n_features) * p
# )
print(f"block {factor} actual_missing_rate: {actual_missing_rate}")

0 comments on commit fabfd30

Please sign in to comment.