Skip to content

Commit

Permalink
convert frame improved
Browse files Browse the repository at this point in the history
  • Loading branch information
deadsoul44 committed Nov 20, 2024
1 parent a938441 commit 8c669bc
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 37 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.7.5"
version = "0.7.6"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ conda install conda-forge::perpetual

To use it in a Rust project, add the package from [crates.io](https://crates.io/crates/perpetual):

```toml
```shell
cargo add perpetual
```

Expand Down
4 changes: 2 additions & 2 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.7.5"
version = "0.7.6"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.5", path = "../" }
perpetual_rs = {package="perpetual", version = "0.7.6", path = "../" }
numpy = "0.22.1"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
Expand Down
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.7.5"
version = "0.7.6"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
53 changes: 21 additions & 32 deletions python-package/python/perpetual/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,38 +117,33 @@ def convert_input_frame(
cat_to_num = []
if categorical_features_:
for i in categorical_features_:
categories = np.unique(X_[:, i].astype(dtype="str", copy=False))
categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True)
if categories[-1] == "nan":
categories = list(categories)
else:
categories = list(categories)
categories.append("nan")
inversed = inversed.astype(np.float32)

if len(categories) > max_cat:
cat_to_num.append(i)
logger.warning(
f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
)
categories = [c for c in list(categories) if c != "nan"]
categories.insert(0, "nan")
cat_mapping[features_[i]] = categories

feature_name = features_[i]
cat_mapping[feature_name] = categories
ind_nan = len(categories) - 1
inversed[inversed == ind_nan] = np.nan
X_[:, i] = inversed

categorical_features_ = [
x for x in categorical_features_ if x not in cat_to_num
]

if cat_mapping:
logger.info(f"Categorical features: {categorical_features_}")
logger.info(f"Mapping of categories: {cat_mapping}")

for feature_name, categories in cat_mapping.items():

def f(x):
try:
return (
np.nan
if str(x[feature_index]) == "nan"
else float(categories.index(str(x[feature_index])))
)
except (ValueError, IndexError):
return np.nan

feature_index = features_.index(feature_name)
X_[:, feature_index] = np.apply_along_axis(f, 1, X_)

if not np.issubdtype(X_.dtype, "float64"):
X_ = X_.astype(dtype="float64", copy=False)
flat_data = X_.ravel(order="F")
Expand Down Expand Up @@ -183,18 +178,12 @@ def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, i
if cat_mapping:
for feature_name, categories in cat_mapping.items():
feature_index = features_.index(feature_name)

def f(x):
try:
return (
np.nan
if str(x[feature_index]) == "nan"
else float(categories.index(str(x[feature_index])))
)
except (ValueError, IndexError):
return np.nan

X_[:, feature_index] = np.apply_along_axis(f, 1, X_)
x_enc = np.searchsorted(
categories, X_[:, feature_index].astype(str)
).astype(np.float32)
ind_nan = len(categories) - 1
x_enc[x_enc == ind_nan] = np.nan
X_[:, feature_index] = x_enc

if not np.issubdtype(X_.dtype, "float64"):
X_ = X_.astype(dtype="float64", copy=False)
Expand Down

0 comments on commit 8c669bc

Please sign in to comment.