diff --git a/Cargo.toml b/Cargo.toml index 9f3c160..940de76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.7.5" +version = "0.7.6" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" diff --git a/README.md b/README.md index a4fc3d0..ce4c459 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ conda install conda-forge::perpetual To use in a Rust project and to get the package from [crates.io](https://crates.io/crates/perpetual): -```toml +```shell cargo add perpetual ``` diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index ac40204..49b9b5d 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.7.5" +version = "0.7.6" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.22.6", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.7.5", path = "../" } +perpetual_rs = {package="perpetual", version = "0.7.6", path = "../" } numpy = "0.22.1" ndarray = "0.16.1" serde_plain = { version = "1.0" } diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index ce38f51..784f558 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.7.5" +version = "0.7.6" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ diff --git a/python-package/python/perpetual/utils.py b/python-package/python/perpetual/utils.py index 92b454e..df51ff5 100644 --- a/python-package/python/perpetual/utils.py +++ b/python-package/python/perpetual/utils.py @@ -117,38 +117,34 @@ def convert_input_frame( cat_to_num = [] if categorical_features_: for i in categorical_features_: - categories = np.unique(X_[:, i].astype(dtype="str", copy=False)) + categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True) + + categories = list(categories) + if "nan" in categories: + categories.remove("nan") + categories.insert(0, "nan") + + inversed = inversed + 1.0 + if len(categories) > max_cat: cat_to_num.append(i) logger.warning( f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold." ) - categories = [c for c in list(categories) if c != "nan"] - categories.insert(0, "nan") - cat_mapping[features_[i]] = categories + + feature_name = features_[i] + cat_mapping[feature_name] = categories + ind_nan = len(categories) + inversed[inversed == ind_nan] = np.nan + X_[:, i] = inversed + categorical_features_ = [ x for x in categorical_features_ if x not in cat_to_num ] - if cat_mapping: logger.info(f"Categorical features: {categorical_features_}") logger.info(f"Mapping of categories: {cat_mapping}") - for feature_name, categories in cat_mapping.items(): - - def f(x): - try: - return ( - np.nan - if str(x[feature_index]) == "nan" - else float(categories.index(str(x[feature_index]))) - ) - except (ValueError, IndexError): - return np.nan - - feature_index = features_.index(feature_name) - X_[:, feature_index] = np.apply_along_axis(f, 1, X_) - if not np.issubdtype(X_.dtype, "float64"): X_ = X_.astype(dtype="float64", copy=False) flat_data = X_.ravel(order="F") @@ -183,18 +179,13 @@ def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, i if cat_mapping: for feature_name, categories in cat_mapping.items(): feature_index = features_.index(feature_name) - - def f(x): - try: - return ( - np.nan - if str(x[feature_index]) == "nan" - else float(categories.index(str(x[feature_index]))) - ) - except (ValueError, IndexError): - return np.nan - - X_[:, feature_index] = np.apply_along_axis(f, 1, X_) + cats = categories.copy() + cats.remove("nan") + x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str)) + x_enc = x_enc + 1.0 + ind_nan = len(categories) + x_enc[x_enc == ind_nan] = np.nan + X_[:, feature_index] = x_enc if not np.issubdtype(X_.dtype, "float64"): X_ = X_.astype(dtype="float64", copy=False)