max_cat added

perpetual-ml · Nov 19, 2024 · 61de63c · 61de63c
1 parent 54c2c30
commit 61de63c
Show file tree

Hide file tree

Showing 7 changed files with 136 additions and 33 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "perpetual"
-version = "0.7.3"
+version = "0.7.4"
 edition = "2021"
 authors = ["Mutlu Simsek <[email protected]>"]
 homepage = "https://perpetual-ml.com"

diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ Documentation for the Python API can be found [here](https://perpetual-ml.github
 
 ## Installation
 
-The package can be installed directly from [pypi](https://pypi.org/project/perpetual).
+The package can be installed directly from [pypi](https://pypi.org/project/perpetual):
 
 ```shell
 pip install perpetual
@@ -64,10 +64,10 @@ Using [conda-forge](https://anaconda.org/conda-forge/perpetual):
 conda install conda-forge::perpetual
 ```
 
-To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
+To use in a Rust project, get the package from [crates.io](https://crates.io/crates/perpetual):
 
 ```toml
-perpetual = "0.7.3"
+cargo add perpetual
 ```
 
 ## Contribution

diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-perpetual"
-version = "0.7.3"
+version = "0.7.4"
 edition = "2021"
 authors = ["Mutlu Simsek <[email protected]>"]
 homepage = "https://perpetual-ml.com"
@@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]
 
 [dependencies]
 pyo3 = { version = "0.22.6", features = ["extension-module"] }
-perpetual_rs = {package="perpetual", version = "0.7.3", path = "../" }
+perpetual_rs = {package="perpetual", version = "0.7.4", path = "../" }
 numpy = "0.22.1"
 ndarray = "0.16.1"
 serde_plain = { version = "1.0" }

diff --git a/python-package/examples/fetch_openml.ipynb b/python-package/examples/fetch_openml.ipynb
@@ -0,0 +1,81 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from perpetual import PerpetualBooster\n",
+    "from sklearn.datasets import fetch_openml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = PerpetualBooster()\n",
+    "model.fit(data, target, budget=0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.number_of_trees"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "perpetual"
-version = "0.7.3"
+version = "0.7.4"
 description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
 license = { file = "LICENSE" }
 keywords = [

diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py
@@ -57,10 +57,11 @@ def __init__(
         memory_limit: Optional[float] = None,
         stopping_rounds: Optional[int] = None,
         max_bin: int = 256,
+        max_cat: int = 1000,
     ):
         """PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
         The following parameters can also be specified in the fit method to override the values in the constructor:
-            budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
+            budget, alpha, reset, categorical_features, timeout, iteration_limit, memory_limit, and stopping_rounds.
 
         Args:
             objective (str, optional): Learning objective function to be used for optimization.
@@ -104,21 +105,24 @@ def __init__(
                 - "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
             log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations, info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
             feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
-            budget: a positive number for fitting budget. Increasing this number will more
+            budget (float, optional): a positive number for fitting budget. Increasing this number will more
                 likely result in more boosting rounds and more increased predictive power.
                 Default value is 1.0.
-            alpha: only used in quantile regression.
-            reset: whether to reset the model or continue training.
-            categorical_features: The names or indices for categorical features.
-                `auto` for Polars or Pandas categorical data type.
-            timeout: optional fit timeout in seconds
-            iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
+            alpha (float, optional): only used in quantile regression.
+            reset (bool, optional): whether to reset the model or continue training.
+            categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
+                Defaults to `auto` for Polars or Pandas categorical data types.
+            timeout (float, optional): optional fit timeout in seconds
+            iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
                 The algorithm automatically stops for most of the cases before hitting this limit.
                 If you want to experiment with very high budget (>2.0), you can also increase this limit.
-            memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
+            memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
                 available memory and the algorithm requirements.
-            stopping_rounds: optional limit for auto stopping.
-            max_bin: number bins for feature discretization.
+            stopping_rounds (int, optional): optional limit for auto stopping.
+            max_bin (int, optional): maximum number of bins for feature discretization. Defaults to 256.
+            max_cat (int, optional): Maximum number of unique categories for a categorical feature.
+                Features with more categories will be treated as numerical.
+                Defaults to 1000.
 
         Raises:
             TypeError: Raised if an invalid dtype is passed.
@@ -181,6 +185,7 @@ def __init__(
         self.memory_limit = memory_limit
         self.stopping_rounds = stopping_rounds
         self.max_bin = max_bin
+        self.max_cat = max_cat
 
         booster = CratePerpetualBooster(
             objective=self.objective,
@@ -220,24 +225,26 @@ def fit(
             sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
                 training the model. If None is passed, a weight of 1 will be used for every record.
                 Defaults to None.
-            budget: a positive number for fitting budget. Increasing this number will more
+            budget (float, optional): a positive number for fitting budget. Increasing this number will more
                 likely result in more boosting rounds and more increased predictive power.
-                Default value is 1.0.
-            alpha: only used in quantile regression.
-            reset: whether to reset the model or continue training.
-            categorical_features: The names or indices for categorical features.
-                `auto` for Polars or Pandas categorical data type.
-            timeout: optional fit timeout in seconds
-            iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
+                Defaults to 1.0.
+            alpha (float, optional): only used in quantile regression.
+            reset (bool, optional): whether to reset the model or continue training.
+            categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
+                Defaults to `auto` for Polars or Pandas categorical data types.
+            timeout (float, optional): optional fit timeout in seconds
+            iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
                 The algorithm automatically stops for most of the cases before hitting this limit.
                 If you want to experiment with very high budget (>2.0), you can also increase this limit.
-            memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
+            memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
                 available memory and the algorithm requirements.
-            stopping_rounds: optional limit for auto stopping. Defaults to 3.
+            stopping_rounds (int, optional): optional limit for auto stopping. Defaults to 3.
         """
 
         features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
-            convert_input_frame(X, categorical_features or self.categorical_features)
+            convert_input_frame(
+                X, categorical_features or self.categorical_features, self.max_cat
+            )
         )
         self.n_features_ = cols
         self.cat_mapping = cat_mapping

diff --git a/python-package/python/perpetual/utils.py b/python-package/python/perpetual/utils.py
@@ -1,7 +1,11 @@
+import logging
 import numpy as np
 from typing import Dict, Iterable, List, Optional, Tuple
 
 
+logger = logging.getLogger(__name__)
+
+
 def type_df(df):
     library_name = type(df).__module__.split(".")[0]
     if type(df).__name__ == "DataFrame":
@@ -61,7 +65,7 @@ def convert_input_array(x, objective) -> np.ndarray:
 
 
 def convert_input_frame(
-    X, categorical_features
+    X, categorical_features, max_cat
 ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
     """Convert data to format needed by booster.
 
@@ -110,18 +114,28 @@ def convert_input_frame(
         categorical_features_ = [features_.index(c) for c in categorical_features]
 
     cat_mapping = {}  # key: feature_name, value: ordered category names
+    cat_to_num = []
     if categorical_features_:
         for i in categorical_features_:
             categories = np.unique(X_[:, i].astype(dtype="str", copy=False))
+            if len(categories) > max_cat:
+                cat_to_num.append(i)
+                logger.warning(
+                    f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
+                )
+                continue
             categories = [c for c in list(categories) if c != "nan"]
             categories.insert(0, "nan")
             cat_mapping[features_[i]] = categories
+        categorical_features_ = [
+            x for x in categorical_features_ if x not in cat_to_num
+        ]
 
     if cat_mapping:
-        print(f"Categorical features: {categorical_features_}")
-        print(f"Mapping of categories: {cat_mapping}")
+        logger.info(f"Categorical features: {categorical_features_}")
+        logger.info(f"Mapping of categories: {cat_mapping}")
+
         for feature_name, categories in cat_mapping.items():
-            feature_index = features_.index(feature_name)
 
             def f(x):
                 try:
@@ -133,6 +147,7 @@ def f(x):
                 except (ValueError, IndexError):
                     return np.nan
 
+            feature_index = features_.index(feature_name)
             X_[:, feature_index] = np.apply_along_axis(f, 1, X_)
 
     if not np.issubdtype(X_.dtype, "float64"):