Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

max_cat added #28

Merged
merged 2 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.7.3"
version = "0.7.4"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ Documentation for the Python API can be found [here](https://perpetual-ml.github

## Installation

The package can be installed directly from [pypi](https://pypi.org/project/perpetual).
The package can be installed directly from [pypi](https://pypi.org/project/perpetual):

```shell
pip install perpetual
Expand All @@ -64,10 +64,10 @@ Using [conda-forge](https://anaconda.org/conda-forge/perpetual):
conda install conda-forge::perpetual
```

To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
To use in a Rust project, add the package from [crates.io](https://crates.io/crates/perpetual):

```toml
perpetual = "0.7.3"
cargo add perpetual
```

## Contribution
Expand Down
4 changes: 2 additions & 2 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.7.3"
version = "0.7.4"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.3", path = "../" }
perpetual_rs = {package="perpetual", version = "0.7.4", path = "../" }
numpy = "0.22.1"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
Expand Down
81 changes: 81 additions & 0 deletions python-package/examples/fetch_openml.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from perpetual import PerpetualBooster\n",
"from sklearn.datasets import fetch_openml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = PerpetualBooster()\n",
"model.fit(data, target, budget=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.number_of_trees"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py311",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.7.3"
version = "0.7.4"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
51 changes: 29 additions & 22 deletions python-package/python/perpetual/booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ def __init__(
memory_limit: Optional[float] = None,
stopping_rounds: Optional[int] = None,
max_bin: int = 256,
max_cat: int = 1000,
):
"""PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
The following parameters can also be specified in the fit method to override the values in the constructor:
budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
budget, alpha, reset, categorical_features, timeout, iteration_limit, memory_limit, and stopping_rounds.

Args:
objective (str, optional): Learning objective function to be used for optimization.
Expand Down Expand Up @@ -104,21 +105,24 @@ def __init__(
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations. This info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
budget: a positive number for fitting budget. Increasing this number will more
budget (float, optional): a positive number for fitting budget. Increasing this number will more
likely result in more boosting rounds and greater predictive power.
Default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
`auto` for Polars or Pandas categorical data type.
timeout: optional fit timeout in seconds
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
alpha (float, optional): only used in quantile regression.
reset (bool, optional): whether to reset the model or continue training.
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
Defaults to `auto` for Polars or Pandas categorical data types.
timeout (float, optional): optional fit timeout in seconds
iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
In most cases the algorithm stops automatically before hitting this limit.
If you want to experiment with a very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
available memory and the algorithm requirements.
stopping_rounds: optional limit for auto stopping.
max_bin: number bins for feature discretization.
stopping_rounds (int, optional): optional limit for auto stopping.
max_bin (int, optional): maximum number of bins for feature discretization. Defaults to 256.
max_cat (int, optional): Maximum number of unique categories for a categorical feature.
Features with more categories will be treated as numerical.
Defaults to 1000.

Raises:
TypeError: Raised if an invalid dtype is passed.
Expand Down Expand Up @@ -181,6 +185,7 @@ def __init__(
self.memory_limit = memory_limit
self.stopping_rounds = stopping_rounds
self.max_bin = max_bin
self.max_cat = max_cat

booster = CratePerpetualBooster(
objective=self.objective,
Expand Down Expand Up @@ -220,24 +225,26 @@ def fit(
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
training the model. If None is passed, a weight of 1 will be used for every record.
Defaults to None.
budget: a positive number for fitting budget. Increasing this number will more
budget (float, optional): a positive number for fitting budget. Increasing this number will more
likely result in more boosting rounds and greater predictive power.
Default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
`auto` for Polars or Pandas categorical data type.
timeout: optional fit timeout in seconds
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
Defaults to 1.0.
alpha (float, optional): only used in quantile regression.
reset (bool, optional): whether to reset the model or continue training.
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
Defaults to `auto` for Polars or Pandas categorical data types.
timeout (float, optional): optional fit timeout in seconds
iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
In most cases the algorithm stops automatically before hitting this limit.
If you want to experiment with a very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
available memory and the algorithm requirements.
stopping_rounds: optional limit for auto stopping. Defaults to 3.
stopping_rounds (int, optional): optional limit for auto stopping. Defaults to 3.
"""

features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
convert_input_frame(X, categorical_features or self.categorical_features)
convert_input_frame(
X, categorical_features or self.categorical_features, self.max_cat
)
)
self.n_features_ = cols
self.cat_mapping = cat_mapping
Expand Down
23 changes: 19 additions & 4 deletions python-package/python/perpetual/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import logging
import numpy as np
from typing import Dict, Iterable, List, Optional, Tuple


logger = logging.getLogger(__name__)


def type_df(df):
library_name = type(df).__module__.split(".")[0]
if type(df).__name__ == "DataFrame":
Expand Down Expand Up @@ -61,7 +65,7 @@ def convert_input_array(x, objective) -> np.ndarray:


def convert_input_frame(
X, categorical_features
X, categorical_features, max_cat
) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
"""Convert data to format needed by booster.

Expand Down Expand Up @@ -110,18 +114,28 @@ def convert_input_frame(
categorical_features_ = [features_.index(c) for c in categorical_features]

cat_mapping = {} # key: feature_name, value: ordered category names
cat_to_num = []
if categorical_features_:
for i in categorical_features_:
categories = np.unique(X_[:, i].astype(dtype="str", copy=False))
if len(categories) > max_cat:
cat_to_num.append(i)
logger.warning(
f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
)
continue
categories = [c for c in list(categories) if c != "nan"]
categories.insert(0, "nan")
cat_mapping[features_[i]] = categories
categorical_features_ = [
x for x in categorical_features_ if x not in cat_to_num
]

if cat_mapping:
print(f"Categorical features: {categorical_features_}")
print(f"Mapping of categories: {cat_mapping}")
logger.info(f"Categorical features: {categorical_features_}")
logger.info(f"Mapping of categories: {cat_mapping}")

for feature_name, categories in cat_mapping.items():
feature_index = features_.index(feature_name)

def f(x):
try:
Expand All @@ -133,6 +147,7 @@ def f(x):
except (ValueError, IndexError):
return np.nan

feature_index = features_.index(feature_name)
X_[:, feature_index] = np.apply_along_axis(f, 1, X_)

if not np.issubdtype(X_.dtype, "float64"):
Expand Down
4 changes: 2 additions & 2 deletions scripts/make_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@

data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto")
features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto", 1000)
features_, titanic_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping)

data_test.to_csv("resources/titanic_test_df.csv", index=False)
Expand All @@ -97,6 +97,6 @@
df = fetch_openml(data_id=546)
X = df.data
y = df.target
features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto")
features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto", 1000)
pd.Series(sensory_flat).to_csv("resources/sensory_flat.csv", index=False, header=False)
pd.Series(y).to_csv("resources/sensory_y.csv", index=False, header=False)
Loading