Skip to content

Commit

Permalink
Merge pull request #35 from perpetual-ml/update_log_bm
Browse files Browse the repository at this point in the history
improved logging and added benchmarks
  • Loading branch information
deadsoul44 authored Dec 2, 2024
2 parents 58cbbaf + 2f794d1 commit 3361a40
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 30 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.7.8"
version = "0.7.10"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -27,7 +27,7 @@ serde = { version = "1.0.215", features = ["derive"] }
approx = "0.5"
log = "0.4"
rand = "0.8.5"
sysinfo = "0.32.0"
sysinfo = "0.32.1"

[dev-dependencies]
criterion = "0.5"
Expand Down
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
[![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual)
[![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual)
[![Static Badge](https://img.shields.io/badge/join-discord-blue?logo=discord)](https://discord.gg/AyUK7rr6wy)
![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual)

</div>

Expand All @@ -33,7 +34,25 @@ The following table summarizes the results for the [Cover Types](https://scikit-
| ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- |
| 0.9 | 100 | 0.091 | 0.084 | 72x | 78x |

You can reproduce the results using the scripts in the [examples](./python-package/examples) folder.
The results can be reproduced using the scripts in the [examples](./python-package/examples) folder.

PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best-quality preset), the current leader in the [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). The 10 datasets with the most rows are selected from [OpenML datasets](https://www.openml.org/). The results are summarized in the following table for regression tasks:

| OpenML Task                                          | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
| ---------------------------------------------------- | --------------------------- | ---------------------------- | -------------- | --------------------------- | ---------------------------- | -------------- |
| [Airlines_DepDelay_10M](https://openml.org/t/359929) | 518                         | 11.3                         | 29.0           | 520                         | 30.9                         | **28.8**       |
| [bates_regr_100](https://openml.org/t/361940)        | 3421                        | 15.1                         | **1.084**      | OOM                         | OOM                          | OOM            |
| [BNG(libras_move)](https://openml.org/t/7327)        | 1956                        | 4.2                          | **2.51**       | 1922                        | 97.6                         | 2.53           |
| [BNG(satellite_image)](https://openml.org/t/7326)    | 334                         | 1.6                          | 0.731          | 337                         | 10.0                         | **0.721**      |
| [COMET_MC](https://openml.org/t/14949)               | 44                          | 1.0                          | **0.0615**     | 47                          | 5.0                          | 0.0662         |
| [friedman1](https://openml.org/t/361939)             | 275                         | 4.2                          | **1.047**      | 278                         | 5.1                          | 1.487          |
| [poker](https://openml.org/t/10102)                  | 38                          | 0.6                          | **0.256**      | 41                          | 1.2                          | 0.722          |
| [subset_higgs](https://openml.org/t/361955)          | 868                         | 10.6                         | **0.420**      | 870                         | 24.5                         | 0.421          |
| [BNG(autoHorse)](https://openml.org/t/7319)          | 107                         | 1.1                          | **19.0**       | 107                         | 3.2                          | 20.5           |
| [BNG(pbc)](https://openml.org/t/7318)                | 48                          | 0.6                          | **836.5**      | 51                          | 0.2                          | 957.1          |
| average                                              | 465                         | 3.9                          | -              | 464                         | 19.7                         | -              |

The best RMSE for each task is shown in **bold**.

PerpetualBooster outperformed AutoGluon on 8 out of 10 datasets, with comparable training times and 5x faster inference. The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark).

## Usage

Expand Down
10 changes: 5 additions & 5 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.7.8"
version = "0.7.10"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -18,10 +18,10 @@ name = "perpetual"
crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.8", path = "../" }
numpy = "0.22.1"
pyo3 = { version = "0.23.2", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.10", path = "../" }
numpy = "0.23.0"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
serde = { version = "1.0.215" }
pyo3-log = "0.11"
pyo3-log = "0.12.0"
28 changes: 24 additions & 4 deletions python-package/examples/fetch_openml.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"from perpetual import PerpetualBooster\n",
"from sklearn.datasets import fetch_openml"
"from sklearn.datasets import fetch_openml\n",
"from importlib.metadata import version"
]
},
{
Expand All @@ -16,7 +18,25 @@
"metadata": {},
"outputs": [],
"source": [
"data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)"
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"perpetual: {version('perpetual')}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data, target = fetch_openml(data_id=45667, return_X_y=True, as_frame=True)"
]
},
{
Expand All @@ -43,8 +63,8 @@
"metadata": {},
"outputs": [],
"source": [
"model = PerpetualBooster()\n",
"model.fit(data, target, budget=0.1)"
"model = PerpetualBooster(objective=\"SquaredLoss\", log_iterations=1)\n",
"model.fit(data, target, budget=0.5)"
]
},
{
Expand Down
22 changes: 17 additions & 5 deletions python-package/examples/toy_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import logging\n",
"import pandas as pd\n",
"from sklearn.datasets import load_breast_cancer, load_iris\n",
"from sklearn.ensemble import RandomForestClassifier\n",
Expand All @@ -27,6 +28,15 @@
"print(f\"perpetual: {version('perpetual')}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -36,6 +46,8 @@
"def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n",
" start = time.time()\n",
" model.fit(X_train, y_train, budget=budget) if budget else model.fit(X_train, y_train)\n",
" if budget:\n",
" print(model.number_of_trees)\n",
" duration = time.time() - start\n",
" return duration, accuracy_score(y_test, model.predict(X_test)), log_loss(y_test, model.predict_proba(X_test))\n",
"\n",
Expand All @@ -44,13 +56,13 @@
"\n",
"for name, (X, y) in datasets.items():\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
" pb = PerpetualBooster(objective=\"LogLoss\")\n",
" pb = PerpetualBooster(objective=\"LogLoss\", log_iterations=1, stopping_rounds=1, iteration_limit=1000)\n",
" rf = RandomForestClassifier()\n",
" results = pd.concat([results,\n",
" pd.DataFrame([[name, \"Perpetual\", \"0.1\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=0.1)]], columns=results.columns),\n",
" pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=1.0)]], columns=results.columns),\n",
" pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n",
" pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n",
" #pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=1.0)]], columns=results.columns),\n",
" #pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n",
" #pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n",
" ],\n",
" ignore_index=True)"
]
Expand Down
4 changes: 2 additions & 2 deletions python-package/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[build-system]
requires = ["maturin>=1.0.0,<1.7.0"]
requires = ["maturin>=1.0,<1.7"]
build-backend = "maturin"

[project]
name = "perpetual"
version = "0.7.8"
version = "0.7.10"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
23 changes: 12 additions & 11 deletions src/booster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -533,27 +533,28 @@ impl PerpetualBooster {
}
}

if tree.stopper != TreeStopper::LossDecrement {
n_low_loss_rounds += 1;
} else {
n_low_loss_rounds = 0;
}

(grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha);
loss = calc_loss(y, &yhat, sample_weight, alpha);

if verbose {
info!(
"round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}",
"round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}, loss: {:4?}",
i,
tree.nodes.len(),
tree.depth,
tree.stopper,
loss.iter().sum::<f32>() / loss.len() as f32,
);
}

if tree.stopper != TreeStopper::LossDecrement {
n_low_loss_rounds += 1;
} else {
n_low_loss_rounds = 0;
}

self.trees.push(tree);

(grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha);
loss = calc_loss(y, &yhat, sample_weight, alpha);

if stopping >= stopping_rounds.unwrap_or(STOPPING_ROUNDS) {
info!("Auto stopping since stopping round limit reached.");
break;
Expand All @@ -573,7 +574,7 @@ impl PerpetualBooster {

if self.log_iterations > 0 {
info!(
"Finished training a booster with {0} trees in {1}.",
"Finished training a booster with {0} trees in {1} seconds.",
self.trees.len(),
start.elapsed().as_secs()
);
Expand Down

0 comments on commit 3361a40

Please sign in to comment.