From b5ac4ad599ecdb5ea05e189721c44250ed612f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Mon, 2 Dec 2024 19:39:44 +0300 Subject: [PATCH 1/3] improved logging and added benchmarks --- Cargo.toml | 4 ++-- README.md | 21 +++++++++++++++- python-package/Cargo.toml | 10 ++++---- python-package/examples/fetch_openml.ipynb | 28 ++++++++++++++++++---- python-package/examples/toy_datasets.ipynb | 22 +++++++++++++---- python-package/pyproject.toml | 2 +- src/booster.rs | 23 +++++++++--------- 7 files changed, 81 insertions(+), 29 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ad10ab9..71e9acd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.7.8" +version = "0.7.9" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -27,7 +27,7 @@ serde = { version = "1.0.215", features = ["derive"] } approx = "0.5" log = "0.4" rand = "0.8.5" -sysinfo = "0.32.0" +sysinfo = "0.32.1" [dev-dependencies] criterion = "0.5" diff --git a/README.md b/README.md index ce4c459..88c3f4b 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ [![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual) [![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual) [![Static Badge](https://img.shields.io/badge/join-discord-blue?logo=discord)](https://discord.gg/AyUK7rr6wy) +![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual) @@ -33,7 +34,25 @@ The following table summarizes the results for the [Cover Types](https://scikit- | ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- | | 0.9 | 100 | 0.091 | 0.084 | 72x | 78x | -You can reproduce the results using the scripts in the [examples](./python-package/examples) folder. 
+The results can be reproduced using the scripts in the [examples](./python-package/examples) folder. + +PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best quality preset), the current leader in the [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). The 10 datasets with the largest number of rows are selected from [OpenML datasets](https://www.openml.org/). The results are summarized in the following table for regression tasks: + +| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE | +| -------------------------------------------- | --------------------------- | ----------------------------------------------------------------- | -------------- | --------------------------- | ----------------------------------------------------------------- | -------------- | +| [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 | 28.8 | +| [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 | 1.084 | OOM | OOM | OOM | +| [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 | 2.51 | 1922 | 97.6 | 2.53 | +| [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 | 0.721 | +| [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 | 0.0615 | 47 | 5.0 | 0.0662 | +| [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 | 1.047 | 278 | 5.1 | 1.487 | +| [poker](https://www.openml.org/t/10102) | 38 | 0.6 | 0.256 | 41 | 1.2 | 0.722 | +| [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 | 0.420 | 870 | 24.5 | 0.421 | +| [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 | 19.0 | 107 | 3.2 | 20.5 | +| [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 | 836.5 | 51 | 0.2 | 957.1 | +| average | 465 | 3.9 | - | 464 | 19.7 | - | + +PerpetualBooster outperformed AutoGluon on 8 out of 10 datasets, training equally fast and inferring 5x faster. 
The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark). ## Usage diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index 8125ef7..f639deb 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.7.8" +version = "0.7.9" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -18,10 +18,10 @@ name = "perpetual" crate-type = ["cdylib", "rlib"] [dependencies] -pyo3 = { version = "0.22.6", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.7.8", path = "../" } -numpy = "0.22.1" +pyo3 = { version = "0.23.2", features = ["extension-module"] } +perpetual_rs = {package="perpetual", version = "0.7.9", path = "../" } +numpy = "0.23.0" ndarray = "0.16.1" serde_plain = { version = "1.0" } serde = { version = "1.0.215" } -pyo3-log = "0.11" +pyo3-log = "0.12.0" diff --git a/python-package/examples/fetch_openml.ipynb b/python-package/examples/fetch_openml.ipynb index 5c12f1f..5db4832 100644 --- a/python-package/examples/fetch_openml.ipynb +++ b/python-package/examples/fetch_openml.ipynb @@ -6,8 +6,10 @@ "metadata": {}, "outputs": [], "source": [ + "import logging\n", "from perpetual import PerpetualBooster\n", - "from sklearn.datasets import fetch_openml" + "from sklearn.datasets import fetch_openml\n", + "from importlib.metadata import version" ] }, { @@ -16,7 +18,25 @@ "metadata": {}, "outputs": [], "source": [ - "data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)" + "logging.basicConfig(level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"perpetual: {version('perpetual')}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data, target = fetch_openml(data_id=45667, return_X_y=True, 
as_frame=True)" ] }, { @@ -43,8 +63,8 @@ "metadata": {}, "outputs": [], "source": [ - "model = PerpetualBooster()\n", - "model.fit(data, target, budget=0.1)" + "model = PerpetualBooster(objective=\"SquaredLoss\", log_iterations=1)\n", + "model.fit(data, target, budget=0.5)" ] }, { diff --git a/python-package/examples/toy_datasets.ipynb b/python-package/examples/toy_datasets.ipynb index bab11bf..d28cdd2 100644 --- a/python-package/examples/toy_datasets.ipynb +++ b/python-package/examples/toy_datasets.ipynb @@ -2,11 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", + "import logging\n", "import pandas as pd\n", "from sklearn.datasets import load_breast_cancer, load_iris\n", "from sklearn.ensemble import RandomForestClassifier\n", @@ -27,6 +28,15 @@ "print(f\"perpetual: {version('perpetual')}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -36,6 +46,8 @@ "def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n", " start = time.time()\n", " model.fit(X_train, y_train, budget=budget) if budget else model.fit(X_train, y_train)\n", + " if budget:\n", + " print(model.number_of_trees)\n", " duration = time.time() - start\n", " return duration, accuracy_score(y_test, model.predict(X_test)), log_loss(y_test, model.predict_proba(X_test))\n", "\n", @@ -44,13 +56,13 @@ "\n", "for name, (X, y) in datasets.items():\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - " pb = PerpetualBooster(objective=\"LogLoss\")\n", + " pb = PerpetualBooster(objective=\"LogLoss\", log_iterations=1, stopping_rounds=1, iteration_limit=1000)\n", " rf = RandomForestClassifier()\n", " results = pd.concat([results,\n", " pd.DataFrame([[name, \"Perpetual\", \"0.1\", 
*evaluate(pb, X_train, y_train, X_test, y_test, budget=0.1)]], columns=results.columns),\n", - " pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=1.0)]], columns=results.columns),\n", - " pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n", - " pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n", + " #pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=1.0)]], columns=results.columns),\n", + " #pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n", + " #pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n", " ],\n", " ignore_index=True)" ] diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index e6b0a43..d442b8d 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.7.8" +version = "0.7.9" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ diff --git a/src/booster.rs b/src/booster.rs index 274ae4d..a4b1306 100644 --- a/src/booster.rs +++ b/src/booster.rs @@ -533,27 +533,28 @@ impl PerpetualBooster { } } + if tree.stopper != TreeStopper::LossDecrement { + n_low_loss_rounds += 1; + } else { + n_low_loss_rounds = 0; + } + + (grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha); + loss = calc_loss(y, &yhat, sample_weight, alpha); + if verbose { info!( - "round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}", + "round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}, loss: {:4?}", i, tree.nodes.len(), tree.depth, 
tree.stopper, + loss.iter().sum::() / loss.len() as f32, ); } - if tree.stopper != TreeStopper::LossDecrement { - n_low_loss_rounds += 1; - } else { - n_low_loss_rounds = 0; - } - self.trees.push(tree); - (grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha); - loss = calc_loss(y, &yhat, sample_weight, alpha); - if stopping >= stopping_rounds.unwrap_or(STOPPING_ROUNDS) { info!("Auto stopping since stopping round limit reached."); break; @@ -573,7 +574,7 @@ impl PerpetualBooster { if self.log_iterations > 0 { info!( - "Finished training a booster with {0} trees in {1}.", + "Finished training a booster with {0} trees in {1} seconds.", self.trees.len(), start.elapsed().as_secs() ); From a8290a1238e9b1c625864af15ac929afc2ef538c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Mon, 2 Dec 2024 19:54:37 +0300 Subject: [PATCH 2/3] maturin update --- Cargo.toml | 2 +- python-package/Cargo.toml | 4 ++-- python-package/pyproject.toml | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 71e9acd..8e20118 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.7.9" +version = "0.7.10" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index f639deb..afd1332 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.7.9" +version = "0.7.10" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.23.2", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.7.9", path = "../" } +perpetual_rs = {package="perpetual", version = "0.7.10", path = "../" } numpy = "0.23.0" ndarray = "0.16.1" serde_plain = { version = "1.0" } diff --git 
a/python-package/pyproject.toml b/python-package/pyproject.toml index d442b8d..962feef 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -1,10 +1,10 @@ [build-system] -requires = ["maturin>=1.0.0,<1.7.0"] +requires = ["maturin"] build-backend = "maturin" [project] name = "perpetual" -version = "0.7.9" +version = "0.7.10" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ @@ -30,7 +30,7 @@ classifiers = [ ] [project.optional-dependencies] -dev = ["pandas", "polars", "pyarrow", "maturin==1.6.0", "pytest", "seaborn", "scikit-learn", "mkdocs-material", "mkdocstrings[python]", "mkdocs-autorefs", "ruff"] +dev = ["pandas", "polars", "pyarrow", "maturin", "pytest", "seaborn", "scikit-learn", "mkdocs-material", "mkdocstrings[python]", "mkdocs-autorefs", "ruff"] [tool.maturin] sdist-include = ["LICENSE", "README.md"] From 2f794d14729130903a5cbf34c10af7495f579bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Mon, 2 Dec 2024 20:02:45 +0300 Subject: [PATCH 3/3] maturin revert --- python-package/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 962feef..faf93e1 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin"] +requires = ["maturin>=1.0,<1.7"] build-backend = "maturin" [project] @@ -30,7 +30,7 @@ classifiers = [ ] [project.optional-dependencies] -dev = ["pandas", "polars", "pyarrow", "maturin", "pytest", "seaborn", "scikit-learn", "mkdocs-material", "mkdocstrings[python]", "mkdocs-autorefs", "ruff"] +dev = ["pandas", "polars", "pyarrow", "maturin==1.6.0", "pytest", "seaborn", "scikit-learn", "mkdocs-material", "mkdocstrings[python]", "mkdocs-autorefs", "ruff"] [tool.maturin] sdist-include = ["LICENSE", "README.md"]