Skip to content

Commit

Permalink
Merge pull request #35 from perpetual-ml/update_log_bm
Browse files Browse the repository at this point in the history
improved logging and added benchmarks
  • Loading branch information
deadsoul44 authored Dec 2, 2024
2 parents 58cbbaf + 2f794d1 commit 3361a40
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 30 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.7.8"
version = "0.7.10"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -27,7 +27,7 @@ serde = { version = "1.0.215", features = ["derive"] }
approx = "0.5"
log = "0.4"
rand = "0.8.5"
sysinfo = "0.32.0"
sysinfo = "0.32.1"

[dev-dependencies]
criterion = "0.5"
Expand Down
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
[![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual)
[![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual)
[![Static Badge](https://img.shields.io/badge/join-discord-blue?logo=discord)](https://discord.gg/AyUK7rr6wy)
![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual)

</div>

Expand All @@ -33,7 +34,25 @@ The following table summarizes the results for the [Cover Types](https://scikit-
| ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- |
| 0.9 | 100 | 0.091 | 0.084 | 72x | 78x |

You can reproduce the results using the scripts in the [examples](./python-package/examples) folder.
The results can be reproduced using the scripts in the [examples](./python-package/examples) folder.

PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best-quality preset), the current leader in the [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). The 10 datasets with the most rows are selected from [OpenML datasets](https://www.openml.org/). The results are summarized in the following table for regression tasks:

| OpenML Task                                          | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
| ---------------------------------------------------- | --------------------------- | ---------------------------- | -------------- | --------------------------- | ---------------------------- | -------------- |
| [Airlines_DepDelay_10M](https://openml.org/t/359929) | 518                         | 11.3                         | 29.0           | 520                         | 30.9                         | **28.8**       |
| [bates_regr_100](https://openml.org/t/361940)        | 3421                        | 15.1                         | **1.084**      | OOM                         | OOM                          | OOM            |
| [BNG(libras_move)](https://openml.org/t/7327)        | 1956                        | 4.2                          | **2.51**       | 1922                        | 97.6                         | 2.53           |
| [BNG(satellite_image)](https://openml.org/t/7326)    | 334                         | 1.6                          | 0.731          | 337                         | 10.0                         | **0.721**      |
| [COMET_MC](https://openml.org/t/14949)               | 44                          | 1.0                          | **0.0615**     | 47                          | 5.0                          | 0.0662         |
| [friedman1](https://openml.org/t/361939)             | 275                         | 4.2                          | **1.047**      | 278                         | 5.1                          | 1.487          |
| [poker](https://openml.org/t/10102)                  | 38                          | 0.6                          | **0.256**      | 41                          | 1.2                          | 0.722          |
| [subset_higgs](https://openml.org/t/361955)          | 868                         | 10.6                         | **0.420**      | 870                         | 24.5                         | 0.421          |
| [BNG(autoHorse)](https://openml.org/t/7319)          | 107                         | 1.1                          | **19.0**       | 107                         | 3.2                          | 20.5           |
| [BNG(pbc)](https://openml.org/t/7318)                | 48                          | 0.6                          | **836.5**      | 51                          | 0.2                          | 957.1          |
| average                                              | 465                         | 3.9                          | -              | 464                         | 19.7                         | -              |

The best RMSE for each task is shown in **bold**.

PerpetualBooster outperformed AutoGluon on 8 out of 10 datasets, with comparable training times and 5x faster inference. The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark).

## Usage

Expand Down
10 changes: 5 additions & 5 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.7.8"
version = "0.7.10"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -18,10 +18,10 @@ name = "perpetual"
crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.8", path = "../" }
numpy = "0.22.1"
pyo3 = { version = "0.23.2", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.10", path = "../" }
numpy = "0.23.0"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
serde = { version = "1.0.215" }
pyo3-log = "0.11"
pyo3-log = "0.12.0"
28 changes: 24 additions & 4 deletions python-package/examples/fetch_openml.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"from perpetual import PerpetualBooster\n",
"from sklearn.datasets import fetch_openml"
"from sklearn.datasets import fetch_openml\n",
"from importlib.metadata import version"
]
},
{
Expand All @@ -16,7 +18,25 @@
"metadata": {},
"outputs": [],
"source": [
"data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)"
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"perpetual: {version('perpetual')}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data, target = fetch_openml(data_id=45667, return_X_y=True, as_frame=True)"
]
},
{
Expand All @@ -43,8 +63,8 @@
"metadata": {},
"outputs": [],
"source": [
"model = PerpetualBooster()\n",
"model.fit(data, target, budget=0.1)"
"model = PerpetualBooster(objective=\"SquaredLoss\", log_iterations=1)\n",
"model.fit(data, target, budget=0.5)"
]
},
{
Expand Down
22 changes: 17 additions & 5 deletions python-package/examples/toy_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import logging\n",
"import pandas as pd\n",
"from sklearn.datasets import load_breast_cancer, load_iris\n",
"from sklearn.ensemble import RandomForestClassifier\n",
Expand All @@ -27,6 +28,15 @@
"print(f\"perpetual: {version('perpetual')}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -36,6 +46,8 @@
"def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n",
" start = time.time()\n",
" model.fit(X_train, y_train, budget=budget) if budget else model.fit(X_train, y_train)\n",
" if budget:\n",
" print(model.number_of_trees)\n",
" duration = time.time() - start\n",
" return duration, accuracy_score(y_test, model.predict(X_test)), log_loss(y_test, model.predict_proba(X_test))\n",
"\n",
Expand All @@ -44,13 +56,13 @@
"\n",
"for name, (X, y) in datasets.items():\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
" pb = PerpetualBooster(objective=\"LogLoss\")\n",
" pb = PerpetualBooster(objective=\"LogLoss\", log_iterations=1, stopping_rounds=1, iteration_limit=1000)\n",
" rf = RandomForestClassifier()\n",
" results = pd.concat([results,\n",
" pd.DataFrame([[name, \"Perpetual\", \"0.1\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=0.1)]], columns=results.columns),\n",
" pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=1.0)]], columns=results.columns),\n",
" pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n",
" pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n",
" #pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=1.0)]], columns=results.columns),\n",
" #pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n",
" #pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n",
" ],\n",
" ignore_index=True)"
]
Expand Down
4 changes: 2 additions & 2 deletions python-package/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[build-system]
requires = ["maturin>=1.0.0,<1.7.0"]
requires = ["maturin>=1.0,<1.7"]
build-backend = "maturin"

[project]
name = "perpetual"
version = "0.7.8"
version = "0.7.10"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
23 changes: 12 additions & 11 deletions src/booster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -533,27 +533,28 @@ impl PerpetualBooster {
}
}

if tree.stopper != TreeStopper::LossDecrement {
n_low_loss_rounds += 1;
} else {
n_low_loss_rounds = 0;
}

(grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha);
loss = calc_loss(y, &yhat, sample_weight, alpha);

if verbose {
info!(
"round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}",
"round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}, loss: {:4?}",
i,
tree.nodes.len(),
tree.depth,
tree.stopper,
loss.iter().sum::<f32>() / loss.len() as f32,
);
}

if tree.stopper != TreeStopper::LossDecrement {
n_low_loss_rounds += 1;
} else {
n_low_loss_rounds = 0;
}

self.trees.push(tree);

(grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha);
loss = calc_loss(y, &yhat, sample_weight, alpha);

if stopping >= stopping_rounds.unwrap_or(STOPPING_ROUNDS) {
info!("Auto stopping since stopping round limit reached.");
break;
Expand All @@ -573,7 +574,7 @@ impl PerpetualBooster {

if self.log_iterations > 0 {
info!(
"Finished training a booster with {0} trees in {1}.",
"Finished training a booster with {0} trees in {1} seconds.",
self.trees.len(),
start.elapsed().as_secs()
);
Expand Down

0 comments on commit 3361a40

Please sign in to comment.