Merge pull request #22 from perpetual-ml/json_dump

json dump before fit
perpetual-ml · Oct 30, 2024 · e4a4ad2
2 parents ccfdd49 + 5d6b545
commit e4a4ad2
Show file tree

Hide file tree

Showing 10 changed files with 225 additions and 12 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "perpetual"
-version = "0.6.0"
+version = "0.6.1"
 edition = "2021"
 authors = ["Mutlu Simsek <[email protected]>"]
 homepage = "https://perpetual-ml.com"

diff --git a/README.md b/README.md
@@ -61,7 +61,7 @@ pip install perpetual
 To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
 
 ```toml
-perpetual = "0.6.0"
+perpetual = "0.6.1"
 ```
 
 ## Paper

diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-perpetual"
-version = "0.6.0"
+version = "0.6.1"
 edition = "2021"
 authors = ["Mutlu Simsek <[email protected]>"]
 homepage = "https://perpetual-ml.com"
@@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]
 
 [dependencies]
 pyo3 = { version = "0.22.5", features = ["extension-module"] }
-perpetual_rs = {package="perpetual", version = "0.6.0", path = "../" }
+perpetual_rs = {package="perpetual", version = "0.6.1", path = "../" }
 numpy = "0.22.0"
 ndarray = "0.16.1"
 serde_plain = { version = "1.0" }

diff --git a/python-package/examples/categorical_data_titanic.ipynb b/python-package/examples/categorical_data_titanic.ipynb
@@ -264,7 +264,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "py310",
+   "display_name": "py311",
    "language": "python",
    "name": "python3"
   },
@@ -278,7 +278,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.20"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,

diff --git a/python-package/examples/openml.ipynb b/python-package/examples/openml.ipynb
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -128,7 +128,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [

diff --git a/python-package/examples/santander.ipynb b/python-package/examples/santander.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "967af9d9",
+   "metadata": {
+    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+    "execution": {
+     "iopub.execute_input": "2024-10-21T07:01:07.130508Z",
+     "iopub.status.busy": "2024-10-21T07:01:07.130061Z",
+     "iopub.status.idle": "2024-10-21T07:01:08.048111Z",
+     "shell.execute_reply": "2024-10-21T07:01:08.046970Z"
+    },
+    "papermill": {
+     "duration": 0.926499,
+     "end_time": "2024-10-21T07:01:08.050965",
+     "exception": false,
+     "start_time": "2024-10-21T07:01:07.124466",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from perpetual import PerpetualBooster\n",
+    "from sklearn.metrics import roc_auc_score\n",
+    "from sklearn.model_selection import KFold\n",
+    "from autogluon.tabular import TabularPredictor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c257f8fc",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-10-21T07:01:28.223537Z",
+     "iopub.status.busy": "2024-10-21T07:01:28.222764Z",
+     "iopub.status.idle": "2024-10-21T07:01:34.667262Z",
+     "shell.execute_reply": "2024-10-21T07:01:34.666013Z"
+    },
+    "papermill": {
+     "duration": 6.453134,
+     "end_time": "2024-10-21T07:01:34.670004",
+     "exception": false,
+     "start_time": "2024-10-21T07:01:28.216870",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "X_train = pd.read_csv('../../resources/santander-train.csv', index_col=0)\n",
+    "y_train = X_train.pop('TARGET')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0590d0a",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-10-21T07:01:34.680697Z",
+     "iopub.status.busy": "2024-10-21T07:01:34.680290Z",
+     "iopub.status.idle": "2024-10-21T07:01:34.689412Z",
+     "shell.execute_reply": "2024-10-21T07:01:34.688210Z"
+    },
+    "papermill": {
+     "duration": 0.017414,
+     "end_time": "2024-10-21T07:01:34.691792",
+     "exception": false,
+     "start_time": "2024-10-21T07:01:34.674378",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "X_train.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "22eba1d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cv = KFold(shuffle=True, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "921f491f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scores = []\n",
+    "for train, test in cv.split(X_train, y_train):\n",
+    "    model = PerpetualBooster(objective=\"LogLoss\")\n",
+    "    model.fit(X_train.iloc[train], y_train.iloc[train], budget=1.0)\n",
+    "    probabilities = model.predict_proba(X_train.iloc[test])\n",
+    "    score = roc_auc_score(y_train.iloc[test], probabilities[:, 1])\n",
+    "    scores.append(score)\n",
+    "    print(model.number_of_trees)\n",
+    "print(np.mean(scores))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a04e569",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-10-21T07:01:34.702317Z",
+     "iopub.status.busy": "2024-10-21T07:01:34.701880Z",
+     "iopub.status.idle": "2024-10-21T07:02:04.983918Z",
+     "shell.execute_reply": "2024-10-21T07:02:04.982720Z"
+    },
+    "papermill": {
+     "duration": 30.294535,
+     "end_time": "2024-10-21T07:02:04.990727",
+     "exception": false,
+     "start_time": "2024-10-21T07:01:34.696192",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "scores = []\n",
+    "X_train['TARGET'] = y_train\n",
+    "for train, test in cv.split(X_train, y_train):\n",
+    "    model = TabularPredictor(label=\"TARGET\", verbosity=0)\n",
+    "    model.fit(X_train.iloc[train])\n",
+    "    probabilities = model.predict_proba(X_train.iloc[test])\n",
+    "    score = roc_auc_score(y_train.iloc[test], probabilities.to_numpy()[:, 1])\n",
+    "    print(score)\n",
+    "    scores.append(score)\n",
+    "print(np.mean(scores))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kaggle": {
+   "accelerator": "none",
+   "dataSources": [
+    {
+     "databundleVersionId": 860641,
+     "sourceId": 4986,
+     "sourceType": "competition"
+    }
+   ],
+   "dockerImageVersionId": 30786,
+   "isGpuEnabled": false,
+   "isInternetEnabled": true,
+   "language": "python",
+   "sourceType": "notebook"
+  },
+  "kernelspec": {
+   "display_name": "py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  },
+  "papermill": {
+   "default_parameters": {},
+   "duration": 62.454609,
+   "end_time": "2024-10-21T07:02:06.520206",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "__notebook__.ipynb",
+   "output_path": "__notebook__.ipynb",
+   "parameters": {},
+   "start_time": "2024-10-21T07:01:04.065597",
+   "version": "2.6.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "perpetual"
-version = "0.6.0"
+version = "0.6.1"
 description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
 license = { file = "LICENSE" }
 keywords = [

diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py
@@ -147,6 +147,20 @@ def __init__(
         self.log_iterations = log_iterations
         self.feature_importance_method = feature_importance_method
 
+        booster = CratePerpetualBooster(
+            objective=self.objective,
+            num_threads=self.num_threads,
+            monotone_constraints=dict(),
+            force_children_to_bound_parent=self.force_children_to_bound_parent,
+            missing=self.missing,
+            allow_missing_splits=self.allow_missing_splits,
+            create_missing_branch=self.create_missing_branch,
+            terminate_missing_features=set(),
+            missing_node_treatment=self.missing_node_treatment,
+            log_iterations=self.log_iterations,
+        )
+        self.booster = cast(BoosterType, booster)
+
     def fit(
         self,
         X,

diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py
@@ -625,6 +625,8 @@ def test_booster_metadata(
         f64_model_path = tmp_path / "modelf64_sl.json"
         X, y = X_y
         model = PerpetualBooster(objective="SquaredLoss")
+        save_func(model, f64_model_path)
+        model.json_dump()
         model.fit(X, y)
         preds = model.predict(X)
         save_func(model, f64_model_path)

diff --git a/python-package/tests/test_multi_output.py b/python-package/tests/test_multi_output.py
@@ -9,8 +9,8 @@ def test_multi_output():
     y = np.array(X.pop("Cover_Type"))
     X_test = pd.read_csv("../resources/cover_types_test.csv", index_col=False)
     y_test = np.array(X_test.pop("Cover_Type"))
-    model = PerpetualBooster(num_threads=1)
-    model.fit(X, y)
+    model = PerpetualBooster()
+    model.fit(X, y, iteration_limit=40)
     pred_test = model.predict(X_test)
     proba_test = model.predict_proba(X_test)
     log_odds_test = model.predict_log_proba(X_test)