diff --git a/Cargo.toml b/Cargo.toml index 31cdd9d..60934b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.6.0" +version = "0.6.1" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" diff --git a/README.md b/README.md index 435e69f..abd816d 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ pip install perpetual To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual). ```toml -perpetual = "0.6.0" +perpetual = "0.6.1" ``` ## Paper diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index e2812b7..6f5dab6 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.6.0" +version = "0.6.1" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.22.5", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.6.0", path = "../" } +perpetual_rs = {package="perpetual", version = "0.6.1", path = "../" } numpy = "0.22.0" ndarray = "0.16.1" serde_plain = { version = "1.0" } diff --git a/python-package/examples/categorical_data_titanic.ipynb b/python-package/examples/categorical_data_titanic.ipynb index 2c1f576..d79b778 100644 --- a/python-package/examples/categorical_data_titanic.ipynb +++ b/python-package/examples/categorical_data_titanic.ipynb @@ -264,7 +264,7 @@ ], "metadata": { "kernelspec": { - "display_name": "py310", + "display_name": "py311", "language": "python", "name": "python3" }, @@ -278,7 +278,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.20" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/python-package/examples/openml.ipynb b/python-package/examples/openml.ipynb index f2c4441..604aa55 100644 --- a/python-package/examples/openml.ipynb +++ b/python-package/examples/openml.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -105,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ diff --git a/python-package/examples/santander.ipynb b/python-package/examples/santander.ipynb new file mode 100644 index 0000000..2b90f75 --- /dev/null +++ b/python-package/examples/santander.ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "967af9d9", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "execution": { + "iopub.execute_input": "2024-10-21T07:01:07.130508Z", + "iopub.status.busy": "2024-10-21T07:01:07.130061Z", + "iopub.status.idle": "2024-10-21T07:01:08.048111Z", + "shell.execute_reply": "2024-10-21T07:01:08.046970Z" + }, + "papermill": { + "duration": 0.926499, + "end_time": "2024-10-21T07:01:08.050965", + "exception": false, + "start_time": "2024-10-21T07:01:07.124466", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from perpetual import PerpetualBooster\n", + "from sklearn.metrics import roc_auc_score\n", + "from sklearn.model_selection import KFold\n", + "from autogluon.tabular import TabularPredictor" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c257f8fc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-10-21T07:01:28.223537Z", + "iopub.status.busy": "2024-10-21T07:01:28.222764Z", + "iopub.status.idle": "2024-10-21T07:01:34.667262Z", + "shell.execute_reply": "2024-10-21T07:01:34.666013Z" + }, + "papermill": { + "duration": 6.453134, + "end_time": "2024-10-21T07:01:34.670004", + "exception": false, + "start_time": "2024-10-21T07:01:28.216870", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "X_train = pd.read_csv('../../resources/santander-train.csv', index_col=0)\n", + "y_train = X_train.pop('TARGET')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0590d0a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-10-21T07:01:34.680697Z", + "iopub.status.busy": "2024-10-21T07:01:34.680290Z", + "iopub.status.idle": "2024-10-21T07:01:34.689412Z", + "shell.execute_reply": "2024-10-21T07:01:34.688210Z" + }, + "papermill": { + "duration": 0.017414, + "end_time": "2024-10-21T07:01:34.691792", + "exception": false, + "start_time": "2024-10-21T07:01:34.674378", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "22eba1d7", + "metadata": {}, + "outputs": [], + "source": [ + "cv = KFold(shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921f491f", + "metadata": {}, + "outputs": [], + "source": [ + "scores = []\n", + "for train, test in cv.split(X_train, y_train):\n", + " model = PerpetualBooster(objective=\"LogLoss\")\n", + " model.fit(X_train.iloc[train], y_train.iloc[train], budget=1.0)\n", + " probabilities = model.predict_proba(X_train.iloc[test])\n", + " score = roc_auc_score(y_train.iloc[test], probabilities[:, 1])\n", + " scores.append(score)\n", + " print(model.number_of_trees)\n", + "print(np.mean(scores))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a04e569", + "metadata": { + "execution": { + "iopub.execute_input": "2024-10-21T07:01:34.702317Z", + "iopub.status.busy": "2024-10-21T07:01:34.701880Z", + "iopub.status.idle": "2024-10-21T07:02:04.983918Z", + "shell.execute_reply": "2024-10-21T07:02:04.982720Z" + }, + "papermill": { + "duration": 30.294535, + "end_time": "2024-10-21T07:02:04.990727", + "exception": false, + "start_time": "2024-10-21T07:01:34.696192", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "scores = []\n", + "X_train['TARGET'] = y_train\n", + "for train, test in cv.split(X_train, y_train):\n", + " model = TabularPredictor(label=\"TARGET\", verbosity=0)\n", + " model.fit(X_train.iloc[train])\n", + " probabilities = model.predict_proba(X_train.iloc[test])\n", + " score = roc_auc_score(y_train.iloc[test], probabilities.to_numpy()[:, 1])\n", + " print(score)\n", + " scores.append(score)\n", + "print(np.mean(scores))" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "databundleVersionId": 860641, + "sourceId": 4986, + "sourceType": "competition" + } + ], + "dockerImageVersionId": 30786, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "papermill": { + "default_parameters": {}, + "duration": 62.454609, + "end_time": "2024-10-21T07:02:06.520206", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-10-21T07:01:04.065597", + "version": "2.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index d311d90..41ad96b 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.6.0" +version = "0.6.1" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py index 82d8963..d0cd0f8 100644 --- a/python-package/python/perpetual/booster.py +++ b/python-package/python/perpetual/booster.py @@ -147,6 +147,20 @@ def __init__( self.log_iterations = log_iterations self.feature_importance_method = feature_importance_method + booster = CratePerpetualBooster( + objective=self.objective, + num_threads=self.num_threads, + monotone_constraints=dict(), + force_children_to_bound_parent=self.force_children_to_bound_parent, + missing=self.missing, + allow_missing_splits=self.allow_missing_splits, + create_missing_branch=self.create_missing_branch, + terminate_missing_features=set(), + missing_node_treatment=self.missing_node_treatment, + log_iterations=self.log_iterations, + ) + self.booster = cast(BoosterType, booster) + def fit( self, X, diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py index a747feb..e39f3a7 100644 --- a/python-package/tests/test_booster.py +++ b/python-package/tests/test_booster.py @@ -625,6 +625,8 @@ def test_booster_metadata( f64_model_path = tmp_path / "modelf64_sl.json" X, y = X_y model = PerpetualBooster(objective="SquaredLoss") + save_func(model, f64_model_path) + model.json_dump() model.fit(X, y) preds = model.predict(X) save_func(model, f64_model_path) diff --git a/python-package/tests/test_multi_output.py b/python-package/tests/test_multi_output.py index 7009047..85738b8 100644 --- a/python-package/tests/test_multi_output.py +++ b/python-package/tests/test_multi_output.py @@ -9,8 +9,8 @@ def test_multi_output(): y = np.array(X.pop("Cover_Type")) X_test = pd.read_csv("../resources/cover_types_test.csv", index_col=False) y_test = np.array(X_test.pop("Cover_Type")) - model = PerpetualBooster(num_threads=1) - model.fit(X, y) + model = PerpetualBooster() + model.fit(X, y, iteration_limit=40) pred_test = model.predict(X_test) proba_test = model.predict_proba(X_test) log_odds_test = model.predict_log_proba(X_test)