Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

json dump before fit #22

Merged
merged 1 commit into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.6.0"
version = "0.6.1"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ pip install perpetual
To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).

```toml
perpetual = "0.6.0"
perpetual = "0.6.1"
```

## Paper
Expand Down
4 changes: 2 additions & 2 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.6.0"
version = "0.6.1"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.5", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.6.0", path = "../" }
perpetual_rs = {package="perpetual", version = "0.6.1", path = "../" }
numpy = "0.22.0"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
Expand Down
4 changes: 2 additions & 2 deletions python-package/examples/categorical_data_titanic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "py310",
"display_name": "py311",
"language": "python",
"name": "python3"
},
Expand All @@ -278,7 +278,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
"version": "3.11.9"
}
},
"nbformat": 4,
Expand Down
6 changes: 3 additions & 3 deletions python-package/examples/openml.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -105,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -128,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand Down
197 changes: 197 additions & 0 deletions python-package/examples/santander.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "967af9d9",
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
"execution": {
"iopub.execute_input": "2024-10-21T07:01:07.130508Z",
"iopub.status.busy": "2024-10-21T07:01:07.130061Z",
"iopub.status.idle": "2024-10-21T07:01:08.048111Z",
"shell.execute_reply": "2024-10-21T07:01:08.046970Z"
},
"papermill": {
"duration": 0.926499,
"end_time": "2024-10-21T07:01:08.050965",
"exception": false,
"start_time": "2024-10-21T07:01:07.124466",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from perpetual import PerpetualBooster\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.model_selection import KFold\n",
"from autogluon.tabular import TabularPredictor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c257f8fc",
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-21T07:01:28.223537Z",
"iopub.status.busy": "2024-10-21T07:01:28.222764Z",
"iopub.status.idle": "2024-10-21T07:01:34.667262Z",
"shell.execute_reply": "2024-10-21T07:01:34.666013Z"
},
"papermill": {
"duration": 6.453134,
"end_time": "2024-10-21T07:01:34.670004",
"exception": false,
"start_time": "2024-10-21T07:01:28.216870",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"X_train = pd.read_csv('../../resources/santander-train.csv', index_col=0)\n",
"y_train = X_train.pop('TARGET')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0590d0a",
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-21T07:01:34.680697Z",
"iopub.status.busy": "2024-10-21T07:01:34.680290Z",
"iopub.status.idle": "2024-10-21T07:01:34.689412Z",
"shell.execute_reply": "2024-10-21T07:01:34.688210Z"
},
"papermill": {
"duration": 0.017414,
"end_time": "2024-10-21T07:01:34.691792",
"exception": false,
"start_time": "2024-10-21T07:01:34.674378",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "22eba1d7",
"metadata": {},
"outputs": [],
"source": [
"cv = KFold(shuffle=True, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "921f491f",
"metadata": {},
"outputs": [],
"source": [
"scores = []\n",
"for train, test in cv.split(X_train, y_train):\n",
" model = PerpetualBooster(objective=\"LogLoss\")\n",
" model.fit(X_train.iloc[train], y_train.iloc[train], budget=1.0)\n",
" probabilities = model.predict_proba(X_train.iloc[test])\n",
" score = roc_auc_score(y_train.iloc[test], probabilities[:, 1])\n",
" scores.append(score)\n",
" print(model.number_of_trees)\n",
"print(np.mean(scores))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a04e569",
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-21T07:01:34.702317Z",
"iopub.status.busy": "2024-10-21T07:01:34.701880Z",
"iopub.status.idle": "2024-10-21T07:02:04.983918Z",
"shell.execute_reply": "2024-10-21T07:02:04.982720Z"
},
"papermill": {
"duration": 30.294535,
"end_time": "2024-10-21T07:02:04.990727",
"exception": false,
"start_time": "2024-10-21T07:01:34.696192",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"scores = []\n",
"X_train['TARGET'] = y_train\n",
"for train, test in cv.split(X_train, y_train):\n",
" model = TabularPredictor(label=\"TARGET\", verbosity=0)\n",
" model.fit(X_train.iloc[train])\n",
" probabilities = model.predict_proba(X_train.iloc[test])\n",
" score = roc_auc_score(y_train.iloc[test], probabilities.to_numpy()[:, 1])\n",
" print(score)\n",
" scores.append(score)\n",
"print(np.mean(scores))"
]
}
],
"metadata": {
"kaggle": {
"accelerator": "none",
"dataSources": [
{
"databundleVersionId": 860641,
"sourceId": 4986,
"sourceType": "competition"
}
],
"dockerImageVersionId": 30786,
"isGpuEnabled": false,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "py311",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
},
"papermill": {
"default_parameters": {},
"duration": 62.454609,
"end_time": "2024-10-21T07:02:06.520206",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2024-10-21T07:01:04.065597",
"version": "2.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.6.0"
version = "0.6.1"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
14 changes: 14 additions & 0 deletions python-package/python/perpetual/booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,20 @@ def __init__(
self.log_iterations = log_iterations
self.feature_importance_method = feature_importance_method

booster = CratePerpetualBooster(
objective=self.objective,
num_threads=self.num_threads,
monotone_constraints=dict(),
force_children_to_bound_parent=self.force_children_to_bound_parent,
missing=self.missing,
allow_missing_splits=self.allow_missing_splits,
create_missing_branch=self.create_missing_branch,
terminate_missing_features=set(),
missing_node_treatment=self.missing_node_treatment,
log_iterations=self.log_iterations,
)
self.booster = cast(BoosterType, booster)

def fit(
self,
X,
Expand Down
2 changes: 2 additions & 0 deletions python-package/tests/test_booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,8 @@ def test_booster_metadata(
f64_model_path = tmp_path / "modelf64_sl.json"
X, y = X_y
model = PerpetualBooster(objective="SquaredLoss")
save_func(model, f64_model_path)
model.json_dump()
model.fit(X, y)
preds = model.predict(X)
save_func(model, f64_model_path)
Expand Down
4 changes: 2 additions & 2 deletions python-package/tests/test_multi_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ def test_multi_output():
y = np.array(X.pop("Cover_Type"))
X_test = pd.read_csv("../resources/cover_types_test.csv", index_col=False)
y_test = np.array(X_test.pop("Cover_Type"))
model = PerpetualBooster(num_threads=1)
model.fit(X, y)
model = PerpetualBooster()
model.fit(X, y, iteration_limit=40)
pred_test = model.predict(X_test)
proba_test = model.predict_proba(X_test)
log_odds_test = model.predict_log_proba(X_test)
Expand Down
Loading