Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optional fit params #25

Merged
merged 1 commit into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.6.2"
version = "0.7.0"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -21,9 +21,9 @@ codegen-units = 1

[dependencies]
rayon = "1.8"
thiserror = "1.0.65"
thiserror = "2.0.3"
serde_json = { version = "1.0.132", features = ["float_roundtrip"] }
serde = { version = "1.0.213", features = ["derive"] }
serde = { version = "1.0.215", features = ["derive"] }
approx = "0.5"
log = "0.4"
rand = "0.8.5"
Expand All @@ -32,8 +32,8 @@ sysinfo = "0.32.0"
[dev-dependencies]
criterion = "0.5"
polars = "0.41"
reqwest = { version = "0.12.8", features = ["blocking"] }
csv = "1.3"
reqwest = { version = "0.12.9", features = ["blocking"] }
csv = "1.3.1"
chrono = "0.4"

[[bench]]
Expand Down
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,22 @@ The package can be installed directly from [pypi](https://pypi.org/project/perpe
pip install perpetual
```

Using [conda-forge](https://anaconda.org/conda-forge/perpetual):

```shell
conda install conda-forge::perpetual
```

To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).

```toml
perpetual = "0.6.2"
perpetual = "0.7.0"
```

## Contribution

Contributions are welcome. See CONTRIBUTING.md for the guidelines.

## Paper

PerpetualBooster prevents overfitting with a generalization algorithm. A paper explaining how the algorithm works is in progress. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high-level introduction to the algorithm.
10 changes: 5 additions & 5 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.6.2"
version = "0.7.0"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -18,10 +18,10 @@ name = "perpetual"
crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.5", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.6.2", path = "../" }
numpy = "0.22.0"
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.0", path = "../" }
numpy = "0.22.1"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
serde = { version = "1.0.210" }
serde = { version = "1.0.215" }
pyo3-log = "0.11"
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.6.2"
version = "0.7.0"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
58 changes: 44 additions & 14 deletions python-package/python/perpetual/booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,17 @@ def __init__(
missing_node_treatment: str = "None",
log_iterations: int = 0,
feature_importance_method: str = "Gain",
budget: Optional[float] = None,
alpha: Optional[float] = None,
reset: Optional[bool] = None,
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
timeout: Optional[float] = None,
iteration_limit: Optional[int] = None,
memory_limit: Optional[float] = None,
):
"""PerpetualBooster Class, used to generate gradient boosted decision tree ensembles.
"""PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
The following parameters can also be specified in the fit method to override the values in the constructor:
budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
Args:
objective (str, optional): Learning objective function to be used for optimization.
Expand Down Expand Up @@ -93,6 +102,19 @@ def __init__(
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations. The info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
budget: a positive number for fitting budget. Increasing this number will
likely result in more boosting rounds and increased predictive power.
Default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
`auto` for Polars or Pandas categorical data type.
timeout: optional fit timeout in seconds
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
The algorithm automatically stops for most of the cases before hitting this limit.
If you want to experiment with very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
available memory and the algorithm requirements.
Raises:
TypeError: Raised if an invalid dtype is passed.
Expand Down Expand Up @@ -146,6 +168,13 @@ def __init__(
self.missing_node_treatment = missing_node_treatment
self.log_iterations = log_iterations
self.feature_importance_method = feature_importance_method
self.budget = budget
self.alpha = alpha
self.reset = reset
self.categorical_features = categorical_features
self.timeout = timeout
self.iteration_limit = iteration_limit
self.memory_limit = memory_limit

booster = CratePerpetualBooster(
objective=self.objective,
Expand All @@ -166,13 +195,13 @@ def fit(
X,
y,
sample_weight=None,
budget: float = 1.0,
alpha: Union[float, None] = None,
reset: Union[bool, None] = None,
budget: Optional[float] = None,
alpha: Optional[float] = None,
reset: Optional[bool] = None,
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
timeout: Union[float, None] = None,
iteration_limit: Union[int, None] = None,
memory_limit: Union[float, None] = None,
timeout: Optional[float] = None,
iteration_limit: Optional[int] = None,
memory_limit: Optional[float] = None,
) -> Self:
"""Fit the gradient booster on a provided dataset.
Expand All @@ -185,6 +214,7 @@ def fit(
Defaults to None.
budget: a positive number for fitting budget. Increasing this number will
likely result in more boosting rounds and increased predictive power.
Default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
Expand All @@ -198,7 +228,7 @@ def fit(
"""

features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
convert_input_frame(X, categorical_features)
convert_input_frame(X, categorical_features or self.categorical_features)
)
self.n_features_ = cols
self.cat_mapping = cat_mapping
Expand Down Expand Up @@ -262,14 +292,14 @@ def fit(
rows=rows,
cols=cols,
y=y_,
budget=budget,
budget=budget or self.budget,
sample_weight=sample_weight_, # type: ignore
alpha=alpha,
reset=reset,
alpha=alpha or self.alpha,
reset=reset or self.reset,
categorical_features=categorical_features_, # type: ignore
timeout=timeout,
iteration_limit=iteration_limit,
memory_limit=memory_limit,
timeout=timeout or self.timeout,
iteration_limit=iteration_limit or self.iteration_limit,
memory_limit=memory_limit or self.memory_limit,
)

return self
Expand Down
4 changes: 2 additions & 2 deletions python-package/src/booster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ impl PerpetualBooster {
rows: usize,
cols: usize,
y: PyReadonlyArray1<f64>,
budget: f32,
budget: Option<f32>,
sample_weight: Option<PyReadonlyArray1<f64>>,
alpha: Option<f32>,
reset: Option<bool>,
Expand All @@ -158,7 +158,7 @@ impl PerpetualBooster {
match self.booster.fit(
&data,
y,
budget,
budget.unwrap_or(1.0),
sample_weight_,
alpha,
reset,
Expand Down
4 changes: 2 additions & 2 deletions python-package/src/multi_output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ impl MultiOutputBooster {
rows: usize,
cols: usize,
y: PyReadonlyArray1<f64>,
budget: f32,
budget: Option<f32>,
sample_weight: Option<PyReadonlyArray1<f64>>,
alpha: Option<f32>,
reset: Option<bool>,
Expand All @@ -180,7 +180,7 @@ impl MultiOutputBooster {
match self.booster.fit(
&data,
&y_data,
budget,
budget.unwrap_or(1.0),
sample_weight_,
alpha,
reset,
Expand Down
8 changes: 4 additions & 4 deletions src/bin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell<Bin>], is_const_hess:
} else if b2.num == 0 {
return Ordering::Greater;
}
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.h_folded.unwrap().iter().sum::<f32>();
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.h_folded.unwrap().iter().sum::<f32>();
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.counts.iter().sum::<usize>() as f32;
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.counts.iter().sum::<usize>() as f32;
div2.partial_cmp(&div1).unwrap_or(Ordering::Less)
});
} else {
Expand All @@ -129,8 +129,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell<Bin>], is_const_hess:
} else if b2.num == 0 {
return Ordering::Greater;
}
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.counts.iter().sum::<usize>() as f32;
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.counts.iter().sum::<usize>() as f32;
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.h_folded.unwrap().iter().sum::<f32>();
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.h_folded.unwrap().iter().sum::<f32>();
div2.partial_cmp(&div1).unwrap_or(Ordering::Less)
});
}
Expand Down
24 changes: 11 additions & 13 deletions src/booster.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use crate::bin::Bin;
use crate::binning::bin_matrix;
use crate::constants::{
FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT,
ROW_COLUMN_RATIO_LIMIT, STOPPING_ROUNDS, TIMEOUT_FACTOR,
FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, STOPPING_ROUNDS,
TIMEOUT_FACTOR,
};
use crate::constraints::ConstraintMap;
use crate::data::Matrix;
Expand Down Expand Up @@ -254,7 +254,7 @@ impl PerpetualBooster {
/// * `categorical_features` - categorical features.
/// * `timeout` - fit timeout limit in seconds.
/// * `iteration_limit` - optional limit for the number of boosting rounds.
/// * `memory_limit` - optional limit for memory allocation.
/// * `memory_limit` - optional limit for memory allocation.
pub fn fit(
&mut self,
data: &Matrix<f64>,
Expand Down Expand Up @@ -401,16 +401,14 @@ impl PerpetualBooster {
let mut rng = StdRng::seed_from_u64(self.seed);

// Column sampling is only applied when (n_rows / n_columns) < ROW_COLUMN_RATIO_LIMIT.
// ROW_COLUMN_RATIO_LIMIT is set to 100 by default.
let colsample_bytree = f64::min(
1.0,
(data.rows as f64 / data.cols as f64) / ROW_COLUMN_RATIO_LIMIT as f64,
);

let col_amount = usize::max(
usize::min(MIN_COL_AMOUNT, col_index.len()),
((col_index.len() as f64) * colsample_bytree).floor() as usize,
);
// ROW_COLUMN_RATIO_LIMIT is calculated using budget.
// budget = 1.0 -> ROW_COLUMN_RATIO_LIMIT = 100
// budget = 2.0 -> ROW_COLUMN_RATIO_LIMIT = 10
let row_column_ratio_limit = 10.0_f32.powf(-budget) * 1000.0;
let colsample_bytree = (data.rows as f32 / data.cols as f32) / row_column_ratio_limit;

let col_amount = (((col_index.len() as f32) * colsample_bytree).floor() as usize)
.clamp(usize::min(MIN_COL_AMOUNT, col_index.len()), col_index.len());

let mem_bin = mem::size_of::<Bin>();
let mem_hist: usize;
Expand Down
3 changes: 1 addition & 2 deletions src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9;
pub const N_NODES_ALLOC_LIMIT: usize = 3000;
pub const ITER_LIMIT: usize = 1000;
pub const GENERALIZATION_THRESHOLD: f32 = 0.99;
pub const ROW_COLUMN_RATIO_LIMIT: usize = 100;
pub const MIN_COL_AMOUNT: usize = 30;
pub const MIN_COL_AMOUNT: usize = 40;
pub const HESSIAN_EPS: f32 = 1e-3;
pub const TIMEOUT_FACTOR: f32 = 0.95;
Loading