optional fit params
deadsoul44 committed Nov 13, 2024
1 parent a9ce0c8 commit 7a0c836
Showing 10 changed files with 86 additions and 49 deletions.
10 changes: 5 additions & 5 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.6.2"
version = "0.7.0"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
@@ -21,9 +21,9 @@ codegen-units = 1

[dependencies]
rayon = "1.8"
thiserror = "1.0.65"
thiserror = "2.0.3"
serde_json = { version = "1.0.132", features = ["float_roundtrip"] }
serde = { version = "1.0.213", features = ["derive"] }
serde = { version = "1.0.215", features = ["derive"] }
approx = "0.5"
log = "0.4"
rand = "0.8.5"
@@ -32,8 +32,8 @@ sysinfo = "0.32.0"
[dev-dependencies]
criterion = "0.5"
polars = "0.41"
reqwest = { version = "0.12.8", features = ["blocking"] }
csv = "1.3"
reqwest = { version = "0.12.9", features = ["blocking"] }
csv = "1.3.1"
chrono = "0.4"

[[bench]]
12 changes: 11 additions & 1 deletion README.md
@@ -58,12 +58,22 @@ The package can be installed directly from [pypi](https://pypi.org/project/perpe
pip install perpetual
```

Using [conda-forge](https://anaconda.org/conda-forge/perpetual):

```shell
conda install conda-forge::perpetual
```

To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).

```toml
perpetual = "0.6.2"
perpetual = "0.7.0"
```

## Contribution

Contributions are welcome. Check CONTRIBUTING.md for the guidelines.

## Paper

PerpetualBooster prevents overfitting with a generalization algorithm. A paper explaining how the algorithm works is in progress. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high-level introduction to the algorithm.
10 changes: 5 additions & 5 deletions python-package/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.6.2"
version = "0.7.0"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
@@ -18,10 +18,10 @@ name = "perpetual"
crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.5", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.6.2", path = "../" }
numpy = "0.22.0"
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.0", path = "../" }
numpy = "0.22.1"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
serde = { version = "1.0.210" }
serde = { version = "1.0.215" }
pyo3-log = "0.11"
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.6.2"
version = "0.7.0"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
58 changes: 44 additions & 14 deletions python-package/python/perpetual/booster.py
@@ -48,8 +48,17 @@ def __init__(
missing_node_treatment: str = "None",
log_iterations: int = 0,
feature_importance_method: str = "Gain",
budget: Optional[float] = None,
alpha: Optional[float] = None,
reset: Optional[bool] = None,
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
timeout: Optional[float] = None,
iteration_limit: Optional[int] = None,
memory_limit: Optional[float] = None,
):
"""PerpetualBooster Class, used to generate gradient boosted decision tree ensembles.
"""PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
The following parameters can also be specified in the fit method to override the values in the constructor:
budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
Args:
objective (str, optional): Learning objective function to be used for optimization.
@@ -93,6 +102,19 @@ def __init__(
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations; this information can be accessed directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information, see the example [here](/#logging-output).
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
budget: a positive number for the fitting budget. Increasing this number will
likely result in more boosting rounds and increased predictive power.
The default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: the names or indices of categorical features.
Use `auto` for Polars or Pandas categorical data types.
timeout: optional fit timeout in seconds.
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
The algorithm automatically stops before hitting this limit in most cases.
If you want to experiment with a very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, memory is allocated based on
the available memory and the algorithm's requirements.
Raises:
TypeError: Raised if an invalid dtype is passed.
@@ -146,6 +168,13 @@ def __init__(
self.missing_node_treatment = missing_node_treatment
self.log_iterations = log_iterations
self.feature_importance_method = feature_importance_method
self.budget = budget
self.alpha = alpha
self.reset = reset
self.categorical_features = categorical_features
self.timeout = timeout
self.iteration_limit = iteration_limit
self.memory_limit = memory_limit

booster = CratePerpetualBooster(
objective=self.objective,
@@ -166,13 +195,13 @@ def fit(
X,
y,
sample_weight=None,
budget: float = 1.0,
alpha: Union[float, None] = None,
reset: Union[bool, None] = None,
budget: Optional[float] = None,
alpha: Optional[float] = None,
reset: Optional[bool] = None,
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
timeout: Union[float, None] = None,
iteration_limit: Union[int, None] = None,
memory_limit: Union[float, None] = None,
timeout: Optional[float] = None,
iteration_limit: Optional[int] = None,
memory_limit: Optional[float] = None,
) -> Self:
"""Fit the gradient booster on a provided dataset.
@@ -185,6 +214,7 @@ def fit(
Defaults to None.
budget: a positive number for the fitting budget. Increasing this number will
likely result in more boosting rounds and increased predictive power.
The default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
@@ -198,7 +228,7 @@ def fit(
"""

features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
convert_input_frame(X, categorical_features)
convert_input_frame(X, categorical_features or self.categorical_features)
)
self.n_features_ = cols
self.cat_mapping = cat_mapping
@@ -262,14 +292,14 @@ def fit(
rows=rows,
cols=cols,
y=y_,
budget=budget,
budget=budget or self.budget,
sample_weight=sample_weight_, # type: ignore
alpha=alpha,
reset=reset,
alpha=alpha or self.alpha,
reset=reset or self.reset,
categorical_features=categorical_features_, # type: ignore
timeout=timeout,
iteration_limit=iteration_limit,
memory_limit=memory_limit,
timeout=timeout or self.timeout,
iteration_limit=iteration_limit or self.iteration_limit,
memory_limit=memory_limit or self.memory_limit,
)

return self
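The constructor/fit symmetry added above can be used as follows. This is a minimal usage sketch, assuming the `perpetual` package's public `PerpetualBooster` class and synthetic NumPy data rather than a real dataset:

```python
import numpy as np
from perpetual import PerpetualBooster

# Toy data; any tabular X and target y accepted by fit() would do.
X = np.random.rand(1000, 10)
y = np.random.rand(1000)

# Fit-level parameters can now be stored on the booster itself (new in 0.7.0)...
model = PerpetualBooster(budget=0.5, timeout=60.0)
model.fit(X, y)  # uses budget=0.5 and timeout=60.0 from the constructor

# ...and selectively overridden per call; unspecified parameters still fall
# back to the constructor values via the `param or self.param` pattern above.
model.fit(X, y, budget=1.5)
```

Because the fallback uses `or`, a fit-time argument takes precedence over the stored value whenever it is given.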
4 changes: 2 additions & 2 deletions python-package/src/booster.rs
@@ -135,7 +135,7 @@ impl PerpetualBooster {
rows: usize,
cols: usize,
y: PyReadonlyArray1<f64>,
budget: f32,
budget: Option<f32>,
sample_weight: Option<PyReadonlyArray1<f64>>,
alpha: Option<f32>,
reset: Option<bool>,
@@ -158,7 +158,7 @@
match self.booster.fit(
&data,
y,
budget,
budget.unwrap_or(1.0),
sample_weight_,
alpha,
reset,
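On the binding side, `budget` is now optional and defaults to 1.0 via `unwrap_or`. Combined with the `budget or self.budget` fallback in `booster.py`, the effective value resolves as sketched below (plain Python; the helper is illustrative only, not library code):

```python
def effective_budget(fit_budget=None, init_budget=None):
    # fit-time argument -> constructor value -> Rust-side default of 1.0
    return fit_budget or init_budget or 1.0

assert effective_budget() == 1.0                 # neither given: unwrap_or(1.0) applies
assert effective_budget(init_budget=0.5) == 0.5  # constructor value is used
assert effective_budget(1.5, 0.5) == 1.5         # explicit fit argument wins
```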
4 changes: 2 additions & 2 deletions python-package/src/multi_output.rs
@@ -154,7 +154,7 @@ impl MultiOutputBooster {
rows: usize,
cols: usize,
y: PyReadonlyArray1<f64>,
budget: f32,
budget: Option<f32>,
sample_weight: Option<PyReadonlyArray1<f64>>,
alpha: Option<f32>,
reset: Option<bool>,
@@ -180,7 +180,7 @@
match self.booster.fit(
&data,
&y_data,
budget,
budget.unwrap_or(1.0),
sample_weight_,
alpha,
reset,
8 changes: 4 additions & 4 deletions src/bin.rs
@@ -116,8 +116,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell<Bin>], is_const_hess:
} else if b2.num == 0 {
return Ordering::Greater;
}
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.h_folded.unwrap().iter().sum::<f32>();
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.h_folded.unwrap().iter().sum::<f32>();
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.counts.iter().sum::<usize>() as f32;
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.counts.iter().sum::<usize>() as f32;
div2.partial_cmp(&div1).unwrap_or(Ordering::Less)
});
} else {
@@ -129,8 +129,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell<Bin>], is_const_hess:
} else if b2.num == 0 {
return Ordering::Greater;
}
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.counts.iter().sum::<usize>() as f32;
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.counts.iter().sum::<usize>() as f32;
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.h_folded.unwrap().iter().sum::<f32>();
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.h_folded.unwrap().iter().sum::<f32>();
div2.partial_cmp(&div1).unwrap_or(Ordering::Less)
});
}
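The swap above changes which statistic orders the categorical bins: the summed gradient divided by the row count when the hessian is constant, and divided by the summed hessian otherwise. A simplified sketch of that ordering (plain Python with illustrative field names, not the crate's `Bin` type, and assuming the constant-hessian branch is the one that divides by counts, as in the fixed code):

```python
def bin_stat(b, is_const_hess):
    g = sum(b["g_folded"])
    if is_const_hess:
        return g / b["count"]          # constant hessian: mean gradient per row
    return g / sum(b["h_folded"])      # otherwise: Newton-style g / h ratio

def sort_cat_bins(bins, is_const_hess):
    # Descending order, matching `div2.partial_cmp(&div1)` in the Rust code.
    return sorted(bins, key=lambda b: bin_stat(b, is_const_hess), reverse=True)

bins = [
    {"g_folded": [0.5, 0.5], "h_folded": [1.0, 1.0], "count": 2},
    {"g_folded": [3.0, 1.0], "h_folded": [2.0, 2.0], "count": 2},
]
print([bin_stat(b, True) for b in sort_cat_bins(bins, True)])  # [2.0, 0.5]
```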
24 changes: 11 additions & 13 deletions src/booster.rs
@@ -1,8 +1,8 @@
use crate::bin::Bin;
use crate::binning::bin_matrix;
use crate::constants::{
FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT,
ROW_COLUMN_RATIO_LIMIT, STOPPING_ROUNDS, TIMEOUT_FACTOR,
FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, STOPPING_ROUNDS,
TIMEOUT_FACTOR,
};
use crate::constraints::ConstraintMap;
use crate::data::Matrix;
@@ -254,7 +254,7 @@ impl PerpetualBooster {
/// * `categorical_features` - categorical features.
/// * `timeout` - fit timeout limit in seconds.
/// * `iteration_limit` - optional limit for the number of boosting rounds.
/// * `memory_limit` - optional limit for memory allocation.
/// * `memory_limit` - optional limit for memory allocation.
pub fn fit(
&mut self,
data: &Matrix<f64>,
@@ -401,16 +401,14 @@ impl PerpetualBooster {
let mut rng = StdRng::seed_from_u64(self.seed);

// Column sampling is only applied when (n_rows / n_columns) < ROW_COLUMN_RATIO_LIMIT.
// ROW_COLUMN_RATIO_LIMIT is set to 100 by default.
let colsample_bytree = f64::min(
1.0,
(data.rows as f64 / data.cols as f64) / ROW_COLUMN_RATIO_LIMIT as f64,
);

let col_amount = usize::max(
usize::min(MIN_COL_AMOUNT, col_index.len()),
((col_index.len() as f64) * colsample_bytree).floor() as usize,
);
// ROW_COLUMN_RATIO_LIMIT is calculated using budget.
// budget = 1.0 -> ROW_COLUMN_RATIO_LIMIT = 100
// budget = 2.0 -> ROW_COLUMN_RATIO_LIMIT = 10
let row_column_ratio_limit = 10.0_f32.powf(-budget) * 1000.0;
let colsample_bytree = (data.rows as f32 / data.cols as f32) / row_column_ratio_limit;

let col_amount = (((col_index.len() as f32) * colsample_bytree).floor() as usize)
.clamp(usize::min(MIN_COL_AMOUNT, col_index.len()), col_index.len());

let mem_bin = mem::size_of::<Bin>();
let mem_hist: usize;
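The comments above replace the fixed `ROW_COLUMN_RATIO_LIMIT` constant with a budget-dependent limit. A small worked sketch of the same arithmetic (plain Python, illustrative only; the real code clamps against `col_index.len()` and the `MIN_COL_AMOUNT` constant):

```python
MIN_COL_AMOUNT = 40  # value in src/constants.rs after this commit

def sampled_col_amount(n_rows, n_cols, budget):
    # budget = 1.0 -> limit = 100, budget = 2.0 -> limit = 10
    row_column_ratio_limit = 10.0 ** (-budget) * 1000.0
    colsample_bytree = (n_rows / n_cols) / row_column_ratio_limit
    amount = int(n_cols * colsample_bytree)
    # clamp between min(MIN_COL_AMOUNT, n_cols) and n_cols
    return max(min(MIN_COL_AMOUNT, n_cols), min(amount, n_cols))

print(sampled_col_amount(10_000, 500, budget=1.0))  # 100: limit 100, row/col ratio 20 -> 20% of columns
print(sampled_col_amount(10_000, 500, budget=2.0))  # 500: limit 10, row/col ratio 20 -> all columns kept
```

A higher budget therefore disables column subsampling sooner, at the cost of more computation per boosting round.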
3 changes: 1 addition & 2 deletions src/constants.rs
@@ -3,7 +3,6 @@ pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9;
pub const N_NODES_ALLOC_LIMIT: usize = 3000;
pub const ITER_LIMIT: usize = 1000;
pub const GENERALIZATION_THRESHOLD: f32 = 0.99;
pub const ROW_COLUMN_RATIO_LIMIT: usize = 100;
pub const MIN_COL_AMOUNT: usize = 30;
pub const MIN_COL_AMOUNT: usize = 40;
pub const HESSIAN_EPS: f32 = 1e-3;
pub const TIMEOUT_FACTOR: f32 = 0.95;
