optional fit params
deadsoul44 committed Nov 13, 2024
1 parent a9ce0c8 commit 7a0c836
Showing 10 changed files with 86 additions and 49 deletions.
10 changes: 5 additions & 5 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.6.2"
version = "0.7.0"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
@@ -21,9 +21,9 @@ codegen-units = 1

[dependencies]
rayon = "1.8"
thiserror = "1.0.65"
thiserror = "2.0.3"
serde_json = { version = "1.0.132", features = ["float_roundtrip"] }
serde = { version = "1.0.213", features = ["derive"] }
serde = { version = "1.0.215", features = ["derive"] }
approx = "0.5"
log = "0.4"
rand = "0.8.5"
@@ -32,8 +32,8 @@ sysinfo = "0.32.0"
[dev-dependencies]
criterion = "0.5"
polars = "0.41"
reqwest = { version = "0.12.8", features = ["blocking"] }
csv = "1.3"
reqwest = { version = "0.12.9", features = ["blocking"] }
csv = "1.3.1"
chrono = "0.4"

[[bench]]
12 changes: 11 additions & 1 deletion README.md
@@ -58,12 +58,22 @@ The package can be installed directly from [pypi](https://pypi.org/project/perpe
pip install perpetual
```

Using [conda-forge](https://anaconda.org/conda-forge/perpetual):

```shell
conda install conda-forge::perpetual
```

To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).

```toml
perpetual = "0.6.2"
perpetual = "0.7.0"
```

## Contribution

Contributions are welcome. Check CONTRIBUTING.md for the guidelines.

## Paper

PerpetualBooster prevents overfitting with a generalization algorithm. A paper explaining how the algorithm works is in progress. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high-level introduction to the algorithm.
10 changes: 5 additions & 5 deletions python-package/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.6.2"
version = "0.7.0"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
@@ -18,10 +18,10 @@ name = "perpetual"
crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.5", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.6.2", path = "../" }
numpy = "0.22.0"
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.0", path = "../" }
numpy = "0.22.1"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
serde = { version = "1.0.210" }
serde = { version = "1.0.215" }
pyo3-log = "0.11"
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.6.2"
version = "0.7.0"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
58 changes: 44 additions & 14 deletions python-package/python/perpetual/booster.py
@@ -48,8 +48,17 @@ def __init__(
missing_node_treatment: str = "None",
log_iterations: int = 0,
feature_importance_method: str = "Gain",
budget: Optional[float] = None,
alpha: Optional[float] = None,
reset: Optional[bool] = None,
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
timeout: Optional[float] = None,
iteration_limit: Optional[int] = None,
memory_limit: Optional[float] = None,
):
"""PerpetualBooster Class, used to generate gradient boosted decision tree ensembles.
"""PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
The following parameters can also be specified in the fit method to override the values in the constructor:
budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
Args:
objective (str, optional): Learning objective function to be used for optimization.
@@ -93,6 +102,19 @@ def __init__(
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations; this information can be accessed directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information, see the example [here](/#logging-output).
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
budget: a positive number for the fitting budget. Increasing this number will
likely result in more boosting rounds and increased predictive power.
The default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: the names or indices of categorical features.
Use `auto` for Polars or Pandas categorical data types.
timeout: optional fit timeout in seconds.
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
The algorithm automatically stops before hitting this limit in most cases.
If you want to experiment with a very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, memory is allocated based on
the available memory and the algorithm's requirements.
Raises:
TypeError: Raised if an invalid dtype is passed.
@@ -146,6 +168,13 @@ def __init__(
self.missing_node_treatment = missing_node_treatment
self.log_iterations = log_iterations
self.feature_importance_method = feature_importance_method
self.budget = budget
self.alpha = alpha
self.reset = reset
self.categorical_features = categorical_features
self.timeout = timeout
self.iteration_limit = iteration_limit
self.memory_limit = memory_limit

booster = CratePerpetualBooster(
objective=self.objective,
@@ -166,13 +195,13 @@ def fit(
X,
y,
sample_weight=None,
budget: float = 1.0,
alpha: Union[float, None] = None,
reset: Union[bool, None] = None,
budget: Optional[float] = None,
alpha: Optional[float] = None,
reset: Optional[bool] = None,
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
timeout: Union[float, None] = None,
iteration_limit: Union[int, None] = None,
memory_limit: Union[float, None] = None,
timeout: Optional[float] = None,
iteration_limit: Optional[int] = None,
memory_limit: Optional[float] = None,
) -> Self:
"""Fit the gradient booster on a provided dataset.
@@ -185,6 +214,7 @@ def fit(
Defaults to None.
budget: a positive number for the fitting budget. Increasing this number will
likely result in more boosting rounds and increased predictive power.
The default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
@@ -198,7 +228,7 @@ def fit(
"""

features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
convert_input_frame(X, categorical_features)
convert_input_frame(X, categorical_features or self.categorical_features)
)
self.n_features_ = cols
self.cat_mapping = cat_mapping
@@ -262,14 +292,14 @@ def fit(
rows=rows,
cols=cols,
y=y_,
budget=budget,
budget=budget or self.budget,
sample_weight=sample_weight_, # type: ignore
alpha=alpha,
reset=reset,
alpha=alpha or self.alpha,
reset=reset or self.reset,
categorical_features=categorical_features_, # type: ignore
timeout=timeout,
iteration_limit=iteration_limit,
memory_limit=memory_limit,
timeout=timeout or self.timeout,
iteration_limit=iteration_limit or self.iteration_limit,
memory_limit=memory_limit or self.memory_limit,
)

return self
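The constructor/fit symmetry added above can be used as follows. This is a minimal usage sketch, assuming the `perpetual` package's public `PerpetualBooster` class and synthetic NumPy data rather than a real dataset:

```python
import numpy as np
from perpetual import PerpetualBooster

# Toy data; any tabular X and target y accepted by fit() would do.
X = np.random.rand(1000, 10)
y = np.random.rand(1000)

# Fit-level parameters can now be stored on the booster itself (new in 0.7.0)...
model = PerpetualBooster(budget=0.5, timeout=60.0)
model.fit(X, y)  # uses budget=0.5 and timeout=60.0 from the constructor

# ...and selectively overridden per call; unspecified parameters still fall
# back to the constructor values via the `param or self.param` pattern above.
model.fit(X, y, budget=1.5)
```

Because the fallback uses `or`, a fit-time argument takes precedence over the stored value whenever it is given.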
4 changes: 2 additions & 2 deletions python-package/src/booster.rs
@@ -135,7 +135,7 @@ impl PerpetualBooster {
rows: usize,
cols: usize,
y: PyReadonlyArray1<f64>,
budget: f32,
budget: Option<f32>,
sample_weight: Option<PyReadonlyArray1<f64>>,
alpha: Option<f32>,
reset: Option<bool>,
@@ -158,7 +158,7 @@
match self.booster.fit(
&data,
y,
budget,
budget.unwrap_or(1.0),
sample_weight_,
alpha,
reset,
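On the binding side, `budget` is now optional and defaults to 1.0 via `unwrap_or`. Combined with the `budget or self.budget` fallback in `booster.py`, the effective value resolves as sketched below (plain Python; the helper is illustrative only, not library code):

```python
def effective_budget(fit_budget=None, init_budget=None):
    # fit-time argument -> constructor value -> Rust-side default of 1.0
    return fit_budget or init_budget or 1.0

assert effective_budget() == 1.0                 # neither given: unwrap_or(1.0) applies
assert effective_budget(init_budget=0.5) == 0.5  # constructor value is used
assert effective_budget(1.5, 0.5) == 1.5         # explicit fit argument wins
```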
4 changes: 2 additions & 2 deletions python-package/src/multi_output.rs
@@ -154,7 +154,7 @@ impl MultiOutputBooster {
rows: usize,
cols: usize,
y: PyReadonlyArray1<f64>,
budget: f32,
budget: Option<f32>,
sample_weight: Option<PyReadonlyArray1<f64>>,
alpha: Option<f32>,
reset: Option<bool>,
@@ -180,7 +180,7 @@
match self.booster.fit(
&data,
&y_data,
budget,
budget.unwrap_or(1.0),
sample_weight_,
alpha,
reset,
8 changes: 4 additions & 4 deletions src/bin.rs
@@ -116,8 +116,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell<Bin>], is_const_hess:
} else if b2.num == 0 {
return Ordering::Greater;
}
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.h_folded.unwrap().iter().sum::<f32>();
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.h_folded.unwrap().iter().sum::<f32>();
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.counts.iter().sum::<usize>() as f32;
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.counts.iter().sum::<usize>() as f32;
div2.partial_cmp(&div1).unwrap_or(Ordering::Less)
});
} else {
@@ -129,8 +129,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell<Bin>], is_const_hess:
} else if b2.num == 0 {
return Ordering::Greater;
}
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.counts.iter().sum::<usize>() as f32;
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.counts.iter().sum::<usize>() as f32;
let div1: f32 = b1.g_folded.iter().sum::<f32>() / b1.h_folded.unwrap().iter().sum::<f32>();
let div2: f32 = b2.g_folded.iter().sum::<f32>() / b2.h_folded.unwrap().iter().sum::<f32>();
div2.partial_cmp(&div1).unwrap_or(Ordering::Less)
});
}
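The swap above changes which statistic orders the categorical bins: the summed gradient divided by the row count when the hessian is constant, and divided by the summed hessian otherwise. A simplified sketch of that ordering (plain Python with illustrative field names, not the crate's `Bin` type, and assuming the constant-hessian branch is the one that divides by counts, as in the fixed code):

```python
def bin_stat(b, is_const_hess):
    g = sum(b["g_folded"])
    if is_const_hess:
        return g / b["count"]          # constant hessian: mean gradient per row
    return g / sum(b["h_folded"])      # otherwise: Newton-style g / h ratio

def sort_cat_bins(bins, is_const_hess):
    # Descending order, matching `div2.partial_cmp(&div1)` in the Rust code.
    return sorted(bins, key=lambda b: bin_stat(b, is_const_hess), reverse=True)

bins = [
    {"g_folded": [0.5, 0.5], "h_folded": [1.0, 1.0], "count": 2},
    {"g_folded": [3.0, 1.0], "h_folded": [2.0, 2.0], "count": 2},
]
print([bin_stat(b, True) for b in sort_cat_bins(bins, True)])  # [2.0, 0.5]
```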
24 changes: 11 additions & 13 deletions src/booster.rs
@@ -1,8 +1,8 @@
use crate::bin::Bin;
use crate::binning::bin_matrix;
use crate::constants::{
FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT,
ROW_COLUMN_RATIO_LIMIT, STOPPING_ROUNDS, TIMEOUT_FACTOR,
FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, STOPPING_ROUNDS,
TIMEOUT_FACTOR,
};
use crate::constraints::ConstraintMap;
use crate::data::Matrix;
@@ -254,7 +254,7 @@ impl PerpetualBooster {
/// * `categorical_features` - categorical features.
/// * `timeout` - fit timeout limit in seconds.
/// * `iteration_limit` - optional limit for the number of boosting rounds.
/// * `memory_limit` - optional limit for memory allocation.
/// * `memory_limit` - optional limit for memory allocation.
pub fn fit(
&mut self,
data: &Matrix<f64>,
@@ -401,16 +401,14 @@ impl PerpetualBooster {
let mut rng = StdRng::seed_from_u64(self.seed);

// Column sampling is only applied when (n_rows / n_columns) < ROW_COLUMN_RATIO_LIMIT.
// ROW_COLUMN_RATIO_LIMIT is set to 100 by default.
let colsample_bytree = f64::min(
1.0,
(data.rows as f64 / data.cols as f64) / ROW_COLUMN_RATIO_LIMIT as f64,
);

let col_amount = usize::max(
usize::min(MIN_COL_AMOUNT, col_index.len()),
((col_index.len() as f64) * colsample_bytree).floor() as usize,
);
// ROW_COLUMN_RATIO_LIMIT is calculated using budget.
// budget = 1.0 -> ROW_COLUMN_RATIO_LIMIT = 100
// budget = 2.0 -> ROW_COLUMN_RATIO_LIMIT = 10
let row_column_ratio_limit = 10.0_f32.powf(-budget) * 1000.0;
let colsample_bytree = (data.rows as f32 / data.cols as f32) / row_column_ratio_limit;

let col_amount = (((col_index.len() as f32) * colsample_bytree).floor() as usize)
.clamp(usize::min(MIN_COL_AMOUNT, col_index.len()), col_index.len());

let mem_bin = mem::size_of::<Bin>();
let mem_hist: usize;
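The comments above replace the fixed `ROW_COLUMN_RATIO_LIMIT` constant with a budget-dependent limit. A small worked sketch of the same arithmetic (plain Python, illustrative only; the real code clamps against `col_index.len()` and the `MIN_COL_AMOUNT` constant):

```python
MIN_COL_AMOUNT = 40  # value in src/constants.rs after this commit

def sampled_col_amount(n_rows, n_cols, budget):
    # budget = 1.0 -> limit = 100, budget = 2.0 -> limit = 10
    row_column_ratio_limit = 10.0 ** (-budget) * 1000.0
    colsample_bytree = (n_rows / n_cols) / row_column_ratio_limit
    amount = int(n_cols * colsample_bytree)
    # clamp between min(MIN_COL_AMOUNT, n_cols) and n_cols
    return max(min(MIN_COL_AMOUNT, n_cols), min(amount, n_cols))

print(sampled_col_amount(10_000, 500, budget=1.0))  # 100: limit 100, row/col ratio 20 -> 20% of columns
print(sampled_col_amount(10_000, 500, budget=2.0))  # 500: limit 10, row/col ratio 20 -> all columns kept
```

A higher budget therefore disables column subsampling sooner, at the cost of more computation per boosting round.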
3 changes: 1 addition & 2 deletions src/constants.rs
@@ -3,7 +3,6 @@ pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9;
pub const N_NODES_ALLOC_LIMIT: usize = 3000;
pub const ITER_LIMIT: usize = 1000;
pub const GENERALIZATION_THRESHOLD: f32 = 0.99;
pub const ROW_COLUMN_RATIO_LIMIT: usize = 100;
pub const MIN_COL_AMOUNT: usize = 30;
pub const MIN_COL_AMOUNT: usize = 40;
pub const HESSIAN_EPS: f32 = 1e-3;
pub const TIMEOUT_FACTOR: f32 = 0.95;
