fix black formatting

perpetual-ml · Jun 7, 2024 · 2473b8a · 2473b8a
1 parent 9f51f24
commit 2473b8a
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 38 deletions.
diff --git a/python-package/python/perpetual/__init__.py b/python-package/python/perpetual/__init__.py
@@ -37,11 +37,10 @@
 }
 
 
-
-
 @dataclass
 class Node:
     """Dataclass representation of a node, this represents all of the fields present in a tree node."""
+
     num: int
     weight_value: float
     hessian_sum: float
@@ -137,8 +136,9 @@ def get_metadata(self, key: str) -> str:
         """pass"""
 
 
-
-def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]:
+def convert_input_frame(
+    X: FrameLike, categorical_features
+) -> tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]:
     """Convert data to format needed by booster.
 
     Returns:
@@ -149,18 +149,27 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str],
         X_ = X.to_numpy()
         features_ = X.columns.to_list()
         if categorical_features == "auto":
-            categorical_columns = X.select_dtypes(include=['category']).columns.tolist()
-            categorical_features_ = [features_.index(c) for c in categorical_columns] or None
+            categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
+            categorical_features_ = [
+                features_.index(c) for c in categorical_columns
+            ] or None
     else:
         # Assume it's a numpy array.
         X_ = X
         features_ = list(map(str, range(X_.shape[1])))
 
-    if categorical_features and all(isinstance(s, int) for s in categorical_features) and isinstance(categorical_features, list):
+    if (
+        categorical_features
+        and all(isinstance(s, int) for s in categorical_features)
+        and isinstance(categorical_features, list)
+    ):
         categorical_features_ = categorical_features
-    elif categorical_features and all(isinstance(s, str) for s in categorical_features) and isinstance(categorical_features, list):
+    elif (
+        categorical_features
+        and all(isinstance(s, str) for s in categorical_features)
+        and isinstance(categorical_features, list)
+    ):
         categorical_features_ = [features_.index(c) for c in categorical_features]
-
 
     cat_mapping = {}  # key: feature_name, value: ordered category names
     if categorical_features_:
@@ -175,13 +184,18 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str],
         print(f"Mapping of categories: {cat_mapping}")
         for feature_name, categories in cat_mapping.items():
             feature_index = features_.index(feature_name)
+
             def f(x):
                 try:
-                    return np.nan if str(x[feature_index]) == "nan" else float(categories.index(str(x[feature_index])))
+                    return (
+                        np.nan
+                        if str(x[feature_index]) == "nan"
+                        else float(categories.index(str(x[feature_index])))
+                    )
                 except (ValueError, IndexError):
                     return np.nan
-            X_[:, feature_index] = np.apply_along_axis(f, 1, X_)
 
+            X_[:, feature_index] = np.apply_along_axis(f, 1, X_)
 
     if not np.issubdtype(X_.dtype, "float64"):
         X_ = X_.astype(dtype="float64", copy=False)
@@ -194,8 +208,9 @@ def f(x):
     return features_, flat_data, rows, cols, categorical_features_, cat_mapping
 
 
-
-def transform_input_frame(X: FrameLike, cat_mapping) -> tuple[list[str], np.ndarray, int, int]:
+def transform_input_frame(
+    X: FrameLike, cat_mapping
+) -> tuple[list[str], np.ndarray, int, int]:
     """Convert data to format needed by booster.
 
     Returns:
@@ -212,18 +227,24 @@ def transform_input_frame(X: FrameLike, cat_mapping) -> tuple[list[str], np.ndar
     if cat_mapping:
         for feature_name, categories in cat_mapping.items():
             feature_index = features_.index(feature_name)
+
             def f(x):
                 try:
-                    return np.nan if str(x[feature_index]) == "nan" else float(categories.index(str(x[feature_index])))
+                    return (
+                        np.nan
+                        if str(x[feature_index]) == "nan"
+                        else float(categories.index(str(x[feature_index])))
+                    )
                 except (ValueError, IndexError):
                     return np.nan
+
             X_[:, feature_index] = np.apply_along_axis(f, 1, X_)
 
     if not np.issubdtype(X_.dtype, "float64"):
         X_ = X_.astype(dtype="float64", copy=False)
     flat_data = X_.ravel(order="F")
     rows, cols = X_.shape
-    
+
     return features_, flat_data, rows, cols
 
 
@@ -297,9 +318,9 @@ def __init__(
                 create a separate branch for missing, creating a ternary tree, the missing node will be given the same
                 weight value as the parent node. If this parameter is `False`, missing will be sent
                 down either the left or right branch, creating a binary tree. Defaults to `False`.
-            terminate_missing_features (set[Any], optional): An optional iterable of features 
+            terminate_missing_features (set[Any], optional): An optional iterable of features
                 (either strings, or integer values specifying the feature indices if numpy arrays are used for fitting),
-                for which the missing node will always be terminated, even if `allow_missing_splits` is set to true. 
+                for which the missing node will always be terminated, even if `allow_missing_splits` is set to true.
                 This value is only valid if `create_missing_branch` is also True.
             missing_node_treatment (str, optional): Method for selecting the `weight` for the missing node, if `create_missing_branch` is set to `True`. Defaults to "None". Valid options are:
                 - "None": Calculate missing node weight values without any constraints.
@@ -319,7 +340,7 @@ def __init__(
             force_children_to_bound_parent (bool, optional): Setting this parameter to `True` will restrict children nodes, so that they always contain the parent node inside of their range. Without setting this it's possible that both the left and the right nodes could be greater than, or less than, the parent node. Defaults to `False`.
             log_iterations (bool, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations, info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
             feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
-            
+
         Raises:
             TypeError: Raised if an invalid dtype is passed.
 
@@ -353,7 +374,7 @@ def __init__(
             ```
 
         """
-       
+
         terminate_missing_features_ = (
             set() if terminate_missing_features is None else terminate_missing_features
         )
@@ -415,14 +436,16 @@ def fit(
             categorical_features: The names or indices for categorical features. auto for pandas categorical data type
         """
 
-        features_, flat_data, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, categorical_features)
+        features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
+            convert_input_frame(X, categorical_features)
+        )
         self.n_features_ = cols
         self._set_metadata_attributes("n_features_", self.n_features_)
         self.cat_mapping = cat_mapping
         self._set_metadata_attributes("cat_mapping", self.cat_mapping)
         self.feature_names_in_ = features_
         self._set_metadata_attributes("feature_names_in_", self.feature_names_in_)
-      
+
         y_ = _convert_input_array(y)
 
         if sample_weight is None:
@@ -434,7 +457,9 @@ def fit(
         # by the rust code.
         monotone_constraints_ = self._standardize_monotonicity_map(X)
         self.booster.monotone_constraints = monotone_constraints_
-        self.booster.terminate_missing_features = (self._standardize_terminate_missing_features(X))
+        self.booster.terminate_missing_features = (
+            self._standardize_terminate_missing_features(X)
+        )
 
         self.booster.fit(
             flat_data=flat_data,
@@ -479,7 +504,7 @@ def predict(self, X: FrameLike, parallel: Union[bool, None] = None) -> np.ndarra
             cols=cols,
             parallel=parallel_,
         )
-    
+
     @property
     def feature_importances_(self) -> np.ndarray:
         vals = self.calculate_feature_importance(
@@ -531,7 +556,6 @@ def predict_contributions(
         )
         return np.reshape(contributions, (rows, cols + 1))
 
-
     def partial_dependence(
         self,
         X: FrameLike,
@@ -811,7 +835,6 @@ def _get_metadata_attributes(self, key: str) -> Any:
         value = self.get_metadata(key)
         return self.meta_data_attributes[key].deserialize(value)
 
-
     @property
     def number_of_trees(self) -> int:
         """The number of trees in the model.
@@ -821,7 +844,6 @@ def number_of_trees(self) -> int:
         """
         return self.booster.number_of_trees
 
-
     # Make picklable with getstate and setstate
     def __getstate__(self) -> dict[Any, Any]:
         booster_json = self.json_dump()
@@ -967,4 +989,6 @@ def _id(i: int) -> str:
             for n in tree
         ]
 
-        return pd.DataFrame.from_records(vals).sort_values(['Tree', 'Node'], ascending=[True, True])
+        return pd.DataFrame.from_records(vals).sort_values(
+            ["Tree", "Node"], ascending=[True, True]
+        )
diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py
@@ -134,6 +134,8 @@ def test_get_node_list(X_y):
 test_args = itertools.product(
     [True, False], ["Weight", "Cover", "Gain", "TotalGain", "TotalCover"]
 )
+
+
 @pytest.mark.parametrize("is_numpy,importance_method", test_args)
 def test_feature_importance_method_init(
     X_y: tuple[pd.DataFrame, pd.Series], is_numpy: bool, importance_method: str
@@ -151,7 +153,7 @@ def test_feature_importance_method_init(
 
     imp = model.calculate_feature_importance(method=importance_method, normalize=True)
 
-    #for ft, cf in zip(model.feature_names_in_, model.feature_importances_):
+    # for ft, cf in zip(model.feature_names_in_, model.feature_importances_):
     #    print(imp.get(ft, 0.0), cf)
     #    print(imp.get(ft, 0.0) == cf)
 
@@ -326,7 +328,7 @@ def test_booster_contributions(X_y):
 
     contribs_weight = model.predict_contributions(X, method="Weight")
     assert np.allclose(contribs_weight.sum(1), preds)
-    #assert not np.allclose(contribs_weight, contribs_average)
+    # assert not np.allclose(contribs_weight, contribs_average)
 
     contribs_difference = model.predict_contributions(X, method="BranchDifference")
     assert not np.allclose(contribs_difference.sum(1), preds)
@@ -388,7 +390,9 @@ def test_missing_branch_with_contributions(X_y):
     assert np.allclose(model_miss_branch_conts.sum(1), model_miss_branch_preds)
     assert not np.allclose(model_miss_branch_preds, model_miss_leaf_preds)
 
-    model_miss_branch_conts = model_miss_branch.predict_contributions(X, method="weight")
+    model_miss_branch_conts = model_miss_branch.predict_contributions(
+        X, method="weight"
+    )
     assert np.allclose(model_miss_branch_conts.sum(1), model_miss_branch_preds)
 
     model_miss_branch_conts = model_miss_branch.predict_contributions(
@@ -730,10 +734,23 @@ def test_booster_saving_with_monotone_constraints(
         model_loaded = load_func(f64_model_path)
         assert all(preds == model_loaded.predict(X))
 
+
 def test_categorical(X_y):
     X = pd.read_csv("../resources/adult_test_df.csv", index_col=False)
-    y = np.array(pd.read_csv("../resources/adult_test_y.csv", index_col=False, header=None).squeeze('columns'))
-    cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
-    X[cols] = X[cols].astype('category')
+    y = np.array(
+        pd.read_csv(
+            "../resources/adult_test_y.csv", index_col=False, header=None
+        ).squeeze("columns")
+    )
+    cols = [
+        "workclass",
+        "education",
+        "marital-status",
+        "occupation",
+        "relationship",
+        "race",
+        "native-country",
+    ]
+    X[cols] = X[cols].astype("category")
     model = PerpetualBooster()
-    model.fit(X, y)
+    model.fit(X, y)
diff --git a/scripts/run-python-tests.ps1 b/scripts/run-python-tests.ps1
@@ -1,6 +1,6 @@
 Set-Location python-package
-black python/perpetual/
-black tests/
+python -m black python/perpetual/
+python -m black tests/
 maturin develop --release
 pytest .
 Set-Location ..
diff --git a/scripts/run-python-tests.sh b/scripts/run-python-tests.sh
@@ -1,6 +1,6 @@
 cd python-package
-black python/perpetual/
-black tests/
+python -m black python/perpetual/
+python -m black tests/
 maturin develop --release
 pytest .
 cd ..