Skip to content

Commit

Permalink
fix black formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
deadsoul44 committed Jun 7, 2024
1 parent 9f51f24 commit 2473b8a
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 38 deletions.
78 changes: 51 additions & 27 deletions python-package/python/perpetual/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,10 @@
}




@dataclass
class Node:
"""Dataclass representation of a node, this represents all of the fields present in a tree node."""

num: int
weight_value: float
hessian_sum: float
Expand Down Expand Up @@ -137,8 +136,9 @@ def get_metadata(self, key: str) -> str:
"""pass"""



def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]:
def convert_input_frame(
X: FrameLike, categorical_features
) -> tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]:
"""Convert data to format needed by booster.
Returns:
Expand All @@ -149,18 +149,27 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str],
X_ = X.to_numpy()
features_ = X.columns.to_list()
if categorical_features == "auto":
categorical_columns = X.select_dtypes(include=['category']).columns.tolist()
categorical_features_ = [features_.index(c) for c in categorical_columns] or None
categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
categorical_features_ = [
features_.index(c) for c in categorical_columns
] or None
else:
# Assume it's a numpy array.
X_ = X
features_ = list(map(str, range(X_.shape[1])))

if categorical_features and all(isinstance(s, int) for s in categorical_features) and isinstance(categorical_features, list):
if (
categorical_features
and all(isinstance(s, int) for s in categorical_features)
and isinstance(categorical_features, list)
):
categorical_features_ = categorical_features
elif categorical_features and all(isinstance(s, str) for s in categorical_features) and isinstance(categorical_features, list):
elif (
categorical_features
and all(isinstance(s, str) for s in categorical_features)
and isinstance(categorical_features, list)
):
categorical_features_ = [features_.index(c) for c in categorical_features]


cat_mapping = {} # key: feature_name, value: ordered category names
if categorical_features_:
Expand All @@ -175,13 +184,18 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str],
print(f"Mapping of categories: {cat_mapping}")
for feature_name, categories in cat_mapping.items():
feature_index = features_.index(feature_name)

def f(x):
try:
return np.nan if str(x[feature_index]) == "nan" else float(categories.index(str(x[feature_index])))
return (
np.nan
if str(x[feature_index]) == "nan"
else float(categories.index(str(x[feature_index])))
)
except (ValueError, IndexError):
return np.nan
X_[:, feature_index] = np.apply_along_axis(f, 1, X_)

X_[:, feature_index] = np.apply_along_axis(f, 1, X_)

if not np.issubdtype(X_.dtype, "float64"):
X_ = X_.astype(dtype="float64", copy=False)
Expand All @@ -194,8 +208,9 @@ def f(x):
return features_, flat_data, rows, cols, categorical_features_, cat_mapping



def transform_input_frame(X: FrameLike, cat_mapping) -> tuple[list[str], np.ndarray, int, int]:
def transform_input_frame(
X: FrameLike, cat_mapping
) -> tuple[list[str], np.ndarray, int, int]:
"""Convert data to format needed by booster.
Returns:
Expand All @@ -212,18 +227,24 @@ def transform_input_frame(X: FrameLike, cat_mapping) -> tuple[list[str], np.ndar
if cat_mapping:
for feature_name, categories in cat_mapping.items():
feature_index = features_.index(feature_name)

def f(x):
try:
return np.nan if str(x[feature_index]) == "nan" else float(categories.index(str(x[feature_index])))
return (
np.nan
if str(x[feature_index]) == "nan"
else float(categories.index(str(x[feature_index])))
)
except (ValueError, IndexError):
return np.nan

X_[:, feature_index] = np.apply_along_axis(f, 1, X_)

if not np.issubdtype(X_.dtype, "float64"):
X_ = X_.astype(dtype="float64", copy=False)
flat_data = X_.ravel(order="F")
rows, cols = X_.shape

return features_, flat_data, rows, cols


Expand Down Expand Up @@ -297,9 +318,9 @@ def __init__(
create a separate branch for missing, creating a ternary tree, the missing node will be given the same
weight value as the parent node. If this parameter is `False`, missing will be sent
down either the left or right branch, creating a binary tree. Defaults to `False`.
terminate_missing_features (set[Any], optional): An optional iterable of features
terminate_missing_features (set[Any], optional): An optional iterable of features
(either strings, or integer values specifying the feature indices if numpy arrays are used for fitting),
for which the missing node will always be terminated, even if `allow_missing_splits` is set to true.
for which the missing node will always be terminated, even if `allow_missing_splits` is set to true.
This value is only valid if `create_missing_branch` is also True.
missing_node_treatment (str, optional): Method for selecting the `weight` for the missing node, if `create_missing_branch` is set to `True`. Defaults to "None". Valid options are:
- "None": Calculate missing node weight values without any constraints.
Expand All @@ -319,7 +340,7 @@ def __init__(
force_children_to_bound_parent (bool, optional): Setting this parameter to `True` will restrict children nodes, so that they always contain the parent node inside of their range. Without setting this, it's possible that both the left and the right nodes could be greater than, or less than, the parent node. Defaults to `False`.
log_iterations (bool, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations. This info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
Raises:
TypeError: Raised if an invalid dtype is passed.
Expand Down Expand Up @@ -353,7 +374,7 @@ def __init__(
```
"""

terminate_missing_features_ = (
set() if terminate_missing_features is None else terminate_missing_features
)
Expand Down Expand Up @@ -415,14 +436,16 @@ def fit(
categorical_features: The names or indices for categorical features. Pass "auto" to use the columns that have the pandas `category` data type.
"""

features_, flat_data, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, categorical_features)
features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
convert_input_frame(X, categorical_features)
)
self.n_features_ = cols
self._set_metadata_attributes("n_features_", self.n_features_)
self.cat_mapping = cat_mapping
self._set_metadata_attributes("cat_mapping", self.cat_mapping)
self.feature_names_in_ = features_
self._set_metadata_attributes("feature_names_in_", self.feature_names_in_)

y_ = _convert_input_array(y)

if sample_weight is None:
Expand All @@ -434,7 +457,9 @@ def fit(
# by the rust code.
monotone_constraints_ = self._standardize_monotonicity_map(X)
self.booster.monotone_constraints = monotone_constraints_
self.booster.terminate_missing_features = (self._standardize_terminate_missing_features(X))
self.booster.terminate_missing_features = (
self._standardize_terminate_missing_features(X)
)

self.booster.fit(
flat_data=flat_data,
Expand Down Expand Up @@ -479,7 +504,7 @@ def predict(self, X: FrameLike, parallel: Union[bool, None] = None) -> np.ndarra
cols=cols,
parallel=parallel_,
)

@property
def feature_importances_(self) -> np.ndarray:
vals = self.calculate_feature_importance(
Expand Down Expand Up @@ -531,7 +556,6 @@ def predict_contributions(
)
return np.reshape(contributions, (rows, cols + 1))


def partial_dependence(
self,
X: FrameLike,
Expand Down Expand Up @@ -811,7 +835,6 @@ def _get_metadata_attributes(self, key: str) -> Any:
value = self.get_metadata(key)
return self.meta_data_attributes[key].deserialize(value)


@property
def number_of_trees(self) -> int:
"""The number of trees in the model.
Expand All @@ -821,7 +844,6 @@ def number_of_trees(self) -> int:
"""
return self.booster.number_of_trees


# Make picklable with getstate and setstate
def __getstate__(self) -> dict[Any, Any]:
booster_json = self.json_dump()
Expand Down Expand Up @@ -967,4 +989,6 @@ def _id(i: int) -> str:
for n in tree
]

return pd.DataFrame.from_records(vals).sort_values(['Tree', 'Node'], ascending=[True, True])
return pd.DataFrame.from_records(vals).sort_values(
["Tree", "Node"], ascending=[True, True]
)
31 changes: 24 additions & 7 deletions python-package/tests/test_booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ def test_get_node_list(X_y):
test_args = itertools.product(
[True, False], ["Weight", "Cover", "Gain", "TotalGain", "TotalCover"]
)


@pytest.mark.parametrize("is_numpy,importance_method", test_args)
def test_feature_importance_method_init(
X_y: tuple[pd.DataFrame, pd.Series], is_numpy: bool, importance_method: str
Expand All @@ -151,7 +153,7 @@ def test_feature_importance_method_init(

imp = model.calculate_feature_importance(method=importance_method, normalize=True)

#for ft, cf in zip(model.feature_names_in_, model.feature_importances_):
# for ft, cf in zip(model.feature_names_in_, model.feature_importances_):
# print(imp.get(ft, 0.0), cf)
# print(imp.get(ft, 0.0) == cf)

Expand Down Expand Up @@ -326,7 +328,7 @@ def test_booster_contributions(X_y):

contribs_weight = model.predict_contributions(X, method="Weight")
assert np.allclose(contribs_weight.sum(1), preds)
#assert not np.allclose(contribs_weight, contribs_average)
# assert not np.allclose(contribs_weight, contribs_average)

contribs_difference = model.predict_contributions(X, method="BranchDifference")
assert not np.allclose(contribs_difference.sum(1), preds)
Expand Down Expand Up @@ -388,7 +390,9 @@ def test_missing_branch_with_contributions(X_y):
assert np.allclose(model_miss_branch_conts.sum(1), model_miss_branch_preds)
assert not np.allclose(model_miss_branch_preds, model_miss_leaf_preds)

model_miss_branch_conts = model_miss_branch.predict_contributions(X, method="weight")
model_miss_branch_conts = model_miss_branch.predict_contributions(
X, method="weight"
)
assert np.allclose(model_miss_branch_conts.sum(1), model_miss_branch_preds)

model_miss_branch_conts = model_miss_branch.predict_contributions(
Expand Down Expand Up @@ -730,10 +734,23 @@ def test_booster_saving_with_monotone_constraints(
model_loaded = load_func(f64_model_path)
assert all(preds == model_loaded.predict(X))


def test_categorical(X_y):
X = pd.read_csv("../resources/adult_test_df.csv", index_col=False)
y = np.array(pd.read_csv("../resources/adult_test_y.csv", index_col=False, header=None).squeeze('columns'))
cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
X[cols] = X[cols].astype('category')
y = np.array(
pd.read_csv(
"../resources/adult_test_y.csv", index_col=False, header=None
).squeeze("columns")
)
cols = [
"workclass",
"education",
"marital-status",
"occupation",
"relationship",
"race",
"native-country",
]
X[cols] = X[cols].astype("category")
model = PerpetualBooster()
model.fit(X, y)
model.fit(X, y)
4 changes: 2 additions & 2 deletions scripts/run-python-tests.ps1
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Set-Location python-package
black python/perpetual/
black tests/
python -m black python/perpetual/
python -m black tests/
maturin develop --release
pytest .
Set-Location ..
4 changes: 2 additions & 2 deletions scripts/run-python-tests.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cd python-package
black python/perpetual/
black tests/
python -m black python/perpetual/
python -m black tests/
maturin develop --release
pytest .
cd ..

0 comments on commit 2473b8a

Please sign in to comment.