allow fit_linear_frac in multi-task gam
csinva committed Apr 3, 2024
1 parent 6a3b68e commit 0be79e0
Showing 2 changed files with 47 additions and 16 deletions.
55 changes: 41 additions & 14 deletions imodels/algebraic/gam_multitask.py
@@ -51,6 +51,7 @@ def __init__(
fit_target_curves=True,
use_correlation_screening_for_features=False,
use_single_task_with_reweighting=False,
+ fit_linear_frac: float = None,
random_state=42,
):
"""
@@ -74,6 +75,8 @@ def __init__(
fit an EBM to predict the single target, then apply linear reweighting
use_correlation_screening_for_features: bool
whether to use correlation screening for features
+ fit_linear_frac: float
+ If not None, the fraction of samples held out to fit the linear model (the rest are used to fit the EBM(s))
"""
self.ebm_kwargs = ebm_kwargs
self.multitask = multitask
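
For intuition, here is a small illustration (not part of the commit) of the sample split that fit_linear_frac requests; the seeding mirrors _split_data below:

    import numpy as np

    # With 8 samples and fit_linear_frac=0.25, a quarter of the rows are
    # held out for the linear head and the rest go to the EBM(s).
    rng = np.random.RandomState(42)
    num_samples, fit_linear_frac = 8, 0.25
    idxs_lin = rng.choice(num_samples, int(num_samples * fit_linear_frac), replace=False)
    idxs_ebm = np.setdiff1d(np.arange(num_samples), idxs_lin)
    print(idxs_lin)  # 2 rows for the linear model
    print(idxs_ebm)  # the remaining 6 rows for the EBM(s)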
@@ -87,6 +90,7 @@ def __init__(
self.fit_target_curves = fit_target_curves
self.use_single_task_with_reweighting = use_single_task_with_reweighting
self.use_correlation_screening_for_features = use_correlation_screening_for_features
+ self.fit_linear_frac = fit_linear_frac

# override ebm_kwargs
ebm_kwargs['random_state'] = random_state
@@ -134,44 +138,49 @@ def fit(self, X, y, sample_weight=None):
self.ebm_.fit(X, y, sample_weight=sample_weight)
return self

- # fit EBM to each column of X
+ # fit EBM(s)
self.ebms_ = []
- num_features = X.shape[1]
+ num_samples, num_features = X.shape
+ idxs_ebm, idxs_lin = self._split_data(num_samples)

# fit EBM
if self.use_single_task_with_reweighting:
# fit an EBM to predict the single output
- self.ebms_.append(self._initialize_ebm_internal(y))
- self.ebms_[-1].fit(X, y, sample_weight=sample_weight)
+ self.ebms_.append(self._initialize_ebm_internal(y[idxs_ebm]))
+ self.ebms_[-1].fit(X[idxs_ebm], y[idxs_ebm],
+     sample_weight=sample_weight[idxs_ebm])
elif self.n_outputs_ == 1:
# with 1 output, we fit an EBM to each feature
for task_num in tqdm(range(num_features)):
- y_ = np.ascontiguousarray(X[:, task_num])
- X_ = deepcopy(X)
+ y_ = np.ascontiguousarray(X[idxs_ebm][:, task_num])
+ X_ = deepcopy(X[idxs_ebm])
X_[:, task_num] = 0
self.ebms_.append(self._initialize_ebm_internal(y_))
if isinstance(self, ClassifierMixin):
_, y_ = np.unique(y_, return_inverse=True)
elif self.use_normalize_feature_targets:
y_ = StandardScaler().fit_transform(y_.reshape(-1, 1)).ravel()
- self.ebms_[task_num].fit(X_, y_, sample_weight=sample_weight)
+ self.ebms_[task_num].fit(
+     X_, y_, sample_weight=sample_weight[idxs_ebm])

# also fit an EBM to the target
if self.fit_target_curves:
- self.ebms_.append(self._initialize_ebm_internal(y))
- self.ebms_[num_features].fit(X, y, sample_weight=sample_weight)
+ self.ebms_.append(self._initialize_ebm_internal(y[idxs_ebm]))
+ self.ebms_[num_features].fit(
+     X[idxs_ebm], y[idxs_ebm], sample_weight=sample_weight[idxs_ebm])
elif self.n_outputs_ > 1:
# with multiple outputs, we fit an EBM to each output
for task_num in tqdm(range(self.n_outputs_)):
- self.ebms_.append(self._initialize_ebm_internal(y))
- y_ = np.ascontiguousarray(y[:, task_num])
- self.ebms_[task_num].fit(X, y_, sample_weight=sample_weight)
+ self.ebms_.append(self._initialize_ebm_internal(y[idxs_ebm]))
+ y_ = np.ascontiguousarray(y[idxs_ebm][:, task_num])
+ self.ebms_[task_num].fit(
+     X[idxs_ebm], y_, sample_weight=sample_weight[idxs_ebm])

# extract features from EBMs
self.term_names_list_ = [
ebm_.term_names_ for ebm_ in self.ebms_]
self.term_names_ = sum(self.term_names_list_, [])
feats = self._extract_ebm_features(X)

if self.renormalize_features:
self.scaler_ = StandardScaler()
feats = self.scaler_.fit_transform(feats)
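
To see the data routing this fit method implements, here is a toy two-stage sketch with decision trees standing in for EBMs (illustrative only, not the commit's implementation): stage-one models are fit on one subset of rows, with the target column masked as the multi-task GAM does, and the linear head is fit on the features they produce for the held-out subset.

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    y = X[:, 0] ** 2 + X[:, 1] + 0.1 * rng.randn(100)

    idxs_lin = rng.choice(100, 50, replace=False)
    idxs_ebm = np.setdiff1d(np.arange(100), idxs_lin)

    # stage 1: one model per feature, fit only on idxs_ebm
    feats = []
    for j in range(X.shape[1]):
        X_ = X.copy()
        X_[:, j] = 0  # mask the column being predicted
        m = DecisionTreeRegressor(max_depth=3).fit(X_[idxs_ebm], X[idxs_ebm, j])
        feats.append(m.predict(X_))
    feats = np.column_stack(feats)

    # stage 2: linear head fit only on the held-out rows
    lin = LinearRegression().fit(feats[idxs_lin], y[idxs_lin])
    print(lin.coef_)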
@@ -181,7 +190,8 @@
feats[np.isinf(feats)] = 0

# fit linear model
- self.lin_model = self._fit_linear_model(feats, y, sample_weight)
+ self.lin_model = self._fit_linear_model(
+     feats[idxs_lin], y[idxs_lin], sample_weight[idxs_lin])

return self

@@ -191,6 +201,23 @@ def _initialize_ebm_internal(self, y):
else:
return ExplainableBoostingRegressor(**self.ebm_kwargs)

+ def _split_data(self, num_samples):
+     '''Split the samples into disjoint subsets for fitting the EBM(s) and the linear model.
+     '''
+     if self.fit_linear_frac is not None:
+         rng = np.random.RandomState(self.random_state)
+         # hold out fit_linear_frac of the samples for the linear model
+         idxs_lin = rng.choice(num_samples, int(
+             num_samples * self.fit_linear_frac), replace=False)
+         # the remaining samples are used to fit the EBM(s)
+         idxs_ebm = np.setdiff1d(np.arange(num_samples), idxs_lin)
+     else:
+         # no split: both stages see all of the data
+         idxs_ebm = np.arange(num_samples)
+         idxs_lin = idxs_ebm
+     assert len(idxs_ebm) > 0, f"No data for EBM! {self.fit_linear_frac=}"
+     assert len(idxs_lin) > 0, f"No data for linear model! {self.fit_linear_frac=}"
+     return idxs_ebm, idxs_lin

def _fit_linear_model(self, feats, y, sample_weight):
# fit a linear model to the features
if isinstance(self, ClassifierMixin):
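
A quick sanity check of the new _split_data helper (a hypothetical snippet; it assumes the class constructs with defaults, as the tests below do):

    from imodels.algebraic.gam_multitask import MultiTaskGAMRegressor

    gam = MultiTaskGAMRegressor(fit_linear_frac=0.5, random_state=42)
    idxs_ebm, idxs_lin = gam._split_data(num_samples=20)
    assert set(idxs_ebm).isdisjoint(idxs_lin)               # disjoint subsets
    assert set(idxs_ebm) | set(idxs_lin) == set(range(20))  # together they cover all rows
    assert (gam._split_data(20)[0] == idxs_ebm).all()       # deterministic given random_state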
8 changes: 6 additions & 2 deletions tests/gam_multitask_test.py
@@ -157,15 +157,18 @@ def compare_models():
# X, y, feature_names = imodels.get_clean_dataset("diabetes")

# remove some features to speed things up
# X = X[:, :2]
X = X[:, :5]
X = X[:50]
y = y[:50]
X, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

results = defaultdict(list)
for gam in tqdm([
- MultiTaskGAMRegressor(use_correlation_screening_for_features=True),
+ # MultiTaskGAMRegressor(use_correlation_screening_for_features=True),
+ MultiTaskGAMRegressor(
+     use_single_task_with_reweighting=True, fit_linear_frac=0.5),
MultiTaskGAMRegressor(),
+ MultiTaskGAMRegressor(fit_linear_frac=0.5),
# MultiTaskGAMRegressor(fit_target_curves=False),
# AdaBoostRegressor(
# estimator=MultiTaskGAMRegressor(
@@ -189,6 +192,7 @@ def compare_models():
results['test_r2'].append(gam.score(X_test, y_test).round(3))
if hasattr(gam, 'lin_model'):
print('lin model coef', gam.lin_model.coef_)
+ print(results)

# don't round strings
with pd.option_context(

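For reference, a minimal end-to-end run of the new option, mirroring the test above (a sketch; the synthetic dataset is an arbitrary stand-in):

    import numpy as np
    from sklearn.model_selection import train_test_split
    from imodels.algebraic.gam_multitask import MultiTaskGAMRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(80, 4)
    y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(80)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # fit EBMs on half the training rows, the linear head on the other half
    gam = MultiTaskGAMRegressor(fit_linear_frac=0.5)
    gam.fit(X_train, y_train)
    print('test r2', round(gam.score(X_test, y_test), 3))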