-
Notifications
You must be signed in to change notification settings - Fork 26
/
environment.py
316 lines (261 loc) · 11.6 KB
/
environment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 8 02:27:20 2020
@author: ZhiningLiu1998
mailto: [email protected] / [email protected]
"""
import pandas as pd
import numpy as np
import sklearn
import warnings
warnings.filterwarnings("ignore")
from utils import (
Rater, meta_sampling, histogram_error_distribution, imbalance_train_test_split,
)
class Ensemble:
    """A basic ensemble learning framework for binary classification.

    Parameters
    ----------
    base_estimator : object (scikit-learn classifier)
        The base estimator used to build ensemble classifiers.
        NO need to support sample weighting.
        Built-in `fit()`, `predict()`, `predict_proba()` methods are required.

    Attributes
    ----------
    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.
    estimators_ : list of estimators
        The collection of fitted sub-estimators.
    """

    def __init__(self, base_estimator):
        if not sklearn.base.is_classifier(base_estimator):
            raise TypeError(f'Base estimator {base_estimator} is not a sklearn classifier.')
        self.base_estimator_ = base_estimator
        self.estimators_ = []

    def fit_step(self, X, y):
        """Build a new base classifier from the training set (X, y).

        A fresh clone of `base_estimator_` is fitted and appended to
        `estimators_`; previously fitted members are left untouched.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training instances.
        y : array-like of shape = [n_samples]
            The training labels.

        Returns
        ----------
        self : object (Ensemble)
        """
        new_estimator = sklearn.base.clone(self.base_estimator_).fit(X, y)
        self.estimators_.append(new_estimator)
        return self

    def predict_proba(self, X):
        """Predict class probabilities for X.

        The positive-class probability (column 1 of each member's
        `predict_proba`) is averaged over all classifiers in the ensemble;
        the negative-class probability is its complement.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input data instances.

        Returns
        ----------
        p : array-like of shape [n_samples, 2]
            Column 0 is 1 - p(positive), column 1 is p(positive).

        Raises
        ----------
        ValueError
            If no base classifier has been fitted yet.
        """
        if not self.estimators_:
            # Averaging over zero members would silently yield NaN.
            raise ValueError(
                'The ensemble has no fitted estimators; call fit_step() first.')
        proba_pos = np.array(
            [model.predict_proba(X)[:, 1] for model in self.estimators_]
        ).mean(axis=0)
        return np.column_stack([1 - proba_pos, proba_pos])

    def predict(self, X):
        """Predict classes for X.

        The averaged positive-class probability is binarized with a
        threshold of 0.5.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input data instances.

        Returns
        ----------
        y : array-like of shape = [n_samples]
            The predicted classes.
        """
        proba_pos = self.predict_proba(X)[:, 1]
        # binarize() expects a 2-D input, hence the temporary row vector.
        return sklearn.preprocessing.binarize(
            proba_pos.reshape(1, -1), threshold=0.5)[0]

    def score(self, X, y):
        """Return area under precision recall curve (AUCPRC) score for X, y.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input data instances.
        y : array-like of shape = [n_samples]
            Labels for X.

        Returns
        ----------
        z : float
            Average precision of the positive-class probabilities.
        """
        # Must be `return`: a `yield` here would turn score() into a
        # generator and callers would receive a generator object, not a float.
        return sklearn.metrics.average_precision_score(
            y, self.predict_proba(X)[:, 1])
class EnsembleTrainingEnv(Ensemble):
    """The ensemble training environment in MESA.

    Parameters
    ----------
    args : arguments
        See arguments.py for more information.
    base_estimator : object (scikit-learn classifier)
        The base estimator used to build ensemble classifiers.
        NO need to support sample weighting.
        Built-in `fit()`, `predict()`, `predict_proba()` methods are required.

    Attributes
    ----------
    args : arguments
        See arguments.py for more information.
    rater : object (Rater)
        Rater used to evaluate classifier performance on class-imbalanced data.
    base_estimator_ : object (scikit-learn classifier)
        The base estimator from which the ensemble is grown.
    estimators_ : list of classifiers
        The collection of fitted sub-estimators.
    """

    def __init__(self, args, base_estimator):
        # The parent constructor validates and stores the base estimator.
        super().__init__(base_estimator=base_estimator)
        self.args = args
        self.rater = Rater(metric=args.metric)

    def load_data(self, X_train, y_train, X_valid, y_valid, X_test=None, y_test=None, train_ratio=1):
        """Load and preprocess the train/valid/test data into the environment.

        Labels are expected to be 0 (majority) and 1 (minority).

        Parameters
        ----------
        X_train, y_train : array-like
            Training instances and labels.
        X_valid, y_valid : array-like
            Validation instances and labels.
        X_test, y_test : array-like, optional (default=None)
            Test instances and labels; the test set is only used when both
            are given.
        train_ratio : float, optional (default=1)
            If < 1, only a subset of the training data obtained from
            `imbalance_train_test_split` is kept for meta-training.

        Raises
        ----------
        ValueError
            If `args.train_ir` asks for more majority samples than exist.
        """
        self.flag_use_test_set = X_test is not None and y_test is not None
        if train_ratio < 1:
            print(f'Using {train_ratio:.2%} random subset for meta-training.')
            # The "test" part of the split is kept as the training subset.
            _, X_train, _, y_train = imbalance_train_test_split(
                X_train, y_train, test_size=train_ratio)
        self.X_train, self.y_train = pd.DataFrame(X_train), pd.Series(y_train)
        self.X_valid, self.y_valid = pd.DataFrame(X_valid), pd.Series(y_valid)
        self.X_test, self.y_test = pd.DataFrame(X_test), pd.Series(y_test)
        # Positional boolean masks built from the stored Series: this works
        # for list inputs too and never relies on pandas index alignment.
        self.mask_maj_train = (self.y_train == 0).values
        self.mask_min_train = (self.y_train == 1).values
        self.mask_maj_valid = (self.y_valid == 0).values
        self.mask_min_valid = (self.y_valid == 1).values
        self.n_min_samples = self.mask_min_train.sum()
        n_samples = int(self.n_min_samples * self.args.train_ir)
        n_maj_samples = self.mask_maj_train.sum()
        if n_samples > n_maj_samples:
            raise ValueError(
                "Argument 'train_ir' should be smaller than imbalance ratio,\n "
                f"Please set this parameter to < {n_maj_samples/self.mask_min_train.sum()}."
            )
        # Number of majority samples drawn for every training subset.
        self.n_samples = n_samples

    def init(self):
        """Reset the environment.

        Clears the ensemble, zeroes the buffered predicted probabilities and
        trains the first base classifier via `_warm_up()`.
        """
        self.estimators_ = []
        # Buffer the predicted probabilities for better efficiency.
        # Explicit float dtype: the buffers hold probabilities, not labels.
        self.y_pred_train_buffer = np.zeros(len(self.y_train), dtype=float)
        self.y_pred_valid_buffer = np.zeros(len(self.y_valid), dtype=float)
        if self.flag_use_test_set:
            self.y_pred_test_buffer = np.zeros(len(self.y_test), dtype=float)
        self._warm_up()

    def get_state(self):
        """Fetch the current state of the environment.

        Returns
        ----------
        state : array-like
            Concatenation of the rescaled error histograms of the majority
            class on the training set and on the validation set.
        """
        hist_train = histogram_error_distribution(
            self.y_train[self.mask_maj_train],
            self.y_pred_train_buffer[self.mask_maj_train],
            self.args.num_bins)
        hist_valid = histogram_error_distribution(
            self.y_valid[self.mask_maj_valid],
            self.y_pred_valid_buffer[self.mask_maj_valid],
            self.args.num_bins)
        # Rescale each histogram so that its entries sum to num_bins.
        hist_train = hist_train / hist_train.sum() * self.args.num_bins
        hist_valid = hist_valid / hist_valid.sum() * self.args.num_bins
        return np.concatenate([hist_train, hist_valid])

    def step(self, action, verbose=False):
        """Perform an environment step.

        Parameters
        ----------
        action: float, in [0, 1]
            The action (mu) to execute in the environment.
        verbose: bool, optional (default=False)
            Whether to compute and return the information about the current ensemble.

        Returns
        ----------
        next_state : array-like of shape [state_size]
            The state of the environment after executing the action.
        reward : float
            The reward of taking the action, i.e. the change of the
            validation score caused by the newly added classifier.
        done : bool
            Indicates the end of an episode.
            True if the ensemble reaches the maximum number of base estimators.
        info : string
            Information about the current ensemble.
            Empty string if verbose == False.

        Raises
        ----------
        ValueError
            If `action` is not within [0, 1].
        """
        # Check action value (written so that NaN is rejected as well).
        if not (0 <= action <= 1):
            raise ValueError("Action must be a float in [0, 1].")

        # Perform meta-sampling on the majority class.
        X_maj_subset = meta_sampling(
            y_pred=self.y_pred_train_buffer[self.mask_maj_train],
            y_true=self.y_train[self.mask_maj_train],
            n_under_samples=self.n_samples,
            X=self.X_train[self.mask_maj_train],
            mu=action,
            sigma=self.args.sigma,
            random_state=self.args.random_state,
        )

        # Build the training subset: sampled majority + all minority samples.
        X_train_iter = pd.concat(
            [X_maj_subset, self.X_train[self.mask_min_train]]).values
        y_train_iter = np.concatenate(
            [np.zeros(X_maj_subset.shape[0]), np.ones(self.n_min_samples)])

        score_valid_before = self.rater.score(self.y_valid, self.y_pred_valid_buffer)

        # Build a new base classifier from (X_train_iter, y_train_iter).
        self.fit_step(X_train_iter, y_train_iter)
        self.update_all_pred_buffer()

        score_valid = self.rater.score(self.y_valid, self.y_pred_valid_buffer)

        # Obtain return values.
        next_state = self.get_state()
        reward = score_valid - score_valid_before
        done = len(self.estimators_) >= self.args.max_estimators
        info = ''

        # Fetch environment information if requested. Strings are compared
        # with `==`: an identity check (`is`) depends on interning and may be
        # False for an equal value parsed at runtime.
        if self.args.meta_verbose == 'full' or verbose:
            score_train = self.rater.score(self.y_train, self.y_pred_train_buffer)
            info = 'k={:<3d}|{}| train {:.3f} | valid {:.3f} | '.format(
                len(self.estimators_)-1, self.args.metric, score_train, score_valid)
            if self.flag_use_test_set:
                score_test = self.rater.score(self.y_test, self.y_pred_test_buffer)
                info += 'test {:.3f}'.format(score_test)
            else:
                info += 'test NULL'
        return next_state, reward, done, info

    def update_all_pred_buffer(self):
        """Update all buffered predict probabilities with the newest classifier."""
        n_clf = len(self.estimators_)
        self.y_pred_train_buffer = self._update_pred_buffer(
            n_clf, self.X_train, self.y_pred_train_buffer)
        self.y_pred_valid_buffer = self._update_pred_buffer(
            n_clf, self.X_valid, self.y_pred_valid_buffer)
        if self.flag_use_test_set:
            self.y_pred_test_buffer = self._update_pred_buffer(
                n_clf, self.X_test, self.y_pred_test_buffer)

    def _update_pred_buffer(self, n_clf, X, y_pred_buffer):
        """Update buffered predict probabilities.

        Parameters
        ----------
        n_clf : int
            Current ensemble size (including the newest classifier).
        X : array-like of shape = [n_samples, n_features]
            The input data instances.
        y_pred_buffer : array-like of shape [n_samples]
            The buffered predict probabilities of X.

        Returns
        ----------
        y_pred_updated : array-like of shape [n_samples]
            Running mean of the positive-class probabilities over all
            `n_clf` classifiers.
        """
        y_pred_last_clf = self.estimators_[-1].predict_proba(X)[:, 1]
        # Incremental mean: fold the newest prediction into the average of
        # the previous n_clf - 1 classifiers.
        return (y_pred_buffer * (n_clf - 1) + y_pred_last_clf) / n_clf

    def _warm_up(self):
        """Train the first base classifier with random under-sampling."""
        X_maj = self.X_train[self.mask_maj_train]
        X_min = self.X_train[self.mask_min_train]
        X_maj_rus = X_maj.sample(
            n=self.n_samples, random_state=self.args.random_state)
        X_train_rus = pd.concat([X_maj_rus, X_min]).values
        y_train_rus = np.concatenate(
            [np.zeros(X_maj_rus.shape[0]), np.ones(X_min.shape[0])])
        self.fit_step(X_train_rus, y_train_rus)
        self.update_all_pred_buffer()