-
Notifications
You must be signed in to change notification settings - Fork 2
/
xgboost.py
103 lines (86 loc) · 4.74 KB
/
xgboost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample
import pandas as pd
from tqdm import tqdm
import math
import operator
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut, train_test_split, StratifiedKFold, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
def load_matrices():
X = np.load('./data/entire_2016/eligible_ever/features.npy')
Y = np.load('./data/entire_2016/eligible_ever/labels.npy')
censor_mask = np.load('./data/entire_2016/eligible_ever/censor_mask.npy')
X_tr = X[np.logical_not(censor_mask)]
Y_tr = Y[np.logical_not(censor_mask)]
X = np.load('./data/entire_2017/eligible_ever/features.npy')
Y = np.load('./data/entire_2017/eligible_ever/labels.npy')
censor_mask = np.load('data/entire_2017/eligible_ever/censor_mask.npy')
eligible_mask = np.load('data/entire_2017/eligible_ever/eligible_mask.npy')
X_te = X[np.logical_and(np.logical_not(censor_mask), np.logical_not(eligible_mask))]
Y_te = Y[np.logical_and(np.logical_not(censor_mask), np.logical_not(eligible_mask))]
return X_tr, Y_tr, X_te, Y_te
def param_search(X, Y):
rskf = RepeatedStratifiedKFold(n_splits=5 n_repeats=5 random_state=4)
final_params = {}
# 1. Search over n_estimators
xgb_class = xgb.XGBClassifier(learning_rate=0.02, objective='binary:logistic', silent=True, nthread=1)
params = {
'n_estimators': [i*100 for i in range(6,11)]
}
grid_search = GridSearchCV(xgb_class, param_grid=params, scoring='roc_auc', cv=rskf.split(X,Y), verbose=3, n_jobs=4)
grid_search.fit(X, Y)
final_params['n_estimators'] = grid_search.best_params_['n_estimators']
# 2. Search over max_depth and min_child_weight
xgb_class = xgb.XGBClassifier(learning_rate=0.02, n_estimators=final_params['n_estimators'], objective='binary:logistic', silent=True, nthread=1)
params = {
'max_depth': [i for i in range(1,10,2)],
'min_child_weight': [i for i in range(2,8,2)]
}
grid_search = GridSearchCV(xgb_class, param_grid=params, scoring='roc_auc', cv=rskf.split(X,Y), verbose=3, n_jobs=4)
grid_search.fit(X, Y)
final_params['max_depth'] = grid_search.best_params_['max_depth']
final_params['min_child_weight'] = grid_search.best_params_['min_child_weight']
# 3. Search over gamma
xgb_class = xgb.XGBClassifier(learning_rate=0.02, n_estimators=final_params['n_estimators'], objective='binary:logistic', \
min_child_weight=final_params['min_child_weight'], max_depth=final_params['max_depth'], silent=True, nthread=1)
params = {
'gamma': [i/10.0 for i in range(5)]
}
grid_search = GridSearchCV(xgb_class, param_grid=params, scoring='roc_auc', cv=rskf.split(X,Y), verbose=3, n_jobs=4)
grid_search.fit(X,Y)
final_params['gamma'] = grid_search.best_params_['gamma']
# 4. Search over subsample and colsample_bytree
xgb_class = xgb.XGBClassifier(learning_rate=0.02, n_estimators=final_params['n_estimators'], objective='binary:logistic', \
min_child_weight=final_params['min_child_weight'], max_depth=final_params['max_depth'], gamma=final_params['gamma'], silent=True, nthread=1)
params = {
'subsample': [i/10.0 for i in range(4,7)],
'colsample_bytree': [i/10.0 for i in range(4,7)]
}
grid_search = GridSearchCV(xgb_class, param_grid=params, scoring='roc_auc', cv=rskf.split(X,Y), verbose=3, n_jobs=4)
grid_search.fit(X,Y)
final_params['subsample'] = grid_search.best_params_['subsample']
final_params['colsample_bytree'] = grid_search.best_params_['colsample_bytree']
return final_params
def final_performance(X_tr, Y_tr, X_te, Y_te, param):
stats = []
for i in tqdm(range(1000)):
xgb_class = xgb.XGBClassifier(learning_rate=0.02, n_estimators=param['n_estimators'], max_depth=param['max_depth'], \
min_child_weight=param['min_child_weight'], gamma=param['gamma'], \
subsample=param['subsample'], colsample_bytree=param['colsample_bytree'], \
objective='binary:logistic', silent=True, nthread=1)
xgb_class.fit(X_tr, Y_tr)
X, Y = resample(X_te, Y_te, n_samples=X_te.shape[0], random_state=i)
preds = xgb_class.predict_proba(X)
auc = roc_auc_score(Y, preds[:1], average='weighted')
stats.append(auc)
print('AUC: ' auc)
def main():
X_tr, Y_tr, X_te, Y_te = load_matrices()
params = param_search(X_tr, Y_tr)
auc_list = final_performance(params)
print('AUC: ' + str(np.median(stats)) + ' (' + str(sorted(stats)[24]) + ' ' + str(sorted(stats)[974]) + ')')
return
if __name__ == '__main__':
main()