-
Notifications
You must be signed in to change notification settings - Fork 10
/
stepwise_selection.py
95 lines (79 loc) · 3.58 KB
/
stepwise_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
def _build_formula(tag, features):
    '''Return the formula string "<tag>~1+<f1>+<f2>..." for the given features.'''
    return f'{tag}~1' + ''.join(f'+{feature}' for feature in features)


def _partial_f_pvalue(reduced_model, full_model):
    '''Return the partial F-test p-value of full_model against reduced_model.'''
    # anova_lm takes the model with fewer terms first; row 1 of the table
    # holds the comparison between the two fits.
    return sm.stats.anova_lm(reduced_model, full_model)['Pr(>F)'].iloc[1]


def stepwiseSelection(data,
                      tag,
                      initial_list=None,
                      threshold_in=0.05,
                      threshold_out=0.10,
                      verbose=True):
    '''
    Select OLS regressors by bidirectional stepwise selection.

    Each pass fits every model that adds one excluded variable, takes the
    one with the largest adjusted R-squared gain and keeps it if its partial
    F-test p-value is <= threshold_in.  Then every non-forced included
    variable is tested for removal and dropped when its partial F-test
    p-value is >= threshold_out.  The loop ends when a pass changes nothing.

    data: <Pandas.DataFrame> containing indep/dep variables (not modified)
    tag: <String> name of target column
    initial_list: <List> independent variables that must be included; they
        are part of every fitted model and are never dropped (not modified)
    threshold_in: <float> partial F-test threshold for entering a variable
    threshold_out: <float> partial F-test threshold for eliminating a variable
    verbose: <Boolean> show stepwise details of entering and eliminating

    returns: <List> selected variables under their original column names,
        forced variables first, then in order of entry

    raises: ValueError if tag is not a column of data

    >>> NOTE that {threshold_in < threshold_out} must be satisfied!
        Otherwise a variable can be added and dropped again on every pass
        and the loop may never terminate.
    '''
    # The characters ~ + - : * / are formula operators, so they are stripped
    # from column names before building formulas.  All marks are removed in
    # one pass so names containing several of them are handled correctly.
    strip_marks = str.maketrans('', '', '~+-:*/')
    ab_dic = {}  # sanitized name -> original name, used to map results back
    for col in data.columns:
        clean = col.translate(strip_marks)
        if clean != col:
            ab_dic[clean] = col
    # Rename on a new frame: renaming in place would leave the caller with
    # columns that no longer match the original names this function returns.
    data = data.rename(columns={orig: clean for clean, orig in ab_dic.items()})
    tag = tag.translate(strip_marks)

    # Forced variables, sanitized like the columns and de-duplicated.
    # Building a new list keeps the caller's initial_list untouched.
    forced = []
    for name in (initial_list or []):
        clean = name.translate(strip_marks)
        if clean != tag and clean not in forced:
            forced.append(clean)

    included = list(forced)
    excluded = data.columns.tolist()
    excluded.remove(tag)
    excluded = [col for col in excluded if col not in forced]

    while True:  # end loop when no variable gets in/out
        changed = False
        formula = _build_formula(tag, included)
        full_model = ols(formula=formula, data=data).fit()

        # forward step: compare every candidate against the same baseline
        # and remember the one with the largest adjusted R-squared gain
        base_adj_r2 = full_model.rsquared_adj
        best_feature, best_model, best_gain = None, None, .0
        for new_feature in excluded:
            # Note here the test_model has more variable than full_model
            test_model = ols(
                formula=formula + f'+{new_feature}', data=data).fit()
            gain = test_model.rsquared_adj - base_adj_r2
            if gain > best_gain:
                best_feature, best_model, best_gain = (
                    new_feature, test_model, gain)

        # Partial F-test, skipped when no candidate improved adjusted R2
        if best_feature is not None:
            criterion = _partial_f_pvalue(full_model, best_model)
            if criterion <= threshold_in:
                included.append(best_feature)
                excluded.remove(best_feature)
                full_model = best_model
                changed = True
                if verbose:
                    print('Add {:25} with f_pvalue {:.6}'.format(
                        best_feature, criterion))

        # backward step: iterate over a snapshot because `included` is
        # replaced inside the loop; forced variables are never candidates
        for old_feature in [f for f in included if f not in forced]:
            reduced = [f for f in included if f != old_feature]
            # Note here the test_model has less variable than full_model
            test_model = ols(
                formula=_build_formula(tag, reduced), data=data).fit()
            criterion = _partial_f_pvalue(test_model, full_model)
            if criterion >= threshold_out:
                included = reduced
                excluded.append(old_feature)
                # later tests in this pass must compare against the model
                # that no longer contains the dropped variable
                full_model = test_model
                changed = True
                if verbose:
                    print('Drop {:25} with f_pvalue {:.6}'.format(
                        old_feature, criterion))

        if not changed:
            break

    return [ab_dic.get(x, x) for x in included]