-
Notifications
You must be signed in to change notification settings - Fork 0
/
train-save-model.py
187 lines (132 loc) · 5.1 KB
/
train-save-model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
df = pd.read_csv("Telco-Customer-Churn.csv")
df.head(2)
# check for missing values and the number of unique values in df
for col in df.columns:
print('*' * 80)
print(f'"{col.upper()}" Data Types, No. of Missing Values and Unique Values')
print('*' * 80)
print(f"data type: {df[col].dtype}")
print(f"no. missing values: {df[col].isna().sum()}")
print(f"no. of unique values: {df[col].nunique()}")
print()
# Clean Data
def clean_values(value):
"""
value (str):
a row text value or column name
Return (str):
a lowercase text with spaces replaced with _ and all leading and training spaces removed
"""
dummy = value.strip().lower().replace(' ', '_')
return dummy
# convert all columns to lower case
columns = list(map(clean_values, df.columns))
df.columns = columns
# Get Numerical and Categorical Columns
# total charges is suppose to be an integer from examining the dataframe but it's bject
numerical = list(df.dtypes[df.dtypes != 'object'].index) + ['totalcharges']
# while senior citizen should be a cetegorical feature
numerical.remove('seniorcitizen')
# clean all non-numerical column values
categorical = list(df.dtypes[df.dtypes == 'object'].index)
# total charges should be a numerical column
for col in ['totalcharges', 'churn', 'customerid']:
categorical.remove(col)
# convert all categorical column values to lower case
df = df.apply(lambda x: x.str.lower() if x.name in categorical else x)
# add senior citizen column to categorical
categorical.append('seniorcitizen')
# Change the type of totalcharges and churn column
# total charges is an object type but holds numerical values: convert it
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')
# values that cannot be converted to integer are replaced with NaN: fill the missing values with zeroes
df['totalcharges'] = df['totalcharges'].fillna(0)
# change the type of churn column
# let yes be 1
df['churn'] = (df['churn'] == 'Yes').astype('int')
# Train, Validation and Test Dataset
# Split the dataframe into full_train, train, validation and test dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
# Reset the index values
df_full_train.reset_index(drop=True, inplace = True)
# All the target variables from the train, test and validation set
y_full_train = df_full_train['churn']
# y_train = df_train['churn']
# y_val = df_val['churn']
y_test = df_test['churn']
# Training Function
def train(df_train, y_train, c=1.0):
"""
df_train (DataFrame):
The training feature exam
y_train (Series):
The training target variables
Return:
dv (DictVectorizer): fitted one-hot encoding object
model (LogisticRegresion): trained logistic regression model
"""
# convert the data frame to list of dictionary
# each dicitionary is a key value pair of a particular record/observation
dict_df = df_train[categorical + numerical].to_dict(orient='records')
# one-hot encode the categorical variable
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_df)
model = LogisticRegression(C=c, max_iter=1000)
model.fit(X_train, y_train)
return dv, model
# Predict Function
def predict(dv, model, df):
"""
dv:
fitted one-hot encoding object
model:
trained logistic regression model
df (DataGFrame):
Test dataframe
Return (series):
predicted values of the test dataset
"""
# convert the data frame to dictionary
dict_df = df[categorical + numerical].to_dict(orient='records')
# one-hot encode the categorical variable
X_test= dv.transform(dict_df)
# predict the test data
y_pred = model.predict_proba(X_test)[:, 1]
return y_pred
# Validation
C = [0.001, 0.01, 0.1, 0.5, 1, 5, 10]
for c in tqdm(C):
scores = []
n_split = 5
# this is a generator
kfolds = KFold(n_splits=n_split, random_state=1, shuffle=True)
for train_idx, val_idx in tqdm(kfolds.split(df_full_train)):
df_train = df_full_train.iloc[train_idx]
df_val = df_full_train.iloc[val_idx]
y_train = df_train.churn.values
y_val = df_val.churn.values
dv, model = train(df_train, y_train, c)
pred_val = predict(dv, model, df_val)
auc = roc_auc_score(y_val, pred_val)
scores.append(auc)
print(f"C <=> {c}: {round(np.array(scores).mean(), 3)} +/- {round(np.array(scores).std(), 3)}")
# Final Test Prediction
c=0.1
dv, model = train(df_full_train, y_full_train, c)
pred_test = predict(dv, model, df_test)
auc = roc_auc_score(y_test, pred_test)
# Saving the Model
import pickle
file_name = f'model_c={c}.bin'
with open(file_name, 'wb') as f:
pickle.dump((dv, model), f)