-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredictionTree.py
127 lines (89 loc) · 4.67 KB
/
predictionTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Applying decision tree algorithm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Importing the dataset
df = pd.read_csv("./bin/public/data.csv")
heart_data = df
# number of dead people in dataset
print(heart_data[heart_data.DEATH_EVENT == 1].shape[0])
# all parameters an user can input
all_features = heart_data.columns
# all_features = ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure',
# 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time', 'DEATH_EVENT']
creatinine_phosphokinase_mean = df["creatinine_phosphokinase"].mean()
ejection_fraction_mean = df["ejection_fraction"].mean()
platelets_mean = df["platelets"].mean()
serum_creatinine_mean = df["serum_creatinine"].mean()
serum_sodium_mean = df["serum_sodium"].mean()
time_mean = df["time"].mean()
# we wont be using time feature
# input_from_doc = {'age': 75, 'anaemia': 0, 'creatinine_phosphokinase': 582, 'diabetes': 0, 'ejection_fraction': 20, 'high_blood_pressure': 1, 'platelets': 265000, 'serum_creatinine': 1.9, 'serum_sodium': 130, 'sex': 1}
# input_form_person = {'age': 40, 'anaemia': 0, 'diabetes': 1, 'high_blood_pressure': 0, 'sex': 1, 'smoking': 0}
dead_man1 = {'age': 75, 'anaemia': 0, 'diabetes': 0, 'high_blood_pressure': 1, 'sex': 1, 'smoking': 0}
dead_man2 = {'age': 55, 'anaemia': 0, 'diabetes': 0, 'high_blood_pressure': 0, 'sex': 1, 'smoking': 0}
alive_man = {'age': 40, 'anaemia': 1, 'diabetes': 0, 'high_blood_pressure': 1, 'sex': 0, 'smoking': 0}
ALIVE = False
if ALIVE:
input_form_person = alive_man
else:
input_form_person = dead_man2
append_data = {'creatinine_phosphokinase': creatinine_phosphokinase_mean, 'ejection_fraction': ejection_fraction_mean, 'platelets': platelets_mean, 'serum_creatinine': serum_creatinine_mean, 'serum_sodium': serum_sodium_mean, 'time': time_mean}
# append data if missing, but averages so it doesn't affect the result
for k,v in append_data.items():
if k not in input_form_person.keys():
input_form_person[k] = round(v,5)
print("input_form_person:", input_form_person)
# Features = ['age', 'anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking', 'DEATH_EVENT']
Features = list(input_form_person.keys()) # we set features to train our model on by observing which ones a person has put in.
print("Features:")
print(Features)
print("input_form_person:", input_form_person)
df_person = pd.DataFrame(input_form_person, index=[0]) # create person df for obtaining our result later on a trained model.
x = heart_data[Features] # select data with the features we have available data for
y = heart_data["DEATH_EVENT"]
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=2, shuffle=True)
classifier = DecisionTreeClassifier(max_leaf_nodes = 3, random_state=0, criterion='entropy')
classifier.fit(x_train, y_train) # train our model
# Predicting the test set
y_pred = classifier.predict(x_test) # binary values, array, 0 - death, 1 - alive; we don't really need these anymore
y_perc = classifier.predict_proba(x_test) # percentage values
# print(abs_weights)
# input the df rom a person to a trained model and get the result:
y_OUT_BIN = classifier.predict(df_person) # binary value if dead == 1
y_OUT = classifier.predict_proba(df_person)
def print_rez():
print("Percent , class")
# array of 2 elements means [death, live] in percentages
# so if death % is > 0.5, then the person will die with that prob
for percent, class_pred in zip(y_perc, y_pred):
if class_pred == 1: # death
print("dead:", round(percent[1],2), "%")
else: # alive
# print("alive:", round(percent[1],2), "%")
print("dead:", round(percent[1],2), "% so A")
print(len(y_perc))
print_rez() # print results of the model
# Making Confusion Matrix and calculating accuracy score
mylist = []
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred) # for out view
mylist.append(ac)
print(cm)
print(round(ac,3))
# -----------------------------------------
# printing the value of our person likelihood of dying
DEATH = round(y_OUT[0][0], 3) # return this number to the user
# We should also retunr the accuracy of our MODEL to predict (SUM) true positive and false negative.
ACCURACY = round(ac,3)
if ALIVE:
print("Our person should LIVE")
else:
print("Our person should DIE")
print("our person will die with prob: ", DEATH, "and with model accuracy", ACCURACY)
# TODO: pass/return the values it to the frontend
# Return tuple (OUT, ACCURACY)
print(y_OUT)
print(y_OUT_BIN)