-
Notifications
You must be signed in to change notification settings - Fork 0
/
titanic_comp.py
55 lines (39 loc) · 2.27 KB
/
titanic_comp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import numpy as np
from sklearn.svm._libsvm import fit
# Load the training and test tables from CSV files in the working directory.
data, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Save the test passenger ids now: clean() drops "PassengerId", but the ids
# are needed again when the submission file is written.
test_ids = test["PassengerId"]
# First, clean the data so the model has the best chance of predicting well.
def clean(data):
    """Return a cleaned copy of a Titanic passenger table.

    The "Ticket", "Cabin", "Name" and "PassengerId" columns are dropped,
    missing values in "SibSp", "Parch", "Fare" and "Age" are replaced by the
    median of their own column, and missing "Embarked" values are replaced
    by the placeholder "U".

    Args:
        data: DataFrame containing at least the columns named above.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        KeyError: If one of the expected columns is absent from ``data``.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    data = data.drop(["Ticket", "Cabin", "Name", "PassengerId"], axis=1)

    # Fill the gaps in the numeric columns with each column's own median.
    # The result is assigned back instead of calling fillna(inplace=True) on
    # the selected column: that chained in-place form is deprecated and, with
    # pandas Copy-on-Write, only changes a temporary, leaving the NaNs in place.
    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        data[col] = data[col].fillna(data[col].median())

    # "U" marks an unknown port of embarkation.
    data["Embarked"] = data["Embarked"].fillna("U")
    return data
# Run the same cleaning step over the training set and the test set.
data, test = clean(data), clean(test)

# Convert the text columns "Sex" and "Embarked" into integer codes with a
# LabelEncoder. The encoder is fitted on the training values and the same
# mapping is then applied to the test set, so both sets share one encoding.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
cols = ["Sex", "Embarked"]
for col in cols:
    data[col] = le.fit_transform(data[col])
    test[col] = le.transform(test[col])
    # Show the labels the encoder learned for this column.
    # NOTE(review): the original indentation was lost; this print is assumed
    # to sit inside the loop - confirm.
    print(le.classes_)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Y is the target column: whether a passenger survived (0 or 1).
Y = data["Survived"]
# X holds every remaining column and serves as the feature matrix.
X = data.drop("Survived", axis=1)

# Hold out 20% of the training data as a validation set; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, Y_train)

# Predict on the DataFrame itself rather than on np.array(X_val): the model
# was fitted with column names, and stripping them makes scikit-learn warn
# ("X does not have valid feature names") and skip its column check.
prediction = clf.predict(X_val)

from sklearn.metrics import accuracy_score

# Measure accuracy by comparing the validation predictions with the known labels.
print(accuracy_score(Y_val, prediction))

# Predict survival for the cleaned test set.
Submission_preds = clf.predict(test)

# Pair each saved passenger id with its prediction and write the result to
# "Submission.csv" without the index column.
df = pd.DataFrame(
    {
        "PassengerId": test_ids.values,
        "Survived": Submission_preds,
    }
)
df.to_csv("Submission.csv", index=False)