# Import the necessary libraries
import io
import warnings

import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')  # ignore warnings so they do not clutter the app
# Load the world population dataset
st.title("EDA, Wrangling and ML on World Population")
df = pd.read_csv('world_population.csv')
# Preview the first rows to understand the data
st.write("## Raw Data")
st.write(df.head())
# Check data types and null counts; df.info() prints to stdout and returns None,
# so capture its output in a text buffer before writing it to the app
buffer = io.StringIO()
df.info(buf=buffer)
st.text(buffer.getvalue())
st.write("## Statistics of Data")
st.write(df.describe())
# Show the column names so features (X) and target (y) can be chosen
st.write("## Column Names")
st.write(df.columns)
# Define the feature matrix X and the raw target a (encoded into y further below)
X = df[['2022 Population', '2020 Population', '2015 Population',
        '2010 Population', '2000 Population', '1990 Population']]
a = df['World Population Percentage']  # continuous target; encoded into y in the next section
# Plot a scatter plot of 2022 population against world population percentage
st.write("## Scatter Plot")
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='2022 Population', y='World Population Percentage', ax=ax)
st.pyplot(fig)
# Plot a bar plot
st.write("## Bar Plot")
fig, ax = plt.subplots()
sns.barplot(data=df, x='World Population Percentage', y='2022 Population', ax=ax)
st.pyplot(fig)
# Plot a line chart of the population columns
st.write("## Line Chart")
st.line_chart(X)
# Plot a correlation heatmap (numeric columns only, since the dataset also contains text columns)
st.write("## Heatmap")
fig, ax = plt.subplots()
sns.heatmap(df.corr(numeric_only=True), annot=True, ax=ax)
st.pyplot(fig)
# The target is continuous, so label-encode it into discrete classes for the classifiers below
from sklearn import preprocessing
lab = preprocessing.LabelEncoder()
y = lab.fit_transform(a)
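# Optional illustrative check (not in the original script): LabelEncoder assigns every
# distinct percentage value its own class, so the number of classes can approach the
# number of rows, which is worth keeping in mind when reading the scores below.
st.write("Number of encoded target classes:", len(np.unique(y)))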
# Split the dataset into train and test parts; a fixed random_state reproduces the same
# split on every run, whereas random_state=None would give different results each time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
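# Optional sanity check (illustrative addition, not in the original script):
# confirm the 80/20 split produced the expected number of rows
st.write("Training rows:", X_train.shape[0], "Test rows:", X_test.shape[0])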
# Import the classifiers and scoring metrics used for the model comparison
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Models to compare, all with their default hyperparameters
models = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier()]
model_names = ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest', 'KNN']
# Add a sidebar control to select the scoring method
scoring_method = st.sidebar.selectbox("Select the scoring method",
                                      ['accuracy', 'precision', 'recall', 'f1'])
models_scores = []
# Fit each model, predict on the test set, and score it with the chosen metric
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    if scoring_method == 'accuracy':
        score = accuracy_score(y_test, y_pred)
    elif scoring_method == 'precision':
        # 'micro' averaging because the encoded target is multiclass
        score = precision_score(y_test, y_pred, average='micro')
    elif scoring_method == 'recall':
        score = recall_score(y_test, y_pred, average='micro')
    else:
        score = f1_score(y_test, y_pred, average='micro')
    models_scores.append([model_name, score])
# Sort the models from best to worst score and display the results
sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
st.write("## Model Scoring Results")
for model_name, score in sorted_models:
    st.write("{}: {:.2f}".format(model_name, score))