-
Notifications
You must be signed in to change notification settings - Fork 0
/
Divvy_Trips_2019_Cleaning.sql
196 lines (135 loc) · 8.13 KB
/
Divvy_Trips_2019_Cleaning.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
-- Briefly view the structure of the imported tables
USE PortfolioProject;
SELECT TOP 5 *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1]
SELECT TOP 5 *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2]
SELECT TOP 5 *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3]
SELECT TOP 5 *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4]
-- Modify a column to match other tables
EXEC sp_rename 'dbo.Divvy_Trips_2019_Q2.bike_id', 'bikeid', 'COLUMN';
-- Remove first row of invalid data in the Q2 table
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2]
WHERE trip_id LIKE '%Rental%'
DELETE FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2]
WHERE trip_id LIKE '%Rental%'
-- Join all table into a single one
-- First make sure all tables are of the same data type to allow union all
-- Trip duration should be converted to float but due to data type difference in the actual data it wouldn't work
-- As such the column was kept as nvarchar
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN trip_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN start_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN end_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN bikeid INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN tripduration NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN from_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN from_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN to_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN to_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN usertype NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2] ALTER COLUMN birthyear SMALLINT;
-- Convert all tables to same data type
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN trip_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN start_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN end_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN bikeid INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN tripduration NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN from_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN from_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN to_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN to_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN usertype NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1] ALTER COLUMN birthyear SMALLINT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN trip_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN start_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN end_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN bikeid INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN tripduration NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN from_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN from_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN to_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN to_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN usertype NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3] ALTER COLUMN birthyear SMALLINT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN trip_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN start_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN end_time DATETIME;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN bikeid INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN tripduration NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN from_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN from_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN to_station_id INT;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN to_station_name NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN usertype NVARCHAR(50);
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4] ALTER COLUMN birthyear SMALLINT;
-- Union all the tables
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1]
UNION ALL
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2]
UNION ALL
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3]
UNION ALL
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4];
;
-- Save the union into a new table
SELECT *
INTO [PortfolioProject].[dbo].[Divvy_Trips_2019]
FROM (
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q1]
UNION ALL
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q2]
UNION ALL
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q3]
UNION ALL
SELECT * FROM [PortfolioProject].[dbo].[Divvy_Trips_2019_Q4]
) AS CombinedData;
SELECT *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];
-- We create addition cloumn to help understand the data
SELECT start_time, CAST(start_time AS DATE) AS start_date_new, CAST(start_time AS TIME) AS start_time_new
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];
SELECT end_time, CAST(end_time AS DATE) AS end_date_new, CAST(end_time AS TIME) AS end_time_new
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];
SELECT start_time, DATENAME(WEEKDAY, start_time) AS day_of_week
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];
SELECT start_time, end_time, DATEDIFF(MINUTE, start_time, end_time) AS ride_length_minutes
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];
-- Add these new columns to the table
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019]
ADD start_date_new DATE,
start_time_new TIME,
end_date_new DATE,
end_time_new TIME,
day_of_week NVARCHAR(20),
ride_length_minutes INT;
-- Update the columns
UPDATE [PortfolioProject].[dbo].[Divvy_Trips_2019]
SET start_date_new = CAST(start_time AS DATE);
UPDATE [PortfolioProject].[dbo].[Divvy_Trips_2019]
SET start_time_new = CAST(start_time AS TIME);
UPDATE [PortfolioProject].[dbo].[Divvy_Trips_2019]
SET end_date_new = CAST(end_time AS DATE);
UPDATE [PortfolioProject].[dbo].[Divvy_Trips_2019]
SET end_time_new = CAST(end_time AS TIME);
UPDATE [PortfolioProject].[dbo].[Divvy_Trips_2019]
SET day_of_week = DATENAME(WEEKDAY, start_time);
UPDATE [PortfolioProject].[dbo].[Divvy_Trips_2019]
SET ride_length_minutes = DATEDIFF(MINUTE, start_time, end_time);
-- View table structure to confirm update
SELECT TOP 10 *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];
-- Removing some columns that would not be needed for the analysis
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019]
DROP COLUMN from_station_name;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019]
DROP COLUMN to_station_name;
-- Since we have split the start_time and end_time, we can remove the original columns
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019]
DROP COLUMN start_time;
ALTER TABLE [PortfolioProject].[dbo].[Divvy_Trips_2019]
DROP COLUMN end_time;
-- Viewing the table once again
SELECT TOP 20 *
FROM [PortfolioProject].[dbo].[Divvy_Trips_2019];