-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBigQuery.sql
249 lines (220 loc) · 7.23 KB
/
BigQuery.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
## merging all files in a temporary table
## excluded start_station_id and end station_id since there were some inconsistencies and the columns were not necessary for this analysis
BEGIN
CREATE TEMP TABLE all_data AS
SELECT * FROM(
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.apr_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.aug_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.dec_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.feb_2021`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.jan_2021`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.jul_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.jun_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.mar_2021`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.may_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.nov_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.oct_2020`
UNION ALL
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual,
trip_duration,
weekday
FROM `capstone-385619.bikeshare.sep_2020`);
END
## 'docked_bike' is an old name for 'classic_bike'
UPDATE `capstone-385619._script6b214dfd5465895ad9f713fd8760bf1dcfa33f76.all_data`
SET rideable_type = 'classic_bike' WHERE rideable_type = 'docked_bike'
## temp table with "bad data"
BEGIN
CREATE TEMP TABLE null_stations2 AS
SELECT ride_id AS bad_ride_id FROM (
SELECT ride_id,
start_station_name,
end_station_name
FROM `capstone-385619._script6b214dfd5465895ad9f713fd8760bf1dcfa33f76.all_data`
WHERE rideable_type = 'classic_bike' AND (start_station_name IS NULL OR end_station_name IS NULL));
END
## table with "bad data" excluded
BEGIN
CREATE TEMP TABLE cleaned_combined_data AS
SELECT * FROM `capstone-385619._script6b214dfd5465895ad9f713fd8760bf1dcfa33f76.all_data` AS cd
LEFT JOIN `capstone-385619._script0988d88bc82ec60de2cad2302c9cf2999b9d2547.null_stations2` AS ns
ON cd.ride_id = ns.bad_ride_id
WHERE ns.bad_ride_id IS NULL;
END
## final table ready for analysis
BEGIN
CREATE TEMP TABLE final_data AS
SELECT * FROM(
SELECT ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
end_station_name,
member_casual AS member_type,
trip_duration,
weekday,
CASE
WHEN EXTRACT(MONTH FROM started_at) = 1 THEN 'January'
WHEN EXTRACT(MONTH FROM started_at) = 2 THEN 'February'
WHEN EXTRACT(MONTH FROM started_at) = 3 THEN 'March'
WHEN EXTRACT(MONTH FROM started_at) = 4 THEN 'April'
WHEN EXTRACT(MONTH FROM started_at) = 5 THEN 'May'
WHEN EXTRACT(MONTH FROM started_at) = 6 THEN 'June'
WHEN EXTRACT(MONTH FROM started_at) = 7 THEN 'July'
WHEN EXTRACT(MONTH FROM started_at) = 8 THEN 'August'
WHEN EXTRACT(MONTH FROM started_at) = 9 THEN 'September'
WHEN EXTRACT(MONTH FROM started_at) = 10 THEN 'October'
WHEN EXTRACT(MONTH FROM started_at) = 11 THEN 'November'
ELSE 'December'
END AS month
FROM `capstone-385619._script3a551157e7d7cde40cceb65be3281e4165b9e1ea.cleaned_combined_data`
);
END
## Data Analysis
## added a WHERE clause because bikes with trip durations longer than a day are considered stolen or in repair
## number of rides by member_type and bike_type(rideable_type)
SELECT COUNT(*) AS number_of_rides, rideable_type, member_type
FROM `capstone-385619._script2fed36877decab554777bc3d3c01cc24dd9889df.final_data`
WHERE trip_duration <= 1440
GROUP BY member_type, rideable_type
## average ride duration by member_type, month
SELECT ROUND(AVG(trip_duration),0) AS average_duration, member_type, month
FROM `capstone-385619._script2fed36877decab554777bc3d3c01cc24dd9889df.final_data`
WHERE trip_duration <= 1440
GROUP BY member_type, month
## average ride duration by member_type, day
SELECT ROUND(AVG(trip_duration),0) AS average_duration, member_type, weekday
FROM `capstone-385619._script2fed36877decab554777bc3d3c01cc24dd9889df.final_data`
WHERE trip_duration <= 1440
GROUP BY member_type, weekday
## average ride duration by member_type
SELECT ROUND(AVG(trip_duration),0)AS average_trip, member_type
FROM `capstone-385619._script2fed36877decab554777bc3d3c01cc24dd9889df.final_data`
WHERE trip_duration <= 1440
GROUP BY member_type
## number of rides per month, member_type
SELECT COUNT(*) AS num_of_trips, member_type, month
FROM `capstone-385619._script2fed36877decab554777bc3d3c01cc24dd9889df.final_data`
WHERE trip_duration <= 1440
GROUP BY member_type, month
## average number of rides by member_type
SELECT COUNT(*) AS number_of_trips, member_type
FROM `capstone-385619._script2fed36877decab554777bc3d3c01cc24dd9889df.final_data`
WHERE trip_duration <= 1440
GROUP BY member_type