-
Notifications
You must be signed in to change notification settings - Fork 0
/
Google Capstone.Rmd
219 lines (136 loc) · 5.76 KB
/
Google Capstone.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
---
title: "Google Capstone"
author: "Zazzini Marco"
date: '2023-10-20'
output:
  word_document: default
  html_document: default
---
```{r setup, include=FALSE}
# Echo the source of every chunk in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
# Load libraries
# NOTE(review): tidyverse already attaches dplyr and ggplot2, so the explicit
# calls below are redundant but harmless. Attach order decides which package
# wins a name clash (lubridate and hms both appear to export `hms()`), so keep
# this order unless that has been checked.
library(tidyverse)
library(lubridate)
library(janitor)
library(dplyr)
library(ggplot2)
library(hms)
```
```{r}
# Load the six monthly Divvy trip exports used in this analysis.
# NOTE(review): the files are 202302-202307 (February-July 2023) while the
# variables are labelled Mar-Aug; confirm which range is intended. The names
# are kept as they are because the combine step refers to them.
#
# The data folder is written once and can be overridden with the
# DIVVY_DATA_DIR environment variable; the default is the original location.
divvy_dir <- Sys.getenv(
  "DIVVY_DATA_DIR",
  unset = "C:/Users/Utente/Downloads/Downloads/divvy"
)

# Read one monthly export, identified by its "yyyymm" file-name prefix.
read_divvy_month <- function(yyyymm) {
  read.csv(file.path(divvy_dir, paste0(yyyymm, "-divvy-tripdata.csv")))
}

trip23_Mar <- read_divvy_month("202302")
trip23_Apr <- read_divvy_month("202303")
trip23_May <- read_divvy_month("202304")
trip23_Jun <- read_divvy_month("202305")
trip23_Jul <- read_divvy_month("202306")
trip23_Aug <- read_divvy_month("202307")
```
```{r}
# Stack the six monthly data frames row-wise into a single table.
trips23 <- do.call(
  rbind,
  list(trip23_Mar, trip23_Apr, trip23_May, trip23_Jun, trip23_Jul, trip23_Aug)
)
```
```{r}
# Remove the coordinate, station-id and end-station-name columns; none of
# them is used by the analysis below.
trips23 <- select(
  trips23,
  -c(
    start_lat, start_lng, end_lat, end_lng,
    start_station_id, end_station_id, end_station_name
  )
)
```
```{r}
# Quick structural overview of the combined table.
names(trips23)        # column names
nrow(trips23)         # number of rows
dim(trips23)          # rows x columns
head(trips23, n = 6)  # first six rows
str(trips23)          # column types
summary(trips23)      # per-column summary
```
```{r}
# Derive calendar fields from the ride start timestamp.
# as.Date() reads the date in its default yyyy-mm-dd format; the remaining
# columns are text renderings of that date.
trips23 <- trips23 %>%
  mutate(
    date = as.Date(started_at),
    month = format(date, "%m"),
    day = format(date, "%d"),
    year = format(date, "%Y"),
    day_of_week = format(date, "%A") # weekday name in the session locale
  )
```
```{r}
# Time of day (hh:mm:ss) at which each ride started (`time`) and ended
# (`time2`).
# The timestamps are parsed directly: calling format(x, format = "%H:%M:%S")
# on a text column does not extract the time (the `format` argument is ignored
# for character input and the strings are only padded), so that step was a
# misleading no-op. ymd_hms() parses the full timestamp and as_hms() keeps
# its time-of-day part.
trips23$time <- as_hms(ymd_hms(trips23$started_at))
trips23$time2 <- as_hms(ymd_hms(trips23$ended_at))
```
```{r}
# Ride duration in minutes.
# Computed from the full start/end timestamps rather than from the
# time-of-day columns: a time-of-day difference goes negative for any ride
# that crosses midnight and ignores whole days on longer rides.
# `units = "mins"` is explicit because difftime() otherwise picks the unit
# automatically, so dividing the result by 60 is only correct when it happens
# to pick seconds. The value is already numeric, so no character round-trip
# is needed.
trips23$ride_length <- as.numeric(
  difftime(
    ymd_hms(trips23$ended_at),
    ymd_hms(trips23$started_at),
    units = "mins"
  )
)
```
```{r}
# Inspect the structure again to confirm the derived columns and their types.
utils::str(trips23)
```
```{r}
# Drop rides that start at the "HQ QR" station and rides whose duration is
# negative or missing.
# The conditions are written to be NA-safe: indexing a data frame with an NA
# logical produces an all-NA row instead of keeping or dropping the real one.
keep_ride <- !(trips23$start_station_name %in% "HQ QR") &
  !is.na(trips23$ride_length) &
  trips23$ride_length >= 0
trips23 <- trips23[keep_ride, ]
head(trips23)
```
```{r}
# Compare ride duration between casual riders and members: mean, median,
# longest and shortest ride per membership type.
# Passing `data = trips23` keeps the output headers as `member_casual` and
# `ride_length` instead of `trips23$member_casual` / `trips23$ride_length`.
aggregate(ride_length ~ member_casual, data = trips23, FUN = mean)
aggregate(ride_length ~ member_casual, data = trips23, FUN = median)
aggregate(ride_length ~ member_casual, data = trips23, FUN = max)
aggregate(ride_length ~ member_casual, data = trips23, FUN = min)
```
```{r}
# Turn day_of_week into an ordered factor running Sunday -> Saturday.
# The level names are generated with the same format(..., "%A") call that
# produced the column, so they match in every locale; hard-coded Italian
# names would turn every value into NA under any other locale.
# 2023-01-01 was a Sunday, so the seven dates below cover Sunday..Saturday.
weekday_levels <- format(as.Date("2023-01-01") + 0:6, "%A")
trips23$day_of_week <- ordered(trips23$day_of_week, levels = weekday_levels)
```
```{r}
# Number of rides per membership type and weekday.
# `.groups = "drop"` returns an ungrouped table and avoids the regrouping
# message that summarise() emits after a multi-column group_by().
trips23 %>%
  mutate(day_of_week = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(number_of_rides = n(), .groups = "drop")
```
```{r}
# Rides per weekday, split by membership type.
# day_of_week is rebuilt as an ordered factor (Sunday -> Saturday) so the
# x axis follows the week; as plain text the days would be drawn in
# alphabetical order. The levels come from the same format(..., "%A") call as
# the values, so this works in any locale (2023-01-01 was a Sunday).
weekday_levels <- format(as.Date("2023-01-01") + 0:6, "%A")
trips23$day_of_week <- factor(
  format(as.Date(trips23$date), "%A"),
  levels = weekday_levels,
  ordered = TRUE
)

trips23 %>%
  group_by(member_casual, day_of_week) %>%
  summarise(number_of_rides = n(), .groups = "drop") %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#000000", "#56B4E9")) +
  labs(
    x = "Day of Week",
    y = "Total Number of Rides",
    title = "Rides per Week",
    fill = "Membership"
  ) +
  scale_y_continuous(
    breaks = c(250000, 450000, 550000),
    labels = c("250K", "450K", "550K")
  )
# Author's reading of the chart: casual riders ride most often on Friday
# (venerdì), Saturday (sabato) and Sunday (domenica), while members ride
# more, on average, throughout the week.
```
```{r}
# Monthly ride counts per membership type, drawn as side-by-side bars.
# The mean ride duration is computed alongside the count but not plotted.
trips23 %>%
  group_by(member_casual, month) %>%
  summarise(
    total_rides = n(),
    `average_duration_(mins)` = mean(ride_length)
  ) %>%
  arrange(member_casual) %>%
  ggplot(aes(x = month, y = total_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    x = "Month",
    y = "Total Number of Rides",
    title = "Rides per Month",
    fill = "Membership"
  ) +
  scale_fill_manual(values = c("#000000", "#56B4E9")) +
  scale_y_continuous(
    breaks = c(150000, 250000, 350000, 450000),
    labels = c("150K", "250K", "350K", "450K")
  ) +
  theme(axis.text.x = element_text(angle = 45))
# Author's reading of the chart:
# - casual riders predominate during the summer months;
# - during the winter months there is very little activity from regular
#   (presumably casual) users;
# - over the long term, members out-ride regular/casual users.
```
```{r}
# The company is known to own two types of bicycle; count the rentals of each
# rideable type, split by membership, to see which is used more often.
trips23 %>%
  ggplot(aes(x = rideable_type, fill = member_casual)) +
  geom_bar(position = "dodge") +
  labs(
    x = "Type of Bike",
    y = "Number of Rentals",
    title = "Bikes",
    fill = "Membership"
  ) +
  scale_fill_manual(values = c("#000000", "#56B4E9")) +
  scale_y_continuous(
    breaks = c(500000, 1000000, 1500000),
    labels = c("500K", "1Mil", "1.5Mil")
  )
```
```{r}
# Average ride duration per weekday, split by membership type.
# The ride count is computed alongside the mean but not plotted.
trips23 %>%
  mutate(day_of_week = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    x = "Days of the week",
    y = "Average Mins",
    title = "Average ride time",
    fill = "Membership"
  ) +
  scale_fill_manual(values = c("#000000", "#56B4E9"))
```