-
Notifications
You must be signed in to change notification settings - Fork 0
/
20241113_DataPreparation_CyclisticBikeShareCapstone.R
76 lines (61 loc) · 3 KB
/
20241113_DataPreparation_CyclisticBikeShareCapstone.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -----------------------------------------------------------
# title: "Cyclistic Bike Share: Data Preparation"
# purpose: prepares the Cyclistic trip data (Prepare and Process phases):
# loads the quarterly files, aligns their columns, merges and exports them
# author: "Carla Cotas"
# date: "13.Nov.2024"
# version: 1.0
# -----------------------------------------------------------
# install and load packages
# Only packages that are not yet installed are installed, so re-running the
# script does not reinstall everything or require network access each time.
required_packages <- c("tidyverse", "dplyr", "lubridate")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(tidyverse)
library(dplyr) # column rename, datasets merge
library(lubridate)
###PREPARE###
# load the two quarterly trip files under shorter, more readable names
CyclisticTripData_2019_Q4 <- read.csv(file = "Divvy_Trips_2019_Q4.csv")
CyclisticTripData_2020_Q1 <- read.csv(file = "Divvy_Trips_2020_Q1.csv")
# print the column names of each quarter so the number of columns and their
# names can be compared before moving to the next steps
names(CyclisticTripData_2019_Q4)
names(CyclisticTripData_2020_Q1)
# print a glimpse() of every quarter first, then a str() of every quarter
CyclisticTripData_dfs <- list(
  CyclisticTripData_2019_Q4,
  CyclisticTripData_2020_Q1
)
invisible(lapply(CyclisticTripData_dfs, glimpse))
invisible(lapply(CyclisticTripData_dfs, str))
###PROCESS###
# rename the 2019 Q4 columns so both quarters use uniform column names
# (new name on the left, original 2019 Q4 name on the right).
# The assignment is intentionally not wrapped in parentheses: `(x <- ...)`
# auto-prints the result, which would dump the trip data frame to the console.
# NOTE(review): `bikeid` is mapped onto `rideable_type`; the names suggest an
# identifier vs. a category - confirm this mapping is intended.
CyclisticTripData_2019_Q4 <- rename(
  CyclisticTripData_2019_Q4,
  ride_id = trip_id,
  started_at = start_time,
  ended_at = end_time,
  rideable_type = bikeid,
  start_station_id = from_station_id,
  start_station_name = from_station_name,
  end_station_id = to_station_id,
  end_station_name = to_station_name,
  member_casual = usertype
)
# convert the 'ride_id' and 'rideable_type' columns of the 2019 Q4 data
# to character
CyclisticTripData_2019_Q4 <- CyclisticTripData_2019_Q4 %>%
  mutate(across(c(ride_id, rideable_type), as.character))
# stack the two quarterly data frames into one large data frame
CyclisticTripData <- CyclisticTripData_2019_Q4 %>%
  bind_rows(CyclisticTripData_2020_Q1)
# exploratory verification of the combined data frame
names(CyclisticTripData)
glimpse(CyclisticTripData)
head(CyclisticTripData, n = 6L) # first six rows
tail(CyclisticTripData, n = 6L) # last six rows
# drop the columns 'birthyear', 'gender', 'start_lat', 'start_lng',
# 'end_lat' and 'end_lng' from the combined data frame
columns_to_drop <- c(
  "birthyear",
  "gender",
  "start_lat",
  "start_lng",
  "end_lat",
  "end_lng"
)
# TRUE for every column that is not listed in columns_to_drop
is_kept_column <- !(names(CyclisticTripData) %in% columns_to_drop)
CyclisticTripData <- CyclisticTripData[, is_kept_column]
# write the combined data frame to 'CyclisticTripData.csv' in the working
# directory, without a row-names column
write.csv(
  x = CyclisticTripData,
  file = "CyclisticTripData.csv",
  row.names = FALSE
)