forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PA1_template.Rmd
166 lines (124 loc) · 5.94 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Reproducible Research: Peer Assessment 1
## Loading and preprocessing the data
```{r}
# Load the activity data using a path relative to this document. knitr
# evaluates chunks in the document's own directory by default, so no
# machine-specific setwd() is needed and the report runs on any checkout.
# Assumes activity.csv ships zipped as activity.zip next to this file, as in
# the upstream course repository -- TODO confirm.
if (!file.exists("activity.csv") && file.exists("activity.zip")) {
  unzip("activity.zip")
}
activity <- read.csv("activity.csv", header = TRUE, sep = ",",
                     na.strings = "NA")
# Convert the date column from text to Date
activity$date <- as.Date(activity$date, format = "%Y-%m-%d")
# Tabulate missing (TRUE) versus present (FALSE) step measurements
table(is.na(activity$steps))
```
## What is mean total number of steps taken per day?
```{r}
# Sum the steps of every date and label the two resulting columns; sum() is
# called without na.rm, so a date containing an NA gets an NA total
total_nbr_steps_day <- setNames(
  aggregate(activity$steps, by = list(activity$date), FUN = sum),
  c("date", "nbr_steps")
)
# Mean and median of the daily totals, skipping NA totals
mean_raw <- with(total_nbr_steps_day, mean(nbr_steps, na.rm = TRUE))
median_raw <- with(total_nbr_steps_day, median(nbr_steps, na.rm = TRUE))
```
```{r}
# Display the mean and the median of the daily totals
print(mean_raw)
print(median_raw)
```
### Histogram of total number of steps taken per day
```{r historgram_tot_nbr_steps_day, fig.width=7, fig.height=6, echo=TRUE}
# Histogram of the daily totals with a rug mark per day; NA totals are
# dropped before plotting
hist(
  with(total_nbr_steps_day, nbr_steps[!is.na(nbr_steps)]),
  main = "Histogram of tot nbr steps per day",
  xlab = "Nbr steps per day"
)
rug(
  with(total_nbr_steps_day, nbr_steps[!is.na(nbr_steps)]),
  ticksize = 0.02
)
```
## What is the average daily activity pattern?
### Time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
```{r}
# Keep only the rows that have a recorded step count
clean_activity <- activity[!is.na(activity$steps), ]
# Attach to every row the mean step count of its date
clean_activity$avg_by_date <- with(clean_activity, ave(steps, date))
```
```{r Avg_nbr_steps_per_5m_interval, fig.width=7, fig.height=6, echo=TRUE}
# Average the steps of each 5-minute interval across all days and draw the
# result as a line, so the x-axis is the interval (as the section heading
# requires) rather than the calendar date.
interval_profile <- aggregate(steps ~ interval, data = clean_activity,
                              FUN = mean)
plot(interval_profile$interval, interval_profile$steps, type = "l",
     ylab = "Avg nbr steps per 5m interval", xlab = "Interval")
```
### Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
```{r}
# Attach to every row the mean step count of its 5-minute interval
clean_activity$avg_by_interval <- with(clean_activity, ave(steps, interval))
# Largest of the per-interval averages
max(clean_activity$avg_by_interval)
# Interval holding that largest average (first occurrence)
clean_activity$interval[which.max(clean_activity$avg_by_interval)]
```
```{r Average_nbr_of_steps_per_five_min_interval, fig.width=7, fig.height=6, echo=TRUE}
# Scatter plot of the per-interval averages with a label at the peak. The
# peak interval and its position are looked up from the data instead of being
# typed in, so the annotation stays correct if the data change.
plot(clean_activity$interval, clean_activity$avg_by_interval,
     ylab = "Avg nbr steps", xlab = "Interval",
     main = "Average nbr of steps per five min interval")
peak_row <- which.max(clean_activity$avg_by_interval)
text(clean_activity$interval[peak_row],
     clean_activity$avg_by_interval[peak_row],
     paste("Interval", clean_activity$interval[peak_row],
           "with the maximum steps"),
     pos = 4, cex = .7)
```
## Imputing missing values
### Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
```{r}
# Count the missing values in each column of the raw data
vapply(activity, function(column) sum(is.na(column)), numeric(1))
```
### New data set with imputed values
```{r}
# Load the zoo time series library
library(zoo)
# Convert the data frame into a zoo object indexed by the date column
activity_imputed <- read.zoo(activity, index.column = 2, format = "%Y-%m-%d")
# Fill each NA with the median of its own 5-minute interval. `by` has to be a
# grouping vector with one entry per row; a single number such as `by = 3`
# puts every row into one group and fills with the overall median instead.
interval_group <- coredata(activity_imputed)[, "interval"]
activity_filled <- na.aggregate(activity_imputed, by = interval_group,
                                FUN = median)
# Convert a zoo object into a plain data frame.
#
# x          a zoo object; anything else stops with an error.
# index.name name given to the column that receives the zoo index.
#
# Returns a data frame whose first column is the index of `x` and whose
# remaining columns are the series values. A zoo object without dimensions
# gets a single value column named after the expression passed as `x`.
zoo.to.data.frame <- function(x, index.name = "date") {
  stopifnot(is.zoo(x))
  if (is.null(dim(x))) {
    value_names <- deparse(substitute(x))
  } else {
    value_names <- colnames(x)
  }
  converted <- data.frame(index(x), x, row.names = NULL)
  names(converted) <- c(index.name, value_names)
  converted
}
# Turn the filled zoo object back into a data frame
activity_filled_df <- zoo.to.data.frame(activity_filled)
# Tabulate missing (TRUE) versus present (FALSE) step counts after filling
table(is.na(activity_filled_df[["steps"]]))
```
### Calculation of the mean and median with the NA-filled dataset
```{r}
# Sum the steps of every date in the NA-filled data and label the columns
total_nbr_steps_day_fill <- setNames(
  aggregate(activity_filled_df$steps,
            by = list(activity_filled_df$date), FUN = sum),
  c("date", "nbr_steps")
)
# Mean and median of the daily totals, skipping any NA totals
mean_na_filled <- with(total_nbr_steps_day_fill,
                       mean(nbr_steps, na.rm = TRUE))
median_na_filled <- with(total_nbr_steps_day_fill,
                         median(nbr_steps, na.rm = TRUE))
```
### Do these values differ from the estimates from the first part of the assignment?
```{r}
# Raw estimate minus NA-filled estimate, for the mean and for the median
print(mean_raw - mean_na_filled)
print(median_raw - median_na_filled)
# Author's conclusion: yes, the mean and median differ before and after
# filling the NAs
```
### Histogram of total number of steps taken per day
```{r Histogram_NA_filled, fig.width=7, fig.height=6, echo=TRUE}
# Histogram of the NA-filled daily totals with a rug mark per day; any NA
# totals are dropped before plotting
hist(
  with(total_nbr_steps_day_fill, nbr_steps[!is.na(nbr_steps)]),
  main = "NAs filled: histogram of tot nbr steps per day",
  xlab = "Nbr steps per day"
)
rug(
  with(total_nbr_steps_day_fill, nbr_steps[!is.na(nbr_steps)]),
  ticksize = 0.02
)
```
## Are there differences in activity patterns between weekdays and weekends?
### Create a new factor variable in the dataset with two levels - "weekday" and "weekend" indicating whether a given date is a weekday or weekend day
```{r}
# Keep the weekday name of every date as a descriptive column
activity_filled_df$weekdays <- weekdays(activity_filled_df$date)
# Classify each date as "weekend" or "weekday" and store it as a factor.
# The numeric day of week from POSIXlt (0 = Sunday, 6 = Saturday) is used
# instead of the names returned by weekdays(), because those names are
# translated in non-English locales and would never match "Saturday"/"Sunday".
is_weekend <- as.POSIXlt(activity_filled_df$date)$wday %in% c(0, 6)
activity_filled_df$wd <- factor(ifelse(is_weekend, "weekend", "weekday"),
                                levels = c("weekday", "weekend"))
# Average steps per 5-minute interval within each day type; the column is
# spelled out as `steps` instead of relying on partial matching of `$step`
activity_filled_df$ave <- ave(activity_filled_df$steps,
                              activity_filled_df$interval,
                              activity_filled_df$wd)
```
### Make a panel plot containing a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis)
```{r Lattice_timeseries_panel_plot, fig.width=7, fig.height=6, echo=TRUE}
library(lattice)
# Reduce to one row per (interval, day type) pair, sorted by interval within
# each day type, so every panel draws a single left-to-right line instead of
# retracing the same curve once per day with a return stroke in between.
panel_means <- unique(activity_filled_df[, c("interval", "wd", "ave")])
panel_means <- panel_means[order(panel_means$wd, panel_means$interval), ]
xyplot(ave ~ interval | wd, data = panel_means,
       layout = c(1, 2), xlab = "Interval", ylab = "Number of steps",
       type = "l")
```