You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# NOTE(review): hard-coded, machine-specific working directory; the relative
# path "activity.csv" below is resolved against it.
setwd("C:\\Users\\Kingsbury\\Documents\\GitHub\\RepData_PeerAssessment1")
# Data load: read the CSV, treating the literal string "NA" as missing
activity <- read.csv("activity.csv", header = TRUE, sep = ",",
                     na.strings = "NA")
# Convert the date column to Date format (year-month-day)
activity$date <- as.Date(activity$date, format = "%Y-%m-%d")
# show the table of NAs
table(is.na(activity$steps))
##
## FALSE TRUE
## 15264 2304
What is mean total number of steps taken per day?
# calc the total nbr steps taken per day (one row per distinct date)
total_nbr_steps_day <- aggregate(activity$steps, by = list(activity$date),
                                 FUN = sum)
# rename columns
colnames(total_nbr_steps_day) <- c("date", "nbr_steps")
# calculate mean and median of the daily totals, remove NAs
mean_raw <- mean(total_nbr_steps_day$nbr_steps, na.rm = TRUE)
median_raw <- median(total_nbr_steps_day$nbr_steps, na.rm = TRUE)
# print the values to screen
mean_raw
## [1] 10766
median_raw
## [1] 10765
Histogram of total number of steps taken per day
# Histogram of the daily totals, with a rug of the individual values along
# the x-axis; days whose total is NA are dropped before plotting.
with(total_nbr_steps_day, {
  observed_totals <- nbr_steps[!is.na(nbr_steps)]
  hist(observed_totals,
       main = "Histogram of tot nbr steps per day",
       xlab = "Nbr steps per day")
  rug(observed_totals, ticksize = 0.02)
})
What is the average daily activity pattern?
Time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# removing NAs: keep only the rows with an observed step count
clean_activity <- subset(activity, !is.na(steps))
# calculating: every row receives the mean step count of its own date
clean_activity$avg_by_date <- ave(clean_activity$steps, clean_activity$date)
# draw the time series plot
# NOTE(review): this plots the per-date average against the date, not the
# per-interval average against the 5-minute interval -- confirm which one is
# intended here.
plot(clean_activity$date, clean_activity$avg_by_date, type = "l",
     ylab = "Avg nbr steps per 5m interval", xlab = "Date")
Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# calculate average nbr steps per interval; every row receives the mean of
# its own interval, so the value repeats across rows of the same interval
clean_activity$avg_by_interval <- ave(clean_activity$steps,
                                      clean_activity$interval)
# show the maximum of the per-interval averages
max(clean_activity[, "avg_by_interval"])
## [1] 206.2
# which interval contains the maximum (first row that attains it)
clean_activity$interval[which.max(clean_activity$avg_by_interval)]
## [1] 835
# plot average nbr steps per interval and annotate the maximum
plot(clean_activity$interval, clean_activity$avg_by_interval,
     ylab = "Avg nbr steps", xlab = "Interval",
     main = "Average nbr of steps per five min interval")
# the label text and its position are fixed values, not derived from the data
text(1250, 205, "Interval 835 with the maximum steps", cex = 0.7)
Imputing missing values
Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
# Count the missing values in each column of the raw data
vapply(activity, function(column) sum(is.na(column)), numeric(1))
## steps date interval
## 2304 0 0
New data set with imputed values
# load a time series library zoo
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# convert the data frame into a time series object for usage in zoo;
# column 2 (the date) is used as the index
activity_imputed <- read.zoo(activity, index.column = 2, format = "%Y-%m-%d")
## Warning: some methods for "zoo" objects do not work if the index entries
## in 'order.by' are not unique
# fill NAs in the zoo ts object using the median, via na.aggregate
# NOTE(review): `by = 3` is a single constant, which presumably places every
# row in one group, so NAs would be filled with the overall column median
# rather than a per-interval median -- confirm against the zoo documentation.
activity_filled <- na.aggregate(activity_imputed, by = 3, FUN = median)
## helper function to convert a time series object to a dataframe
##
## x          : a zoo object (anything else stops with an error)
## index.name : name given to the column that holds the index of x
##
## Returns a data frame with the index of x as its first column, followed by
## the data of x. The data columns are named after colnames(x), or, when x
## has no dimensions, after the expression that was passed as x.
zoo.to.data.frame <- function(x, index.name = "date") {
  stopifnot(is.zoo(x))
  xn <- if (is.null(dim(x))) {
    deparse(substitute(x))
  } else {
    colnames(x)
  }
  setNames(data.frame(index(x), x, row.names = NULL), c(index.name, xn))
}
# convert ts object back to a dataframe
activity_filled_df <- zoo.to.data.frame(activity_filled)
# check NAs
table(is.na(activity_filled_df$steps))
##
## FALSE
## 17568
Calculations of median and mean with new NAs filled dataset
# calc the total nbr steps taken per day on the NA-filled data
total_nbr_steps_day_fill <- aggregate(activity_filled_df$steps,
                                      by = list(activity_filled_df$date),
                                      FUN = sum)
# rename columns
colnames(total_nbr_steps_day_fill) <- c("date", "nbr_steps")
# calculate mean and median of the daily totals, remove NAs
mean_na_filled <- mean(total_nbr_steps_day_fill$nbr_steps, na.rm = TRUE)
median_na_filled <- median(total_nbr_steps_day_fill$nbr_steps, na.rm = TRUE)
Do these values differ from the estimates from the first part of the assignment?
# Change in the daily mean: raw estimate minus NA-filled estimate
mean_raw - mean_na_filled
## [1] 1412
# Change in the daily median: raw estimate minus NA-filled estimate
median_raw - median_na_filled
## [1] 370
## Yes, there is a difference between mean/median before and after filling of
## NAs
Histogram of total number of steps taken per day
# Histogram of the daily totals after NA filling, with a rug of the individual
# values along the x-axis; any NA totals are dropped before plotting.
with(total_nbr_steps_day_fill, {
  filled_totals <- nbr_steps[!is.na(nbr_steps)]
  hist(filled_totals,
       main = "NAs filled: histogram of tot nbr steps per day",
       xlab = "Nbr steps per day")
  rug(filled_totals, ticksize = 0.02)
})
Are there differences in activity patterns between weekdays and weekends?
Create a new factor variable in the dataset with two levels - "weekday" and "weekend" indicating whether a given date is a weekday or weekend day
# create a new column with the day name of each date (text depends on locale)
activity_filled_df$weekdays <- weekdays(activity_filled_df$date)
# create a new factor column "wd" with two levels, weekday and weekend.
# The numeric day of week from POSIXlt (0 = Sunday, 6 = Saturday) is used
# so the classification does not depend on the locale's day names.
activity_filled_df$wd <- factor(
  ifelse(as.POSIXlt(activity_filled_df$date)$wday %in% c(0L, 6L),
         "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
# new column with averages per 5 min interval, computed separately for
# weekdays and weekends; `steps` is spelled out in full rather than relying
# on partial matching of `$step`
activity_filled_df$ave <- ave(activity_filled_df$steps,
                              activity_filled_df$interval,
                              activity_filled_df$wd)
Make a panel plot containing a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken
library(lattice)
# The per-interval average is repeated on every row of activity_filled_df.
# Keep one row per (interval, wd) pair, ordered by interval, so type = "l"
# draws a single line per panel instead of retracing it for every day.
panel_data <- unique(activity_filled_df[, c("interval", "wd", "ave")])
panel_data <- panel_data[order(panel_data$wd, panel_data$interval), ]
# one panel per level of wd, stacked vertically (1 column, 2 rows)
xyplot(ave ~ interval | wd, data = panel_data,
       layout = c(1, 2), xlab = "Interval", ylab = "Number of steps",
       type = "l")