# Load the raw activity data; the literal string "NA" in the CSV is read as a
# missing value.
activity <- read.csv("activity.csv", header = TRUE, sep = ",",
                     na.strings = "NA")
# Convert the date column from text to Date (year-month-day format).
activity$date <- as.Date(activity$date, format = "%Y-%m-%d")
# show the table of NAs
## 15264 2304
What is the mean total number of steps taken per day?
# Total number of steps per day. sum() is called without na.rm, so a day that
# contains a missing step count gets an NA total.
total_nbr_steps_day <- aggregate(
  activity$steps,
  by = list(activity$date),
  FUN = sum
)
# Rename the generic aggregate() output columns.
colnames(total_nbr_steps_day) <- c("date", "nbr_steps")
# Mean and median of the daily totals, ignoring days whose total is NA.
mean_raw <- mean(total_nbr_steps_day$nbr_steps, na.rm = TRUE)
median_raw <- median(total_nbr_steps_day$nbr_steps, na.rm = TRUE)
# Print the values to screen.
mean_raw
median_raw
## [1] 10766
## [1] 10765
Histogram of total number of steps taken per day
# Draw the histogram of the daily totals, leaving out days whose total is NA.
hist(
  total_nbr_steps_day$nbr_steps[!is.na(total_nbr_steps_day$nbr_steps)],
  main = "Histogram of tot nbr steps per day", xlab = "Nbr steps per day"
)
# Mark each individual daily total along the x-axis.
# NOTE(review): the tail of this call was lost in the source text; ticksize is
# taken from the matching rug() call on the NA-filled data -- confirm.
rug(
  total_nbr_steps_day$nbr_steps[!is.na(total_nbr_steps_day$nbr_steps)],
  ticksize = 0.02
)
What is the average daily activity pattern?
Time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# Keep only the rows that have an observed step count.
clean_activity <- subset(activity, !is.na(activity$steps))
# Per-date average of steps, repeated on every row of that date
# (ave() applies mean within each group by default).
clean_activity$avg_by_date <- ave(clean_activity$steps, clean_activity$date)
# Draw the time series plot of the per-date averages.
# NOTE(review): the original call continued past ylab, but the remaining
# arguments were lost in the source text; the call is closed here -- confirm
# whether xlab/main should be restored.
plot(clean_activity$date, clean_activity$avg_by_date, type = "l",
     ylab = "Avg nbr steps per 5m interval")
Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# Average number of steps for each 5-minute interval across all days, repeated
# on every row of that interval (ave() applies mean within each group).
clean_activity$avg_by_interval <- ave(clean_activity$steps,
                                      clean_activity$interval)
# Maximum of the per-interval averages.
max(clean_activity[, "avg_by_interval"])
## [1] 206.2
# Which interval contains the maximum: which.max() returns the first row
# holding the largest average, avoiding an equality comparison on doubles.
clean_activity$interval[which.max(clean_activity$avg_by_interval)]
## [1] 835
# Plot the per-interval averages and annotate the peak.
plot(clean_activity$interval, clean_activity$avg_by_interval,
     ylab = "Avg nbr steps",
     xlab = "Interval", main = "Average nbr of steps per five min interval")
# The label text and its position are hard-coded to the result printed above.
text(1250, 205, "Interval 835 with the maximum steps", cex = 0.7)
Imputing missing values
Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
# report NAs
## steps date interval
## 2304 0 0
New data set with imputed values
# Load the time series library zoo (provides read.zoo, na.aggregate, index).
library(zoo)
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## as.Date, as.Date.numeric
# Convert the data frame into a zoo time series object, using column 2 (the
# date) as the index.
activity_imputed <- read.zoo(activity, index.column = 2, format = "%Y-%m-%d")
## Warning: some methods for "zoo" objects do not work if the index entries
## in '' are not unique
# Fill NAs in the zoo object with na.aggregate(), using the median as the
# replacement statistic.
# NOTE(review): the intent stated in the original comment is the median of the
# five-minute intervals; `by = 3` is passed through unchanged -- confirm that it
# actually groups by interval rather than acting as a single constant group.
activity_filled <- na.aggregate(activity_imputed, by = 3, FUN = median)
## Helper: convert a zoo time series object to a data frame.
##
## x          a zoo object (univariate or with columns).
## index_name column name given to the index in the result.
##
## Returns a data frame whose first column is index(x), followed by the data
## columns of x. For a series without dimensions the single data column is
## named after the expression passed as x.
##
## NOTE(review): the function name and the second parameter's name were lost in
## the source text; the names used here are reconstructions -- confirm.
zoo_to_data_frame <- function(x, index_name = "date") {
  # deparse(substitute(x)) must see the caller's expression, so it stays here
  # rather than in a nested helper.
  value_names <- if (is.null(dim(x))) {
    deparse(substitute(x))
  } else {
    colnames(x)
  }
  setNames(
    data.frame(index(x), x, row.names = NULL),
    c(index_name, value_names)
  )
}
# Convert the filled zoo object back to a data frame.
activity_filled_df <- zoo_to_data_frame(activity_filled)
# check NAs
## 17568
Calculations of median and mean with new NAs filled dataset
# Total number of steps per day on the NA-filled data.
# NOTE(review): the tail of the aggregate() call was lost in the source text;
# FUN = sum is restored from the matching computation on the raw data --
# confirm.
total_nbr_steps_day_fill <- aggregate(
  activity_filled_df$steps,
  by = list(activity_filled_df$date),
  FUN = sum
)
# Rename the generic aggregate() output columns.
colnames(total_nbr_steps_day_fill) <- c("date", "nbr_steps")
# Mean and median of the daily totals; na.rm is kept for symmetry with the
# raw-data calculation.
mean_na_filled <- mean(total_nbr_steps_day_fill$nbr_steps, na.rm = TRUE)
median_na_filled <- median(total_nbr_steps_day_fill$nbr_steps, na.rm = TRUE)
Do these values differ from the estimates from the first part of the assignment?
## [1] 1412
## [1] 370
## Yes, there is a difference between the mean/median before and after filling of NAs
Histogram of total number of steps taken per day
# Draw the histogram of the daily totals from the NA-filled data; any daily
# total that is still NA is left out.
hist(
  total_nbr_steps_day_fill$nbr_steps[
    !is.na(total_nbr_steps_day_fill$nbr_steps)
  ],
  main = "NAs filled: histogram of tot nbr steps per day",
  xlab = "Nbr steps per day"
)
# Mark each individual daily total along the x-axis.
rug(
  total_nbr_steps_day_fill$nbr_steps[
    !is.na(total_nbr_steps_day_fill$nbr_steps)
  ],
  ticksize = 0.02
)
Are there differences in activity patterns between weekdays and weekends?
Create a new factor variable in the dataset with two levels - "weekday" and "weekend" indicating whether a given date is a weekday or weekend day
# Name of the day of the week for each row (the text depends on the locale).
activity_filled_df$weekdays <- weekdays(activity_filled_df$date)
# Two-level factor: "weekend" for Saturday/Sunday, "weekday" otherwise.
# The test uses the numeric day of week (0 = Sunday, 6 = Saturday) instead of
# comparing against English day names, so it also works in other locales.
activity_filled_df$wd <- factor(
  ifelse(
    as.POSIXlt(activity_filled_df$date)$wday %in% c(0L, 6L),
    "weekend",
    "weekday"
  ),
  levels = c("weekday", "weekend")
)
# Average number of steps per 5-minute interval, repeated on every row of the
# group. The column is spelled out as `steps` rather than relying on `$`
# partial matching.
# NOTE(review): the tail of the ave() call was lost in the source text;
# grouping additionally by `wd` is a reconstruction based on the
# weekday/weekend panel plot that follows -- confirm.
activity_filled_df$ave <- ave(
  activity_filled_df$steps,
  activity_filled_df$interval,
  activity_filled_df$wd
)
Make a panel plot containing a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken
layout= c(1, 2), xlab="inteval", ylab="Number of steps", type="l")