title | author | output | ||||
---|---|---|---|---|---|---|
Reproducible Research: Peer Assessment 1 |
David Searl |
|
Load the required libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gridExtra)
Load the data into a dataframe
# Fetch and unpack the activity dataset only when activity.csv is not already
# in the working directory, then read the CSV into a data frame.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
if (!file.exists("activity.csv")) {
  # Request a binary transfer explicitly so the zip archive is written
  # byte-for-byte; text-mode transfers can corrupt binary files on Windows.
  download.file(url, destfile = "repdata%2Fdata%2Factivity.zip", mode = "wb")
  unzip(zipfile = "repdata%2Fdata%2Factivity.zip")
}
activityDF <- read_csv(file = "activity.csv")
# Total steps per day. sum() is called without na.rm, so a day that contains
# any NA step value gets an NA total.
per_day <- activityDF %>%
  group_by(date) %>%
  summarize(daily_sum = sum(steps))

# Histogram of the daily totals. ggplot2's labs() takes the plot heading as
# `title`; `main` is the base-graphics name and is not drawn by ggplot2.
ggplot(data = per_day, aes(daily_sum)) +
  geom_histogram() +
  labs(
    x = "Daily Steps",
    y = "Frequency",
    title = "Histogram of Daily Step Totals"
  )

# Mean and median of the daily totals, ignoring NA days. as.integer()
# truncates the fractional part.
originalMean <- as.integer(mean(per_day$daily_sum, na.rm = TRUE))
originalMedian <- as.integer(median(per_day$daily_sum, na.rm = TRUE))
Mean over the recorded period is 10766
Median over the recorded period is 10765
# Mean number of steps for each 5-minute interval, averaged over all days
# with NA step values dropped.
act_by_intvl <- activityDF %>%
  group_by(interval) %>%
  summarize(mean_by_Interval = mean(steps, na.rm = TRUE))

# Line plot of the per-interval averages. aes() refers to the columns by bare
# name so they are looked up in the plot's data rather than captured as
# free-standing vectors via `$`.
ggplot(act_by_intvl, aes(x = interval, y = mean_by_Interval)) +
  geom_line() +
  labs(
    x = "5-minute Interval",
    y = "Average Activity in Steps",
    title = "Average Steps per Daily Interval Chunk"
  )
The 5-minute interval that has the highest average activity is interval 835.
There are 2304 NA values for steps in the activity dataset.
The code below will replace any NA values with the average of the corresponding interval.
# Impute missing step counts.
# Group by interval so that mean() is evaluated per interval, then replace
# each NA step value with the mean of its interval; non-NA values are kept
# (as numeric). The grouping is removed afterwards so the result is a plain,
# ungrouped data frame for later steps.
activityDF_imputed <- activityDF %>%
  group_by(interval) %>%
  mutate(
    steps = ifelse(
      is.na(steps),
      mean(steps, na.rm = TRUE), # NA: use the mean of that interval group
      as.numeric(steps)          # not NA: keep the recorded value
    )
  ) %>%
  ungroup()
There are now 0 NA values for steps in the activity dataset.
Any interval that was missing step data had its NA replaced with that interval's mean across the dataset.
# Total steps per day after imputation.
per_day_imputed <- activityDF_imputed %>%
  group_by(date) %>%
  summarize(daily_sum = sum(steps))

# Histogram of the imputed daily totals. ggplot2's labs() takes the plot
# heading as `title`; `main` is the base-graphics name and is not drawn.
ggplot(data = per_day_imputed, aes(daily_sum)) +
  geom_histogram() +
  labs(
    x = "Daily Steps",
    y = "Frequency",
    title = "Histogram of Daily Step Totals"
  )

# Mean and median of the imputed daily totals; as.integer() truncates the
# fractional part.
imputedMean <- as.integer(mean(per_day_imputed$daily_sum, na.rm = TRUE))
imputedMedian <- as.integer(median(per_day_imputed$daily_sum, na.rm = TRUE))

# Relative change versus the original statistics, expressed as a percentage
# (hence the factor of 100). The median change is formatted as a string
# without scientific notation.
percent_change_mean <- 100 * (imputedMean - originalMean) / originalMean
percent_change_median <- format(
  100 * (imputedMedian - originalMedian) / originalMedian,
  scientific = FALSE
)
Originally, the mean of the total daily steps was 10766; after imputing, it is 10766, for a 0 % change.
The median of these daily step totals was 10765; after imputing, it is 10766, for a 0.009289364 % change.
# Flag each row as weekend or weekday and store the flag as a labelled factor
# ("Weekends" for TRUE, "Weekdays" for FALSE).
activityDF_imputed$weekend <- factor(
  chron::is.weekend(activityDF_imputed$date),
  levels = c("TRUE", "FALSE"),
  labels = c("Weekends", "Weekdays")
)
# Average steps per interval, computed separately for weekends and weekdays.
# ungroup() drops the grouping that summarize() leaves on `weekend`.
plotme <- activityDF_imputed %>%
  group_by(weekend, interval) %>%
  summarize(mean_by_interval = mean(steps, na.rm = TRUE)) %>%
  ungroup()

# One line-plot panel per weekend/weekday level, stacked vertically.
# ggplot2's labs() takes the plot heading as `title`; `main` is not drawn.
ggplot(plotme, aes(interval, mean_by_interval)) +
  facet_grid(weekend ~ .) +
  geom_line() +
  labs(
    x = "Time Interval",
    y = "Average Steps",
    title = "Weekend & Weekday Activity Compared"
  )