-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_analysis.R
More file actions
205 lines (178 loc) · 9.59 KB
/
run_analysis.R
File metadata and controls
205 lines (178 loc) · 9.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
## Course Project in Coursera Getting and Cleaning Data
## The run_analysis function produces a tidy file from the UCI HAR files containing all data with DataType, Subject_ID, ActivityType and all columns having experessions ".mean." and ".std.
## The function has hardcoded URL to the zip file conatining the different data files
## URL="https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
##
## The result of the procesing is a tidy file "data/UCIHAR_allData_Subset_Tidy_Table.txt" placed in data folder under working directory.
run_analysis <- function() {
library(dplyr)
## Step 1 Downloading zip file containing the UCI HAR (Human Activity Recognition) files
message("Step 1 - Download zip file if not done already: ", "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip")
zipfileName <- as.character("UCIHarFiles.zip")
destpath <- as.character("data/")
zipfileDestination <- as.character(paste(destpath,zipfileName, sep = ""))
dataURL <- as.character("https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip")
returncode <- downloadUCIHARData(dataURL, zipfileDestination)
## Step 2 extract data files from zip file
message("Step 2 - extracting data files from zip file: ", "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip")
filelist <- vector()
if (returncode == 0) {
message("zipfileDestination: ", zipfileDestination)
message("destpath: ", destpath)
filelist <- extractUCIHARFiles(zipfileDestination, destpath)
}
if (length(filelist) > 0) {
returncode <- 0
}
else {
return <- -1
}
## Step 3 read the required files into data frames
message("Step 3 - reading the required files into data frames. Files in sub-folder 'Inertial Signals' are not read.")
## Name data variables that will be used
df_activity_labels <- data.frame()
df_features <- data.frame()
## test data
df_subject_test <- data.frame()
df_X_test <- data.frame()
df_y_test <- data.frame()
## Combined test data columns
df_test_combined <- data.frame()
## train data
df_subject_train <- data.frame()
df_X_train <- data.frame()
df_y_train <- data.frame()
## Combined train data columns
df_train_combined <- data.frame()
## Combined train data columns
df_UCIHAR_allData <- data.frame()
## variable to translate labels in features.txt to more readable names
better_col_names <- vector()
subject_col_name <- vector()
subject_col_name <- c("Subject_ID")
activityType_col_names <- c("ActivityType_ID", "ActivityType")
dataType_col_name <- vector()
dataType_col_name <- c("DataType")
if (returncode == 0) {
for (i in 1:nrow(filelist)) {
## Read the describing files
if (grepl("activity_labels.txt",as.character(filelist[i,1]))) {
df_activity_labels <- readUCIHARdatafile(as.character(filelist[i,1]))
}
if (grepl("features.txt",as.character(filelist[i,1]))) {
df_features <- readUCIHARdatafile(as.character((filelist[i,1])))
# Convert variable names in feature.txt to valid column names using make.name to remove parenthesis()
better_col_names <- make.names(names = df_features$V2, unique = T, allow_ = T)
}
## Read the test files. We will not read the Inertial Signal files, because are not to be used in this exercise.
if (grepl("/subject_test.txt",as.character(filelist[i,1]))) {
df_subject_test <- readUCIHARdatafile(as.character(filelist[i,1]))
names(df_subject_test) <- subject_col_name
}
if (grepl("/X_test.txt",as.character(filelist[i,1]))) {
df_X_test <- readUCIHARdatafile(as.character(filelist[i,1]))
names(df_X_test) <- better_col_names
}
if (grepl("/y_test.txt",as.character(filelist[i,1]))) {
df_y_test <- readUCIHARdatafile(as.character(filelist[i,1]))
for (i in 1:nrow(df_y_test)) {
activity_selected <- subset(df_activity_labels, V1 == df_y_test[i,1])
activity_label <- activity_selected[1,2]
df_y_test[i, 2] <- activity_label
}
names(df_y_test) <- activityType_col_names
}
## Read the training files. We will not read the Inertial Signal files, because are not to be used in this exercise.
if (grepl("/subject_train.txt",as.character(filelist[i,1]))) {
df_subject_train <- readUCIHARdatafile(as.character(filelist[i,1]))
names(df_subject_train) <- subject_col_name
}
if (grepl("/X_train.txt",as.character(filelist[i,1]))) {
df_X_train <- readUCIHARdatafile(as.character(filelist[i,1]))
names(df_X_train) <- better_col_names
}
if (grepl("/y_train.txt",as.character(filelist[i,1]))) {
df_y_train <- readUCIHARdatafile(as.character(filelist[i,1]))
for (i in 1:nrow(df_y_train)) {
activity_selected <- subset(df_activity_labels, V1 == df_y_train[i,1])
activity_label <- activity_selected[1,2]
df_y_train[i, 2] <- activity_label
}
names(df_y_train) <- activityType_col_names
}
}
}
## Step 4 create combined test data frame consisting of columns of subjects, activity labels and test data
message("Step 4 - creating combined test data frame consisting of columns of subjects, activity labels and test data")
## Create one column to indicate type of data = test
df_test <- data.frame(DataType = "test")
## replicate row to get same number of rows as in df_X_test
df_test <- as.data.frame(df_test[rep(1:nrow(df_test),each=nrow(df_X_test)),])
names(df_test) <- dataType_col_name
df_test_combined <- cbind(df_test, df_subject_test, df_y_test, df_X_test)
## Step 5 create combined training data frame consisting of columns of subjects, activity labels and train data
message("Step 5 - creating combined training data frame consisting of columns of subjects, activity labels and train data")
## Create one column to indicate type of data = training
df_train <- data.frame(DataType = "training")
## replicate row to get same number of rows as in df_X_train
df_train <- as.data.frame(df_train[rep(1:nrow(df_train),each=nrow(df_X_train)),])
names(df_train) <- dataType_col_name
df_train_combined <- cbind(df_train, df_subject_train, df_y_train, df_X_train)
## Step 6 combine rows from test and training into a combined training data frame consisting of columns of subjects, activity labels and train data
message("Step 6 - combining rows from test and training into create combined training data frame consisting of columns of subjects, activity labels and train data")
df_UCIHAR_allData <- rbind(df_test_combined, df_train_combined)
message("Number of columns in df_UCIHAR_allData: ", ncol(df_UCIHAR_allData))
## Step 7 Produce the tidy dataset, that only provides average values of each variable for each activity and subject
message("Step 7 - Producing the tidy dataset, that only provides average values of each variable for each activity and subject.")
## Create a subset of all data with DataType, Subject_ID, ActivityType and all columns having expressions ".mean." and ".std."
df_UCIHAR_allData_Subset <- select(df_UCIHAR_allData, DataType, Subject_ID, ActivityType, contains(".mean."), contains(".std.") )
## Create the required tidy data set with the average of each variable for each activity and each subject.
df_UCIHAR_allData_SubsetTidy <- group_by(df_UCIHAR_allData_Subset, Subject_ID, ActivityType, DataType) %>%
summarise_each(funs(mean))
## Make the column names nice to read
output_colnames <- as.data.frame(colnames(df_UCIHAR_allData_SubsetTidy))
output_colnames <- as.data.frame(gsub("\\.","",as.character(output_colnames[,1])))
output_colnames <- as.data.frame(gsub("mean","Mean",as.character(output_colnames[,1])))
output_colnames <- as.data.frame(gsub("std","Std",as.character(output_colnames[,1])))
names(df_UCIHAR_allData_SubsetTidy) <- output_colnames[,1]
names(output_colnames) <- c("OutputColumnNames")
browser()
## Output the column names to file so they can be used in code book
write.table(output_colnames, file = "data/output_colnames.txt", row.names = F, sep = "\t")
# Store the df_UCIHAR_allData_SubsetTidy tidy data in a text file with the same name in the data folder under working dir
write.table(df_UCIHAR_allData_SubsetTidy, file = "data/UCIHAR_allData_Subset_Tidy_Table.txt", row.names = F, sep = "\t")
message("Output file: data/UCIHAR_allData_Subset_Tidy_Table.txt")
message("Number of columns in UCIHAR_allData_Subset_Tidy_Table.txt: ", ncol(df_UCIHAR_allData_SubsetTidy))
## browser()
return(returncode)
}
## Downloads the zip containing the training data
downloadUCIHARData <- function(dataURL, destination) {
returnCode <- as.numeric(0)
if (!file.exists(destination)) {
returnCode <- download.file(dataURL, destfile=destination)
}
return(returnCode)
}
## Extract all files for processing
extractUCIHARFiles <- function(zipfileName, destination) {
## unzip to destination
message("zipfileName: ", zipfileName)
message("destination: ", destination)
fileList <- unzip(zipfile = zipfileName, list = TRUE)
## returncode <- unzip(zipfile = zipfileName, exdir=destination)
## returncode <- unzip(zipfile = as.character(zipfileName), exdir= as.character(destination))
if (!file.exists(as.character(fileList[1,1]))) {
fileList <- unzip(zipfile = zipfileName)
}
return(fileList)
}
## load file into data frame
readUCIHARdatafile <- function(filename) {
## read the file
## browser()
message("fileName: ", filename)
dataframe <- read.table(as.character(filename), stringsAsFactors = FALSE)
## browser()
return(dataframe)
}