-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_analysis.R
More file actions
165 lines (111 loc) · 6.41 KB
/
run_analysis.R
File metadata and controls
165 lines (111 loc) · 6.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
## GETTING AND CLEANING DATA COURSE PROJECT: SCRIPT
###
# This code produces three tidy datasets from the raw data available at:
# https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
#
# 1. HumanActivity (steps 1, 3 and 4)
# 2. HumanActivity_mean_std (step 2)
# 3. HumanActivity_mean (step 5)
#
# The code assumes that the folder containing the raw data from the project (UCI HAR Dataset)
# is located in the working directory in R, without changing the original file structure.
###
# STEPS
# 1. Merges the training and the test sets to create one data set.
# 1.1 Helper that reads and merges the files of one partition
# Reads the subject, activity and measurement files of one partition of the
# dataset and binds them column-wise into a single data frame.
#   split    : partition name, "train" or "test"; it is both the sub-folder
#              name and the suffix of the three file names
#   data_dir : folder that holds the unzipped dataset
# Returns a data frame with the columns 'subjects', 'train-test' (the
# partition name repeated on every row), 'activities', followed by the
# calculated variables read as numeric.
# The measurements file is spelled "X_<split>.txt" with a capital X; the
# lower-case spelling only opens on case-insensitive file systems.
read_har_split <- function(split, data_dir = "./UCI HAR Dataset") {
  split_dir <- file.path(data_dir, split)
  # reading Subjects
  subjects <- read.table(file.path(split_dir, paste0("subject_", split, ".txt")),
                         col.names = "subjects")
  # a column is added to indicate which partition these individuals belong to
  subjects <- cbind(subjects, "train-test" = rep(split, nrow(subjects)))
  # reading Activities
  activities <- read.table(file.path(split_dir, paste0("y_", split, ".txt")),
                           col.names = "activities")
  # reading Calculated variables from raw data
  measurements <- read.table(file.path(split_dir, paste0("X_", split, ".txt")),
                             colClasses = "numeric")
  # merging the three pieces of this partition
  cbind(subjects, activities, measurements)
}
# 1.2 Merging training and test datasets (training rows first)
HumanActivity <- rbind(read_har_split("train"), read_har_split("test"))
# 1.3 deleting the helper so that only 'HumanActivity' remains from step 1
rm(read_har_split)
# 2. Extracts only the measurements on the mean and standard deviation for each measurement.
# 2.1 reading the measurement names from 'features.txt'
feature_table <- read.table("./UCI HAR Dataset/features.txt",
                            col.names = c("id", "measurement"),
                            colClasses = c("numeric", "character"))
feature_names <- feature_table$measurement
id_columns <- c("Subjects", "Subjects_Group", "Activity")
# 2.2 flagging the measurement names related to averages and standard deviations
# NOTE: the patterns are regular expressions, so the "()" is an empty group
# and every name containing "mean" or "std" is flagged (a name such as
# "...meanFreq()..." would be flagged as well).
is_mean_or_std <- grepl("mean()", feature_names) | grepl("std()", feature_names)
# 2.3 labeling the fields of the database: three identifier columns followed
# by one column per measurement
colnames(HumanActivity) <- c(id_columns, feature_names)
# 2.4 keeping the identifier columns plus the flagged measurements
HumanActivity_mean_std <- HumanActivity[, c(id_columns, feature_names[is_mean_or_std])]
# 2.5 creation of a txt file for the database "HumanActivity_mean_std"
write.table(HumanActivity_mean_std, file = "HumanActivity_mean_std.txt")
# 2.6 deleting the helpers of step 2; only 'HumanActivity_mean_std' is kept
rm(feature_table, feature_names, id_columns, is_mean_or_std)
# 3. Uses descriptive activity names to name the activities in the data set
# 3.1 reading file 'activity_labels.txt': first column is the activity code,
# second column is the descriptive name
Activity_names <- read.table("./UCI HAR Dataset/activity_labels.txt",
                             stringsAsFactors = FALSE)
# 3.2 assignment of descriptive names to the activity codes
# Every code is looked up in the code column in one vectorized step, so the
# number of activities and their row order in the file do not matter.
activity_codes <- HumanActivity$Activity
label_row <- match(activity_codes, Activity_names[, 1])
# a code without an entry in the labels file is kept as text, not turned into NA
activity_labels <- as.character(activity_codes)
has_label <- !is.na(label_row)
activity_labels[has_label] <- Activity_names[label_row[has_label], 2]
HumanActivity$Activity <- activity_labels
# 3.3 deleting variables created in step 3
rm(Activity_names, activity_codes, label_row, activity_labels, has_label)
# 4. Appropriately labels the data set with descriptive variable names.
#
# The columns were already labeled in step 2.3; the labeling is repeated here
# so that this step can be read on its own.
# 4.1 reading the measurement names from 'features.txt'
feature_table <- read.table("./UCI HAR Dataset/features.txt",
                            col.names = c("id", "measurement"),
                            colClasses = c("numeric", "character"))
# 4.2 labeling the fields of the database: three identifier columns followed
# by one column per measurement
colnames(HumanActivity) <- c("Subjects", "Subjects_Group", "Activity",
                             feature_table[, 2])
# 4.3 creation of a txt file for database "HumanActivity"
write.table(HumanActivity, file = "HumanActivity.txt")
# 4.4 deleting the helper created in step 4
rm(feature_table)
# 5. Creates a second, independent tidy data set with the average of each variable
# for each activity and each subject.
# 5.1 Checking missing values
sum(is.na(HumanActivity)) # Result [1] 0, There are no missing values
# 5.2 calculating the average of the variables grouped by subject and activity
# every column after the three identifier columns is a measurement, so the
# range follows the width of the data instead of a fixed 4:564
measure_cols <- seq_len(ncol(HumanActivity))[-(1:3)]
# one "activity,subject" key per row, built once and reused for every column
group_key <- paste(HumanActivity$Activity, HumanActivity$Subjects, sep = ",")
# one vector of group means per measurement, bound together in a single call
# instead of growing the result column by column
HA_mean <- do.call(cbind, lapply(HumanActivity[measure_cols], function(values) {
  tapply(values, group_key, mean)
}))
HA_mean <- as.data.frame(HA_mean)
# 5.3 Splitting activities and subjects in two variables (columns)
# the row names of 'HA_mean' are the "activity,subject" keys
group_parts <- strsplit(row.names(HA_mean), split = ",")
group_parts <- t(as.data.frame(group_parts))
# 5.4 joining of Activities and Subjects to the calculated variables
# the two unnamed columns of 'group_parts' arrive as X1 (activity) and X2 (subject)
HumanActivity_mean <- data.frame(group_parts, HA_mean, stringsAsFactors = FALSE)
HumanActivity_mean$X2 <- as.numeric(HumanActivity_mean$X2)
# 5.5 assigning the labels to the dataset 'HumanActivity_mean'
# the measurement labels are the ones 'HumanActivity' already carries from
# steps 2.3 / 4.2, so 'features.txt' does not have to be read again
colnames(HumanActivity_mean) <- c("Activity", "Subjects",
                                  colnames(HumanActivity)[measure_cols])
# 5.6 creation of a txt file for the database "HumanActivity_mean"
write.table(HumanActivity_mean, file = "HumanActivity_mean.txt")
# 5.7 deleting variables created in step 5, except the database obtained 'HumanActivity_mean'
rm(HA_mean, group_parts, group_key, measure_cols)
# a loop index 'i' may be left over from an earlier step of this script;
# it is dropped here so the workspace ends up the same either way
if (exists("i", inherits = FALSE)) rm(i)