-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
144 lines (72 loc) · 4 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# 'data.table' is required by every step of this script, so it is installed
# on demand when the host does not have it.
# requireNamespace() is used for the check: scanning the full
# installed.packages() matrix is slow and tests the name against every cell of
# that matrix rather than only against the package names.
if (!requireNamespace("data.table", quietly = TRUE)) {
  message("'data.table' package is not installed.")
  message("This script requires this package and so it will download it automatically.")
  message()
  install.packages("data.table")
}
# attach the package; library() raises an error here if the package is still
# unavailable, which stops the script before any data is touched
library("data.table")
# Download the zipped raw data into the working directory and extract it there.
message(paste0("(Downloading raw data at ", getwd(), ")"))
message()
# mode = "wb" makes the binary transfer explicit instead of relying on
# download.file() guessing it from the URL extension; a text-mode transfer
# would corrupt the archive.
download.file(
  "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip",
  destfile = "./DATASET.zip",
  mode = "wb",
  quiet = TRUE
)
message("(Downloading complete)")
message()
# extract next to the archive; the reads further down expect the
# "UCI HAR Dataset" directory to come out of this step
unzip(zipfile = "./DATASET.zip")
# Read the activity lookup table and the list of measured features, then pick
# the features whose name contains "mean()" or "std()".
activityLabels <- fread(
  "UCI HAR Dataset/activity_labels.txt",
  col.names = c("activityID", "activityName")
)
variablesMeasured <- fread(
  "UCI HAR Dataset/features.txt",
  col.names = c("variableID", "variables")
)
# row positions of the selected features within features.txt
targetVariables_indices <- grep(
  "(mean|std)\\(\\)",
  variablesMeasured[["variables"]]
)
# the selected feature names with every parenthesis stripped
targetVariables <- gsub(
  "[()]",
  "",
  variablesMeasured[["variables"]][targetVariables_indices]
)
# Training split: read the subject IDs, the activity IDs and the measurements,
# keep only the measurement columns selected earlier, name them after the
# selected features, and put subject and activity in front.
trainingSubject <- fread(
  "UCI HAR Dataset/train/subject_train.txt",
  col.names = "subjectID"
)
trainingActivity <- fread(
  "UCI HAR Dataset/train/y_train.txt",
  col.names = "activityPerformed"
)
trainingData <- fread("UCI HAR Dataset/train/X_train.txt")
# subset by column position
trainingData <- trainingData[, .SD, .SDcols = targetVariables_indices]
# with a single names argument, setnames() renames all columns in order
setnames(trainingData, targetVariables)
trainingData <- cbind(trainingSubject, trainingActivity, trainingData)
# Test split: read the subject IDs, the activity IDs and the measurements,
# keep only the measurement columns selected earlier, name them after the
# selected features, and put subject and activity in front.
testSubject <- fread(
  "UCI HAR Dataset/test/subject_test.txt",
  col.names = "subjectID"
)
testActivity <- fread(
  "UCI HAR Dataset/test/y_test.txt",
  col.names = "activityPerformed"
)
testData <- fread("UCI HAR Dataset/test/X_test.txt")
# subset by column position
testData <- testData[, .SD, .SDcols = targetVariables_indices]
# with a single names argument, setnames() renames all columns in order
setnames(testData, targetVariables)
testData <- cbind(testSubject, testActivity, testData)
# The output data table is built by stacking the training rows on top of the
# test rows.
DATASET <- rbind(trainingData, testData)
# "activityPerformed" holds the numeric activity IDs. It is turned into a
# factor whose levels are the IDs and whose labels are the matching activity
# names, both taken from "activityLabels" (read from "activity_labels.txt").
# ":=" replaces the column by reference rather than going through "$<-",
# which assigns on a copy of the table.
DATASET[, activityPerformed := factor(
  activityPerformed,
  levels = activityLabels$activityID,
  labels = activityLabels$activityName
)]
# "subjectID" is likewise turned into a factor with one level per distinct
# subject present in the data (per the original author's note, subjects are
# numbered 1 to 30 in the experiment).
DATASET[, subjectID := as.factor(subjectID)]
# Build the summary table holding the average of every selected variable for
# each subject/activity pair, and write it to "AVERAGES.txt" in the working
# directory.
# melt() reshapes to long form, keeping subject and activity as identifiers;
# "id.vars" is spelled out in full instead of relying on partial matching of
# "id".
temp <- melt(DATASET, id.vars = c("subjectID", "activityPerformed"))
# dcast() reshapes back to one column per variable, aggregating the values of
# each subject/activity pair with mean()
temp <- dcast(
  temp,
  subjectID + activityPerformed ~ variable,
  fun.aggregate = mean
)
write.table(temp, file = "./AVERAGES.txt", row.names = FALSE)
message(
  "The averages for every activity for every subject are calculated ",
  "and exported as a text file in the working directory, ",
  "named 'AVERAGES.txt'"
)
# Remove the intermediate objects created by this script from the workspace.
# "DATASET" and "temp" are not in the list and therefore stay available.
rm(list = c(
  "activityLabels",
  "variablesMeasured",
  "targetVariables_indices",
  "targetVariables",
  "trainingData",
  "trainingActivity",
  "trainingSubject",
  "testData",
  "testActivity",
  "testSubject"
))