## ----------------------------------
## QUICKSTARTANALYSES.R
## ----------------------------------
#
# This R script provides examples of how one might analyze data that are
# exported from an experiment in Terracotta. The experiment that produced
# these data is described in Terracotta's "Quick Start Guide," available at
# https://terracotta.education/help-center/quick-start-guide
#
# Detailed documentation about the Terracotta data export is available at
# https://terracotta.education/help-center/data-dictionary
#
# This is a simulated experiment with only 5 imaginary participants, and
# only 4 of these participants have outcome scores. We want to emphasize
# that these data are not real, and the results should not be interpreted
# as research findings. The purpose of this script is only to illustrate
# how analyses might be carried out on data exported from Terracotta.
#
# November 7, 2022
# Clean-up
rm(list=ls())
# Required Libraries
require(tidyverse)
require(jsonlite)
require(rstatix)
## ----------------------------------
## IMPORT THE TERRACOTTA EXPORT
## ----------------------------------
# Make a temporary directory for eventually unzipping the data
# This will help us avoid dealing with local paths
tempdir <- tempdir()
# For this script, we will download the data from a GitHub repository on the
# web. This may also be appropriate if you are storing your data on OSF or
# some other institutional data repository. Data exports from Terracotta are
# de-identified, and if the experiment collects informed consent, data from
# students who did not provide consent are excluded.
dataurl <- 'https://github.com/terracotta-education/terracotta-quickstart-analyses/raw/main/Terracotta%20Quick%20Start%20Guide%20Experiment%20Export.zip'
tempfile <- tempfile(fileext = ".zip")
download.file(dataurl, tempfile)
unzip(tempfile, exdir = tempdir, junkpaths = TRUE)
# NOTE: If you have the data on your local machine in the working directory,
# you can read and unzip the data export zip file directly, like this:
# datapath <- 'Terracotta Quick Start Export.zip' # path to the zip export
# unzip(datapath, exdir = tempdir)
# Read each of the CSV files from the temporary directory into the environment
experiment <- read.csv(file.path(tempdir,"experiment.csv"),stringsAsFactors = FALSE)
item_responses <- read.csv(file.path(tempdir,"item_responses.csv"),stringsAsFactors = FALSE)
items <- read.csv(file.path(tempdir,"items.csv"),stringsAsFactors = FALSE)
outcomes <- read.csv(file.path(tempdir,"outcomes.csv"),stringsAsFactors = FALSE)
participant_treatment <- read.csv(file.path(tempdir,"participant_treatment.csv"),stringsAsFactors = FALSE)
participants <- read.csv(file.path(tempdir,"participants.csv"),stringsAsFactors = FALSE)
response_options <- read.csv(file.path(tempdir,"response_options.csv"),stringsAsFactors = FALSE)
submissions <- read.csv(file.path(tempdir,"submissions.csv"),stringsAsFactors = FALSE)
# Read the events.json file
events <- read_json(file.path(tempdir,"events.json"))
# Clean up the temporary file and directory, optional
# file.remove(tempfile)
# unlink(tempdir, recursive = T)
rm(tempdir, tempfile, dataurl)
## ----------------------------------
## DESCRIPTIVE ANALYSES
## ----------------------------------
# NUMBER OF PARTICIPANTS
# How many participants are included in the export?
length(unique(participants$participant_id))
# How many participants made submissions to each assignment?
assignment_names <- participant_treatment %>% select(assignment_id,assignment_name) %>% unique()
merge(submissions, assignment_names, by="assignment_id") %>% group_by(assignment_name) %>% summarize(n = length(unique(participant_id)))
# How many participants have scores for each outcome?
outcomes %>% group_by(outcome_name) %>% summarize(n = length(unique(participant_id)))
# NOTE: In this experiment, there are 5 consenting students who each
# submitted assignments; however, only 4 of them have outcome scores.
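# To check which participant is missing an outcome score, one quick option
# is an anti_join. This is only an illustrative sketch; it assumes that
# participants without scores either have no row in outcomes.csv or have an
# NA outcome_score.
participants %>%
  anti_join(outcomes %>% filter(!is.na(outcome_score)), by = "participant_id")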
# DURATION OF TIME ON ASSIGNMENTS
# The time spent on an assignment can be estimated as the duration between
# starting that assignment and submitting it. The times of events are
# recorded in the events.json file, which we've imported as a list.
# These data are formatted according to the IMS Global (1EdTech) Caliper
# standard, described at http://www.imsglobal.org/activity/caliper
# When dealing with the events, I find it easiest to loop through them and
# extract the relevant information from each one.
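# Before looping, it can help to inspect the structure of a single event.
# This is an optional, exploratory check; the exact fields present depend on
# the event profile, but the envelope structure is the same one the loop
# below navigates.
str(events[[1]]$data[[1]], max.level = 1)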
# First, initiate an empty data frame that will hold the relevant information
events_df <- data.frame(participant_id=integer(),
                        object=character(),
                        action=character(),
                        eventTime=character(), # we'll convert it to a datetime later
                        assignment_id=integer(),
                        exposure_id=integer(),
                        treatment_id=integer())
# Next, loop through the events, adding new rows to the empty data frame
for (i in seq_along(events)) {
  object <- events[[i]]$data[[1]]$object$type
  # Only extract details from "Assessment" events. Other event types, such
  # as those corresponding to the Media profile, have a different structure
  # with different fields
  if (object == "Assessment") {
    participant_id <- events[[i]]$data[[1]]$membership$member$extensions$terracotta_participant_id
    action <- events[[i]]$data[[1]]$action
    eventTime <- events[[i]]$data[[1]]$eventTime
    assignment_id <- events[[i]]$data[[1]]$generated$assignable$extensions$terracotta_assignment_id
    exposure_id <- events[[i]]$data[[1]]$generated$assignable$extensions$terracotta_exposure_id
    treatment_id <- events[[i]]$data[[1]]$generated$assignable$extensions$terracotta_treatment_id
    newrow <- data.frame(participant_id,object,action,eventTime,assignment_id,exposure_id,treatment_id)
    events_df <- rbind(events_df,newrow)
  }
}
# Clean-up
rm(participant_id,object,action,eventTime,assignment_id,exposure_id,treatment_id,newrow)
# Turn the eventTime character string into an actual datetime, as promised
events_df <- events_df %>% mutate(eventTime = as.POSIXct(strptime(eventTime, "%Y-%m-%dT%H:%M:%S")))
# Now that we have the event data in a more manageable format, we need to
# go through the events and extract the latest start (or restart) prior to
# every submission event, for every participant and assignment.
# Start by making a container
durations <- data.frame(participant_id=integer(),
                        assignment_id=integer(),
                        treatment_id=integer(),
                        start_time=as.POSIXct(character()),
                        submit_time=as.POSIXct(character()),
                        duration=numeric())
# For every participant
for (p in unique(events_df$participant_id)) {
  # For every assignment
  for (a in unique(events_df$assignment_id)) {
    these_events <- events_df %>% filter(participant_id == p, assignment_id == a) %>% arrange(eventTime)
    start_time <- NA
    submit_time <- NA
    # Loop through events (seq_len() safely skips participant/assignment
    # pairs that have no events)
    for (e in seq_len(nrow(these_events))) {
      if (these_events$action[e] == "Started" || these_events$action[e] == "Restarted") {
        submit_time <- NA
        start_time <- these_events$eventTime[e]
      }
      if (these_events$action[e] == "Submitted" && !(is.na(start_time))) {
        submit_time <- these_events$eventTime[e]
        duration <- as.numeric(difftime(submit_time,start_time),units = "secs") ## DURATIONS IN SECONDS
        treatment_id <- these_events$treatment_id[e]
        newrow <- data.frame(participant_id=p,assignment_id=a,treatment_id,start_time,submit_time,duration)
        durations <- rbind(durations,newrow)
        start_time <- NA
        submit_time <- NA
      }
    }
  }
}
# Summarize durations
condition_names <- participant_treatment %>% select(treatment_id,assignment_name,condition_name) %>% unique()
merge(durations,condition_names) %>% group_by(assignment_name,condition_name) %>% summarize(mean_duration = mean(duration,na.rm=TRUE))
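# As a quick visual check (an optional sketch, mirroring the score plots
# below), the individual durations can also be plotted by assignment and
# condition.
merge(durations,condition_names) %>%
  ggplot(aes(x=assignment_name, y=duration, colour=condition_name)) +
  geom_point(position=position_jitterdodge())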
## ----------------------------------------
## COMPARING DIFFERENCES BETWEEN CONDITIONS
## ----------------------------------------
# This was a within-subject design, where each participant was exposed to
# each condition, just at different times. The time when a participant
# experiences a condition is called an "exposure set." The details of how
# different assignments are mapped onto different conditions, for different
# participants, at different times, are in the file called
# "participant_treatment." For convenience, this file also contains the
# score a participant received for each treatment assignment.
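# To see this mapping for the current experiment, it can help to list each
# participant's condition within each exposure set. This is only an
# illustrative sketch; it assumes participant_treatment includes an
# exposure_id column (as described in the Terracotta data dictionary).
participant_treatment %>%
  select(participant_id, exposure_id, condition_name, assignment_name) %>%
  arrange(participant_id, exposure_id)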
# Show the scores on the assignments
participant_treatment %>%
  ggplot(aes(x=assignment_name, y=final_score, colour=condition_name)) +
  geom_point(position=position_jitterdodge())
# Calculate the average scores on the assignments
participant_treatment %>%
  group_by(assignment_name,condition_name) %>%
  summarize(mean = mean(final_score,na.rm=TRUE),
            sd = sd(final_score,na.rm=TRUE))
# Conduct a repeated measures ANOVA
aov.result <- anova_test(
  data = participant_treatment, dv = final_score, wid = participant_id,
  within = condition_name
)
get_anova_table(aov.result)
# In the steps above, we're examining the effect of condition on assignment
# scores. Additionally, if outcome scores have been added to the experiment,
# we can examine the effect of condition on study outcomes. These are in the
# file "outcomes."
# Show the scores on the outcomes
outcomes %>%
  ggplot(aes(x=outcome_name, y=outcome_score, colour=condition_name)) +
  geom_point(position=position_jitterdodge())
# Calculate the average scores on the outcomes
outcomes %>%
  group_by(outcome_name,condition_name) %>%
  summarize(mean = mean(outcome_score,na.rm=TRUE),
            sd = sd(outcome_score,na.rm=TRUE))
# Conduct a repeated measures ANOVA
aov.result <- anova_test(
  data = outcomes, dv = outcome_score, wid = participant_id,
  within = condition_name
)
get_anova_table(aov.result)
# A note about statistics: ANOVAs are among the most conventional ways to
# perform statistical analysis. However, they have assumptions that are
# rarely true of educational data. For example, scores on tests and assignments
# are often negatively skewed and bounded between a minimum and a maximum score.
# Please use caution when conducting statistical tests.
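# If these assumptions are a concern, one option is a non-parametric
# alternative. The sketch below uses rstatix's Friedman test, a rank-based
# repeated-measures analogue of the ANOVA above; it assumes each participant
# has one outcome_score per condition, which may not hold if scores are
# missing. Pairwise paired Wilcoxon tests (rstatix::wilcox_test with
# paired = TRUE) are another option for follow-up comparisons.
outcomes %>%
  friedman_test(outcome_score ~ condition_name | participant_id)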