-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgroup_exercise.R
160 lines (118 loc) · 7.52 KB
/
group_exercise.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
library(tidyverse)
library(here)
#This project has two raw data files at different scales from a study of
#infants, children, and adults watching a series of 7 video clips
#I wrote Steps 1 and 2 to import and merge the data, and kept them here for your reference
#Skip down to Step 3 to work on EDA
#FILE 1: auc.csv
#Columns = stim (stimulus video, levels/labels provided below)
# id (unique participant identifier)
# age (in days)
# AUC_sal (area-under-the-curve for a saliency model)
# AUC_dist (area-under-the-curve for a distance model)
#AUC values indicate how well each model predicted where participants looked
#when watching a video
#AUC values can range from 0-1 where .5 is chance and 1 is perfect prediction
#FILE 2: participants_info_full_headers.csv
#Columns = id (unique participant identifier, matches auc.csv)
# age_group (a categorical age variable with levels:
# ".5-1 y" "1-1.5 y" "1.5-2 y" "2-4 y" "4-6 y" "8-10 y" "adult"
# precision (a quality measure of the eye data, smaller is better)
# 7 columns of "Seen X" the stimulus video before the study
# coded as SEEN (1), NOT SEEN (2), NOT SURE (3)
#STEP 1: READ IN THE AUC DATA AND CODE STIM AS A FACTOR
auc <- read_csv(here("data_raw", "auc_bystim.csv"))
stim_levels <- 1:7
stim_labels <- c("Fallon","Feist","Pentatonix","Science","Rube","Plane","Dogs")
auc <- auc %>% mutate(stim = factor(stim, levels = stim_levels, labels = stim_labels))
#STEP 2: READ IN THE PPT INFO DATA
#Wrangle the ppt info data so that you can merge it into the auc data
#Drop any data where the AUC values are missing
#In the final, merged data, make the "watched" variable is coded as a factor with
# levels "seen" (1), "not seen" (2), "not sure" (3)
#Write the cleaned file to data_cleaned/
#Read in the ppt data and rename columns to be easier to work with
ppt <- read_csv(here("data_raw","participants_info_full_headers.csv")) %>%
rename(id = `participant ID`,
age_group = `Age group`,
precision = "Precision")
#Each question about watching each video is a column, so pivot_longer
#Use separate to get just the video name into it's own column
ppt_long <- ppt %>% pivot_longer(cols = starts_with("Seen"), names_to = "stim", values_to = "watched")
ppt_long <- ppt_long %>% separate(stim, into = c(NA, "stim"))
#Code stim and watched as factors
ppt_long <- ppt_long %>% mutate(
stim = factor(stim, levels = stim_labels, labels = stim_labels),
watched = factor(watched, levels = 1:3, labels = c("Yes","No","Not Sure")))
#Join the ppt data to the AUC data (by id and by stim since each participant has observations for each stim)
ds <- left_join(auc, ppt_long, by = c("id", "stim"))
ds <- ds %>% drop_na(AUC_sal:AUC_dist) #Drop participants for whom we don't have data for the DV
#Write the data to file
ds %>% write_csv(here("data_cleaned","cleaned.csv"))
#STEP 3: EXPLORATORY DATA ANALYSIS
#3A PRECISION: Is the precision acceptable (< 2.5) for each participant?
#Visualize the distribution of precision to see if there are values above 2.5
#Create a summary to figure out which participants would we need to exclude if > 2.5 meant the data are unuseable?
#Use a summary table and plots to investigate whether data equally precise for participants of different ages
#Visualize the distribution of precision to see if there are values above 2.5
ds %>% ggplot(aes(x = precision)) + geom_histogram() + geom_vline(xintercept = 2.5)
#Create a summary to figure out which participants would we need to exclude if > 2.5 meant the data are unuseable?
ds %>% group_by(id, age_group) %>%
summarize(precision = mean(precision, na.RM = T)) %>%
filter(precision > 2.5)
ds %>% filter(precision > 2.5)
#Use a summary table and plots to investigate whether data equally precise for participants of different ages
ds %>% group_by(age_group) %>% summarize(across(precision, list(M = mean, MIN = min, MAX = max)))
ds %>% ggplot(aes(x = age_group, y = precision)) + geom_boxplot() + geom_hline(yintercept = 2.5)
ds %>% ggplot(aes(x = age, y = precision)) + geom_point() + geom_hline(yintercept = 2.5)
#3B AGE: Are there any errors in age?
#Convert age to years so that it can be more easily compared to age_group
#Visualize age in years by age_group to see whether participants are the correct age for their group
#Make a summary table of age in years by age group to check whether all participants' ages are correct
#Convert age to years so that it can be more easily compared to age_group
ds <- ds %>% mutate(age_years = age/365.25)
#Visualize age in years by age_group to see whether participants are the correct age for their group
ds %>% group_by(id, age_group) %>%
summarize(age_years = mean(age_years)) %>%
ggplot(aes(y = age_group, x = age_years)) + geom_boxplot()
#Another option would be to facet by age group and to let the scales be "free" to get a better look
ds %>% group_by(id, age_group) %>%
summarize(age_years = mean(age_years)) %>%
ggplot(aes(y = age_years)) +
geom_boxplot() +
facet_wrap("age_group", scales = "free")
#Make a summary table of age in years by age group to check whether all participants' ages are correct
ds %>% group_by(age_group) %>% summarize(min_age = min(age_years), max_age = max(age_years))
#3C SEEN VIDEOS BEFORE:
#How many total participants saw each video before collapsed across age? Make a summary table and a bar plot to illustrate this.
#Make a table to show how many participants in each age group have seen the videos before
#How many total participants saw each video before collapsed across age? Make a summary table and a bar plot to illustrate this.
#Summing "watched == "Yes" will give us the summary of how many ppts in each grouping have watched each video
ds %>% group_by(stim) %>%
summarize(n_watched = sum(watched == "Yes"))
#Plotting watched with bar will give us the counts, facet by stimulus to show it separately by video
ds %>% ggplot(aes(x = watched)) + geom_bar() + facet_grid("stim")
#Make a table to show how many participants in each age group have seen the videos before
#Similar to the above, but adding age_group to group_by gives us the totals within age group
#Pivoting wider makes it a little bit more readable
ds %>% group_by(age_group, stim) %>%
summarize(n_watched = sum(watched == "Yes")) %>%
pivot_wider(id_cols = "age_group", names_from = "stim", values_from = "n_watched")
#3D AUC VALUES:
#Are the two AUC values all within the possible range (0,1)? Create a plot and a summary table to investigate.
#Plot AUCs by stimulus and age to explore whether they might be related.
#How do they compare to chance (.5)? Plot AUCs across their full range 0-1.
#Easier if we pivot AUC to longer
ds_longer <- ds %>% pivot_longer(starts_with("AUC"), names_to = "model", values_to = "AUC")
#Are the two AUC values all within the possible range (0,1)? Create a plot and a summary table to investigate.
#Histogram of AUC with bar fills determined by model, seems like they're all in range
ds_longer %>% ggplot(aes(x = AUC, fill = model)) + geom_histogram() + xlim(0,1)
#Are the two AUC values all within the possible range (0,1)? Create a plot and a summary table to investigate.
#Summary table version shows that all AUCs are > 0 and < 1
ds_longer %>% group_by(model) %>% summarize(min = min(AUC), max = max(AUC))
#Plot AUCs by stimulus and age to explore whether they might be related. How do they compare to chance (.5)?
ds_longer %>% ggplot(aes(x = age, y = AUC, color = model)) +
geom_point() +
facet_wrap("stim") +
geom_hline(yintercept = .5) +
ylim(0,1)