-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLangComplex.R
224 lines (180 loc) · 8.65 KB
/
LangComplex.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
library(tidyverse)
library(ggplot2)
library(gvlma)
library(car)
library(RCurl)
library(plyr)
docloc='https://docs.google.com/spreadsheets/d/e/2PACX-1vSzvJcT6yT9_fpRoFg5O7LAput7VKKltSxAuGMyC5wDlo_75D9ELA8YaVeMIVwcLw/pub?gid=1294110857&single=true&output=csv'
myfile <- getURL(docloc, ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
all_data<- read.csv(textConnection(myfile), header=T)
# if there is a new version of the data file, get it from here:
# https://docs.google.com/spreadsheets/d/1rhpTqgpv9VgsZOtoEHqxwShYrZIsbiB_/edit#gid=1860768761
#an error can be solved by using ";" or "," as a separator
all_data<- read.csv("./Data/CR_by_child-updated_3_08.xlsx - MAIN.csv", header=T,sep=";")
tsi_data<-read.csv("./Data/CR_by_child_tsi.csv",header=T)
names(tsi_data)[names(tsi_data) == "Age"] <- "Age.in.months"
all_data<-rbind.fill(all_data, tsi_data)
summary(all_data)
dim(all_data)
# apply exclusions
#data.sub <- subset(all_data, Age_in_months<=50)
#removed: corpus != "Warlaumont" & corpus != "Cychosz" & because we decided to include bilinguals
all_data->data.sub
#correct some data issues
data.sub$CR=as.numeric(gsub(",",".",data.sub$CR))
data.sub$SylComp=factor(data.sub$Syllable.complexity,levels=c("Low","Moderate","High"))
data.sub$Age=as.numeric(gsub(",",".",data.sub$Age.in.months))
data.sub$Age2=data.sub$Age^2 #generate squared component
data.sub$Age3=data.sub$Age^3 #generate cubic component
data.sub<-subset(data.sub, !is.na(SylComp))
# add more information
data.sub$coding<-ifelse(data.sub$corpus %in% c("Solomon","French"),"lab","citsci")
# describe data
table(data.sub$corpus)
table(data.sub$Language) #shows N kids per language
table(data.sub$SylComp,data.sub$Language) #notice that all the moderate data comes from Tsimane
#histograms
hist(data.sub$CR,main="CR",xlab="CR") #quite normally distributed
hist(data.sub$CR[data.sub$SylComp=="Low"],main="Low Syllable Complexity",xlab="CR") #looks ok
hist(data.sub$CR[data.sub$SylComp=="Moderate"],main="Moderate Syllable Complexity",xlab="CR") #less good, quite flat
hist(data.sub$CR[data.sub$SylComp=="High"],main="High Syllable Complexity",xlab="CR") #idem
# plots
ggplot(data.sub,aes(x=CR,fill=SylComp))+
geom_histogram(bins=20,color="black") +
facet_grid(.~SylComp)
ggplot(data.sub, aes(x=Age, y=CR, color=Language)) +
labs(x = "Age (months)")+
labs(y = "CP")+
geom_point()+
# Add regression lines
geom_smooth(method=lm,se=FALSE)
# Fit most complex model
mod_complex=lm(CR~Age*SylComp+Age2*SylComp+Age3*SylComp,data=data.sub)
#check for assumptions
plot(mod_complex) #looks pretty ok
gvlma(mod_complex) #assumptions met
#compare to simpler model
mod_simple=lm(CR~Age+SylComp,data=data.sub)
anova(mod_simple,mod_complex)
# the more complex model explains sig more variance, despite added model complexity
# check whether it's simply due to age polynomials
mod_age=lm(CR~Age+Age2+Age3,data=data.sub)
anova(mod_complex,mod_age) #no, model with interaction terms is much better than even the one with polynomials
# check whether it's the interaction
mod_int=lm(CR~Age*SylComp,data=data.sub)
anova(mod_complex,mod_int)
#NOTE! This changed with more data
# I first thought it seems so, because the more complex model is only marginally better than this simpler one, with interaction
#but now the more complex model is sig better than this simpler one
# add back polynomials but without interaction
mod_int_age=lm(CR~Age*SylComp+Age2+Age3,data=data.sub)
anova(mod_complex,mod_int_age)
#NOTE: this also changed with more data
# originally I thought model with interactions on all the polyn terms is no better
# but now it is
mod_int_age2=lm(CR~Age*SylComp+Age2,data=data.sub)
anova(mod_int_age,mod_int_age2) #and age cube doesn't add anything either
#this hasn't changed-- age cube didn't help
mod_int_age2_int=lm(CR~Age*SylComp+Age2*SylComp,data=data.sub)
anova(mod_complex,mod_int_age2_int) #ah note that the interaction age3*sylcomp wasn't adding anything
#check for assumptions in this new winning model
plot(mod_int_age2_int) #looks ok
gvlma(mod_int_age2_int) #passes all checks
# So look at what it says
Anova(mod_int_age2_int, type="III")
summary(mod_int_age2_int)
#main effect of age, ag2, sylcomp, interaction age*sylcom and age2*sylcom!
mod_int_age_noTsi=lm(CR~Age*SylComp+Age2*SylComp,data=data.sub,subset=c(corpus!="Tsimane"))
plot(mod_int_age_noTsi) #looks ok
gvlma(mod_int_age_noTsi) #passes all checks
Anova(mod_int_age_noTsi, type="III")
summary(mod_int_age_noTsi)
#results are not driven by Tsimane
# plot data
ggplot(data.sub, aes(x=Age, y=CR, color=SylComp)) +
geom_point()+
# Add regression lines
# geom_smooth(method=lm)+
# Add loess lines
geom_smooth(span = 0.8)
# scale ages, so that intercept corresponds to mean age
data.sub$Age.s=scale(data.sub$Age)
data.sub$Age2.s=scale(data.sub$Age2)
data.sub$Age3.s=scale(data.sub$Age3)
# better control for ages...
mod_int_age_noTsi_ageScaled=lm(CR~Age.s*SylComp+Age2.s*SylComp,data=data.sub,subset=c(corpus!="Tsimane"))
plot(mod_int_age_noTsi_ageScaled) #looks ok
gvlma(mod_int_age_noTsi_ageScaled) #checks ok
Anova(mod_int_age_noTsi_ageScaled, type="III")
summary(mod_int_age_noTsi_ageScaled)
# now we have effect of syllable complexity
# check that this is not just driven by Yeli old kids
mod_int_age_noTsi_ageScaled_no_old=lm(CR~Age.s*SylComp+Age2.s*SylComp,data=data.sub,
subset=c(corpus!="Tsimane"&Age<40))
plot(mod_int_age_noTsi_ageScaled_no_old) #looks ok
gvlma(mod_int_age_noTsi_ageScaled_no_old) #checks ok
Anova(mod_int_age_noTsi_ageScaled_no_old, type="III")
summary(mod_int_age_noTsi_ageScaled_no_old)
#replot without kids over 40
data.sub_under40=subset(data.sub, Age<40 & corpus!="Warlaumont")
# plot data
ggplot(data.sub_under40, aes(x=Age, y=CR, color=SylComp, shape=SylComp)) +
labs(title = "Distribution of CP wrt. Age (up to 40 months)")+
labs(colour = "Syllable Complexity", shape="Syllable Complexity")+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
scale_color_manual(values=c("#E69F00", "#56B4E9", "#009E73"))+
scale_shape_manual(values=c(4, 16, 0))+
labs(x = "Age (months)")+
labs(y = "CP")+
geom_point(size=2.5)+
# Add regression lines
geom_smooth(method=lm, se=FALSE)
# Add loess lines
#geom_smooth(span = 0.8)
#Mean CR and standard deviation
"Average CR"; mean(data.sub_under40$CR) ; "Standard Deviation" ; sd(data.sub_under40$CR)
"Average CR for low syllable complexity"; mean(data.sub_under40$CR[data.sub_under40$SylComp == "Low"]) ; "Standard Deviation" ; sd(data.sub_under40$CR[data.sub_under40$SylComp == "Low"])
"Average CR for moderate syllable complexity"; mean(data.sub_under40$CR[data.sub_under40$SylComp == "Moderate"]) ; "Standard Deviation" ; sd(data.sub_under40$CR[data.sub_under40$SylComp == "Moderate"])
"Average CR for high syllable complexity"; mean(data.sub_under40$CR[data.sub_under40$SylComp == "High"]) ; "Standard Deviation" ; sd(data.sub_under40$CR[data.sub_under40$SylComp == "High"])
boxplot(data.sub_under40$CR~data.sub_under40$SylComp, main="Distribution of CP by syllable complexity", xlab="Syllable complexity", ylab="CP")
# Low vs. Moderate vs. High complexity -----------------------------------------
#Pearson's correlations for each SylComp level
data.sub_under40_L <- subset(data.sub_under40, SylComp=="Low")
data.sub_under40_M <- subset(data.sub_under40, SylComp=="Moderate")
data.sub_under40_H <- subset(data.sub_under40, SylComp=="High")
"Low SylComp"; cor.test(data.sub_under40_L$Age,data.sub_under40_L$CR) #0.7598376
"Moderate SylComp"; cor.test(data.sub_under40_M$Age,data.sub_under40_M$CR) #0.3671603 Pretty low...
"High SylComp"; cor.test(data.sub_under40_H$Age,data.sub_under40_H$CR) #0.7770199
#check assumptions
#Low SylComp
lm_Low <- lm(data.sub_under40_L$CR~data.sub_under40_L$Age)
par(mfrow=c(2,2))
plot(lm_Low, main="Low SylComp")
#Moderate Sylcomp
lm_Mod <- lm(data.sub_under40_M$CR~data.sub_under40_M$Age)
par(mfrow=c(2,2))
plot(lm_Mod, main="Moderate SylComp") # homoscedasticity pretty bad
#High SylComp
lm_High <- lm(data.sub_under40_H$CR~data.sub_under40_H$Age)
par(mfrow=c(2,2))
plot(lm_High, main="High SylComp")
#Low vs. Moderate
LvM <- subset(data.sub_under40, SylComp!="High")
lm_LvM <- (lm(LvM$CR~LvM$Age))
par(mfrow=c(2,2))
plot(lm_LvM, main="Low and Moderate")
"Low vs. Moderate"; anova(lm(LvM$CR~LvM$Age*LvM$SylComp))
#Moderate vs. High
MvH <- subset(data.sub_under40, SylComp!="Low")
lm_MvH <- (lm(MvH$CR~MvH$Age))
par(mfrow=c(2,2))
plot(lm_MvH, main="Moderate and High")
"Moderate vs. High"; anova(lm(MvH$CR~MvH$Age*MvH$SylComp))
#Low vs. High
LvH <- subset(data.sub_under40, SylComp!="Moderate")
lm_LvH <- (lm(LvH$CR~LvH$Age))
par(mfrow=c(2,2))
plot(lm_LvH, main="Low and High")
"Low vs. High"; anova(lm(LvH$CR~LvH$Age*LvH$SylComp))