-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGeneralized Linear Models.R
138 lines (105 loc) · 5.58 KB
/
Generalized Linear Models.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
###############################################################################################
## Script generated by Apostolis Stefanidis & Konstantina Zografou ############################
## for the study entitled: Mitigating the extinction risk of globally threatened and endemic ##
## mountainous Orthoptera species: Parnassiana parnassica and Oropodisma parnassica ###########
###############################################################################################
# Generalized Linear Models
#>Generalized linear models to identify important microhabitat parameters
#>explanatory variables = microhabitat parameters
#>response variable = species abundance
# Load required libraries
library(car)     # vif()
library(DHARMa)  # residual diagnostics
library(glmnet)  # cv.glmnet()
library(lme4)    # glmer(), glmer.nb()
library(MASS)    # glm.nb()
library(MuMIn)   # dredge(), model.avg()
library(pscl)    # pR2()
# Load the datasets (replace with your actual file paths)
# Each CSV is expected to hold one row per survey with columns "site", "year",
# "response_variable" (species abundance) and the microhabitat predictors --
# NOTE(review): column layout assumed from Steps 1-4 below; confirm against the raw files.
data_pp <- read.table("Pparnassica.csv", sep=",", header=TRUE) # P. parnassica dataset
data_op <- read.table("Oparnassica.csv", sep=",", header=TRUE) # O. parnassica dataset
# Step 1: Multi-collinearity check ----------------------------------------
# Candidate explanatory variables: drop the response and the site/year ID
# columns, which are not microhabitat parameters.
id_cols <- c("site", "year", "response_variable")
variables_pp <- data_pp[, !names(data_pp) %in% id_cols, drop = FALSE]
variables_op <- data_op[, !names(data_op) %in% id_cols, drop = FALSE]
# Spearman rank correlations among the numeric candidate predictors.
# (cor() errors on non-numeric columns, so restrict explicitly.)
correlation_pp <- cor(variables_pp[, sapply(variables_pp, is.numeric), drop = FALSE],
                      method = "spearman")
correlation_op <- cor(variables_op[, sapply(variables_op, is.numeric), drop = FALSE],
                      method = "spearman")
# VIF check for P. parnassica. The model is built from the response plus the
# candidate predictors only; regressing on the full data frame would wrongly
# treat the site and year ID columns as predictors.
vif_model_pp <- lm(response_variable ~ .,
                   data = data_pp[, c("response_variable", names(variables_pp))])
vif_values_pp <- vif(vif_model_pp)
# VIF check for O. parnassica
vif_model_op <- lm(response_variable ~ .,
                   data = data_op[, c("response_variable", names(variables_op))])
vif_values_op <- vif(vif_model_op)
# Step 2: Check for influential values (outliers) -------------------------
# P. parnassica: flag observations via Cook's distance and keep only those
# at or below the conventional threshold of 1.
model_pp <- lm(response_variable ~ ., data = data_pp)
cooksd_pp <- cooks.distance(model_pp)
within_threshold_pp <- cooksd_pp <= 1
data_pp_clean <- data_pp[within_threshold_pp, ]
# O. parnassica: distances computed for inspection only; every row is kept.
model_op <- lm(response_variable ~ ., data = data_op)
cooksd_op <- cooks.distance(model_op)
data_op_clean <- data_op
# Step 3: Standardize continuous variables --------------------------------
# Scale the predictors only: the response and the site/year ID columns must
# be excluded, otherwise the (scaled) response ends up inside the LASSO
# design matrix built from these data frames in Step 4.
non_predictors <- c("site", "year", "response_variable")
is_pred_pp <- sapply(data_pp_clean, is.numeric) &
  !(names(data_pp_clean) %in% non_predictors)
data_pp_clean_scaled <- as.data.frame(scale(data_pp_clean[, is_pred_pp, drop = FALSE]))
is_pred_op <- sapply(data_op_clean, is.numeric) &
  !(names(data_op_clean) %in% non_predictors)
data_op_clean_scaled <- as.data.frame(scale(data_op_clean[, is_pred_op, drop = FALSE]))
# Step 4: Variable selection using LASSO and Elastic Net ------------------
# Helper: repeat cv.glmnet `reps` times on a common lambda path and return
# the lambda with the smallest cross-validated error averaged over the
# repetitions. Fold assignment in cv.glmnet is random, so averaging
# stabilizes the choice of lambda; reusing the first fit's lambda sequence
# guarantees the error curves are comparable across repetitions (separate
# runs otherwise generate different lambda paths).
average_cv_lambda <- function(x, y, alpha, reps = 100) {
  first_cv <- cv.glmnet(x, y, alpha = alpha, family = "poisson")
  # Preallocate the error matrix instead of growing it with cbind().
  errors <- matrix(NA_real_, nrow = length(first_cv$lambda), ncol = reps,
                   dimnames = list(first_cv$lambda, NULL))
  errors[, 1] <- first_cv$cvm
  for (i in seq_len(reps - 1) + 1) {
    rep_cv <- cv.glmnet(x, y, alpha = alpha, family = "poisson",
                        lambda = first_cv$lambda)
    errors[, i] <- rep_cv$cvm
  }
  first_cv$lambda[which.min(rowMeans(errors))]
}
# LASSO (alpha = 1) for P. parnassica
x_pp <- as.matrix(data_pp_clean_scaled)
y_pp <- data_pp_clean$response_variable
cv_model_pp <- cv.glmnet(x_pp, y_pp, alpha = 1, family = "poisson")
best_lambda_pp <- average_cv_lambda(x_pp, y_pp, alpha = 1)
# Elastic Net (alpha = 0.5) for O. parnassica
x_op <- as.matrix(data_op_clean_scaled)
y_op <- data_op_clean$response_variable
cv_model_op <- cv.glmnet(x_op, y_op, alpha = 0.5, family = "poisson")
best_lambda_op <- average_cv_lambda(x_op, y_op, alpha = 0.5)
# Step 5: Fit GLMs --------------------------------------------------------
# coef() on a cv.glmnet fit returns a sparse one-column matrix, so the
# variable names live in rownames(), not names(); the "(Intercept)" row must
# also be dropped before the names are pasted into a model formula.
# GLM for P. parnassica (Poisson)
final_vars_pp <- as.matrix(coef(cv_model_pp, s = best_lambda_pp))
selected_vars_pp <- setdiff(rownames(final_vars_pp)[final_vars_pp != 0],
                            "(Intercept)")
glm_pp <- glm(as.formula(paste("response_variable ~",
                               paste(selected_vars_pp, collapse = "+"))),
              family = poisson, data = data_pp_clean)
# GLM for O. parnassica (Negative Binomial). glm.nb() estimates the
# dispersion parameter theta by maximum likelihood; negative.binomial()
# with no theta argument is an error.
final_vars_op <- as.matrix(coef(cv_model_op, s = best_lambda_op))
selected_vars_op <- setdiff(rownames(final_vars_op)[final_vars_op != 0],
                            "(Intercept)")
glm_op <- glm.nb(as.formula(paste("response_variable ~",
                                  paste(selected_vars_op, collapse = "+"))),
                 data = data_op_clean)
# Step 6: Fit GLMMs --------------------------------------------------------
# GLMM for P. parnassica: Poisson with a random intercept per site
glmm_pp <- glmer(as.formula(paste("response_variable ~",
                                  paste(selected_vars_pp, collapse = "+"),
                                  "+ (1 | site)")),
                 family = poisson, data = data_pp_clean)
# GLMM for O. parnassica: glmer.nb() estimates the negative-binomial
# dispersion parameter; glmer(..., family = negative.binomial()) without a
# theta value is an error.
glmm_op <- glmer.nb(as.formula(paste("response_variable ~",
                                     paste(selected_vars_op, collapse = "+"),
                                     "+ (1 | site)")),
                    data = data_op_clean)
# Likelihood ratio tests comparing GLM and GLMM. The merMod object goes
# first so that the anova.merMod method (which knows how to compare a mixed
# model against a plain glm) is dispatched.
lrt_pp <- anova(glmm_pp, glm_pp, test = "Chisq")
lrt_op <- anova(glmm_op, glm_op, test = "Chisq")
# Step 7: Multimodel inference --------------------------------------------
# dredge() refuses to run unless na.action is na.fail, so that every
# candidate model is guaranteed to be fitted to the same set of rows.
options(na.action = "na.fail")
# Model selection and averaging for P. parnassica (models within 2 AICc
# units of the best model are averaged).
dredge_pp <- dredge(glm_pp, rank = AICc)
model_avg_pp <- model.avg(dredge_pp, subset = delta <= 2)
# Model selection and averaging for O. parnassica
dredge_op <- dredge(glm_op, rank = AICc)
model_avg_op <- model.avg(dredge_op, subset = delta <= 2)
# Step 8: Evaluate goodness-of-fit ----------------------------------------
# Pseudo R-squared values (pscl::pR2) for both final GLMs
pseudo_r2_pp <- pR2(glm_pp)
pseudo_r2_op <- pR2(glm_op)
# Output results: model summaries followed by the averaged-model summaries
print(summary(glm_pp))
print(summary(glm_op))
print(summary(model_avg_pp))
print(summary(model_avg_op))