# Classification_example.R (forked from nfj1380/mrIML)
#------------------------------------------------------------------------------#
# doubly robust estimator of tx effect
#
# ASSUMPTIONS
# tx is denoted by A and is binary coded as {0,1}
#
# INPUTS
# moOR : a modeling object specifying the outcome regression step
# moPS : a modeling object specifying the propensity score regression step
# data : a data.frame containing covariates and tx
# y : a vector containing the outcome of interest
#
# RETURNS
# a list containing
# fitOR : a modelObjFit object containing the results of the
# outcome regression step
# fitPS : a modelObjFit object containing the results of the
# propensity score regression step
# EY : the sample mean of the outcome under each tx option
# deltaHat : the doubly robust estimator for the tx effect
#------------------------------------------------------------------------------#
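#------------------------------------------------------------------------------#
# Hedged sketch only: an illustration of the doubly robust (AIPW) estimator
# described above, using plain glm() fits in place of modelObj objects (moOR, moPS).
# The function and variable names below are hypothetical; 'data' is assumed to
# contain the binary tx indicator A plus covariates, and y a continuous outcome.
drSketch <- function(data, y) {
  # outcome regression step: E[Y | A, covariates]
  fitOR <- glm(y ~ ., data = data)
  q1 <- predict(fitOR, newdata = transform(data, A = 1))  # predicted outcome under A = 1
  q0 <- predict(fitOR, newdata = transform(data, A = 0))  # predicted outcome under A = 0
  # propensity score step: P(A = 1 | covariates)
  fitPS <- glm(A ~ ., data = data, family = binomial())
  ps <- predict(fitPS, type = "response")
  A <- data$A
  # augmented inverse-probability-weighted sample means under each tx option
  EY1 <- mean(A * y / ps - (A - ps) / ps * q1)
  EY0 <- mean((1 - A) * y / (1 - ps) + (A - ps) / (1 - ps) * q0)
  list(fitOR = fitOR, fitPS = fitPS, EY = c(EY0 = EY0, EY1 = EY1), deltaHat = EY1 - EY0)
}
#------------------------------------------------------------------------------#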
#'Testing functions out on real data. I will turn this into a vignette
#' to go with the paper. Currently we have two test datasets:
#' one coinfection dataset where columns represent parasite species and
#' rows reflect sites. This dataset has only one feature (scale.prop.zos).
#' Currently this is just set up for binary classification, but multiclass/regression
#' would be useful and easy additions in the tidymodels environment.
#' Multiclass could, for example, predict SNP identities, which would be useful.
#'
#
#I lean toward mrIML as a name for the package.
#For the paper I'm thinking of having these test datasets plus simulations:
#1. FIV bobcat data (host/pathogen example)
#2. Amphibian SNP data (landscape genetics).
#3. Moose microbiome data. This would hit the key audience for Molecular Ecology Resources.
#4. PRRS data on multi-strain circulation in swine: https://www.biorxiv.org/content/10.1101/2020.04.09.034181v2
#There could then be separate papers on coinfection data extending
#the approach to focus on co-occurrence (lion/avian malaria data),
#as well as a paper tracking between-farm transmission (Gustavo's data).
#--------------------------------------------------------------------------------------------------------------
# The ipak function can be used to install and load multiple R packages (thanks to Steven Worthington for this code).
# It checks whether packages are installed, installs any that are missing, then loads them into the R session.
# This only works for CRAN packages; details are provided below for the others required.
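# A minimal sketch of such an install-if-missing helper (adapted from Steven Worthington's ipak gist):
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]  # packages not yet installed
  if (length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)                    # load everything
}
# e.g. ipak(c("vip", "tidymodels", "tidyverse", "flashlight"))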
#load packages
library(vip)
library(mice)
library(tidymodels)
library(pdp)
library(randomForest)
library(caret)
library(missForest)
library(gbm)
library(tidyverse)
library(doParallel)
library(themis)
library(viridis)
library(janitor)
library(hrbrthemes)
library(MRFcov)
library(xgboost)
library(vegan)
library(ggrepel)
library(LEA)
library(ape)
library(flashlight)
library(devtools)
install_github("mayer79/flashlight") #only needs to be run once; ideally install before library(flashlight) if the development version is needed
# load all function code. This will disappear once we formally turn this into a package
source("./R/filterRareCommon.R")
source("./R/MrIMLpredicts.R")
source("./R/StackPredictions.R")
source("./R/devianceResids.R")
source("./R/filterRareCommon.R")
source("./R/MrIMLperformance.R")
source("./R/mrvip.R")
source("./R/plot_vi.R")
#new interaction code
source("./R/mrInteractions.R")
source("./R/mrPlotInteractions.R") #not finding this for some reason - no idea why
source("./R/vintTidy.R")
#Nick C - this and the function below are the functions with tidy model code I made from your original
source("./R/stacked_preds.R") #this does the stacking
source("./R/response_covariance") #this should create the covatiance matrix #not working?
source("./R/readSnpsPed.R") #function for reading SNP data from plink .ped file
source("./R/ResistanceComponents.R") #function for generating resistance component data from resistance matrices
source("./R/mrFlashlight.R")
source("./R/mrProfileplots.R")
#----------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------
#Example: Bobcat SNP data- from Plink file
#---------------------------------------------------------------------------------
snps <- readSnpsPed("bobcat.plink.ped", "bobcat.plink.map.map") #NAs in the data are imputed as the mode.
X <- filterRareCommon(snps, lower = 0.4, higher = 0.7) #these thresholds are harsh
X <- X[-c(279:281),] #these individuals didn't match
#whichever individual X128 is, it needs to go as it is an extreme outlier!
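# Hedged: if X128 is not already among the rows dropped above, it can be removed explicitly by name
if ("X128" %in% rownames(X)) X <- X[rownames(X) != "X128", ]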
#filter correlated SNPs
df2 <- cor(X) #find correlations
hc <- findCorrelation(df2, cutoff = 0.5) # any value can be used as the "cutoff"; this is quite strict
hc <- sort(hc)
if (length(hc) > 0) X <- X[, -hc] #drop the correlated SNPs
# Labelled Y for simplicity
#Create resistance components for the features using PCoA.
#Load the resistance matrices generated using Circuitscape: add the matrices to a subfolder in the working directory
#and provide that folder name to the function below. This creates a resistance-component dataframe. MEMs could be added here too (an illustrative PCoA sketch follows after this block).
Y <- resist_components(data_resist, p_val=0.01, filename = 'Bobcat_cs_matrices')
#check for correlations and remove
df2 <- cor(Y) #find correlations
hc <- findCorrelation(df2, cutoff=0.9) # put any value as a "cutoff". This is quite high
hc <- sort(hc)
Y <- Y[,-c(hc)] #reduced set of Y features. Labelled Y for simplicity
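# Illustration only (not the package code): resist_components() is built around this kind of
# PCoA step, applied to each resistance matrix in the supplied folder together with significance
# filtering. The demo_* objects below use a synthetic 5 x 5 distance matrix as a stand-in.
demo_rmat <- as.matrix(dist(matrix(rnorm(25), nrow = 5)))  # stand-in resistance matrix
demo_pcoa <- ape::pcoa(as.dist(demo_rmat))                 # principal coordinates analysis
demo_comps <- as.data.frame(demo_pcoa$vectors[, 1:2])      # first two resistance components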
#---------------------------------------------------------------------------------
#Example: Bobcat FIV data - read viral data already curated
#---------------------------------------------------------------------------------
#load data for training
Responsedata <-read.csv("GF SNPs posBinary.csv")
rownames(Responsedata) <-Responsedata$X
Responsedata$X <- NULL
Responsedata[Responsedata==4] <-0 #fix random 4s in response data.
#feature set. Note that samples must be rows.
Features <-read.csv("Landscape and host data.csv", row.names = 1, head=T)
#summary(Features)
# Instead we could develop a cut-off % for imputation in func_load_data. A stray row seems to have been added to the feature data somehow.
#Features<-na.interpolation(Features, option = "spline")
# or remove NAs from the feature/predictor data:
FeaturesnoNA<-Features[complete.cases(Features), ];str(Features) #dropping NAs
Y <- FeaturesnoNA #for simplicity
#for more efficient testing of interactions (more variables means more interacting pairs)
Y <- FeaturesnoNA[c(1:3)]
#Optional: filter rare/common SNPs or species, e.g. retaining minor allele frequencies > 0.1 and removing common alleles (occurrence > 0.9).
fData <- filterRareCommon(Responsedata, lower = 0.4, higher = 0.7)
X <- fData #for simplicity when comparing
#i.e. responses that occur in too few or too many individuals are dropped (a rough illustration of this filtering follows below)
#fData <- rownames_to_column(fData, "Individual")#get individual id back
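# Rough illustration only of the prevalence filtering above (the actual filterRareCommon()
# implementation in ./R/filterRareCommon.R may differ in its details):
demo_prev <- colMeans(Responsedata)                             # prevalence of each SNP across individuals
demo_kept <- Responsedata[, demo_prev > 0.4 & demo_prev < 0.7]  # roughly what lower/higher retain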
#the user can specify any tidymodels model here.
#---------------------------------------------------------------------------------
#Coinfection test data from MRFcov. These data can also be used in this pipeline, replacing X/Y
#---------------------------------------------------------------------------------
X <- select(Bird.parasites, -scale.prop.zos) #response variables, e.g. SNPs, pathogens, species...
Y <- select(Bird.parasites, scale.prop.zos) # feature set
#---------------------------------------------------------------------------------
#Simulated data
#---------------------------------------------------------------------------------
X <- read.csv('grid101_binary.csv', row.names = NULL, head=T)
X[1:9] <- NULL
Y <- read.csv('simple_sims_env.csv')
#note that for multinomial models we do not yet have a good solution for calculating model deviance;
#this means that model stacking is currently unavailable for this class of models.
#multinomial models aren't working either...
###############################################################################
#--------------------------------------------------------------------------------------------------------------------------------------
#creating the models
#-------------------------------------------------------------------
#Pre-training data visualization - check for skewed/long-tailed response distributions
#-------------------------------------------------------------------
## GM add here (a hedged sketch is included below in the meantime)
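# Hedged sketch for this step (not an mrIML function): bar-plot the prevalence of each
# response to spot very rare or near-fixed responses before modelling.
X %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(everything(), names_to = "response", values_to = "prevalence") %>%
  ggplot(aes(x = reorder(response, prevalence), y = prevalence)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Prevalence")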
#-------------------------------------------------------------------
#Set up the model
#-------------------------------------------------------------------
#user provides the models they would like for each component (model1)
model1 <- #model used to generate yhat
# specify that the model is a logistic regression
logistic_reg() %>%
# select the engine/package that underlies the model
set_engine("glm") %>%
# choose either the continuous regression or binary classification mode
set_mode("classification") #just for your response
model1 <-
rand_forest(trees = 100, mode = "classification") %>% #this should cope with multinomial data already
set_engine("ranger", importance = "impurity") %>% #ranger takes a single importance mode, e.g. "impurity" or "impurity_corrected"
set_mode("classification")
#Boosted model (xgboost). Not tuned currently either. This doesn't work with the bobcat dataset - it is much too small.
model1 <-
boost_tree() %>%
set_engine("xgboost") %>%
set_mode("classification")
#SVMs need different tuning parameters; currently tuning works best for tree-based algorithms. A hedged SVM example follows.
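# Hedged illustration of an SVM specification (kernlab engine); cost and rbf_sigma would
# need their own tuning grid, so it is assigned to model_svm rather than overwriting model1.
model_svm <-
svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
set_engine("kernlab") %>%
set_mode("classification")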
#---------------------------------------------------------------------
#Perform the analysis
#---------------------------------------------------------------------
#parallel processing - register a backend first (sketch below)
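# Hedged sketch: register a parallel backend before running with parallel = TRUE
# (this assumes mrIMLpredicts picks up the registered foreach/doParallel backend).
cl <- parallel::makeCluster(parallel::detectCores() - 1) # core count is just an example
registerDoParallel(cl)
# when finished: parallel::stopCluster(cl)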
#models just using features/predictor variables.
yhats <- mrIMLpredicts(X=X, Y=Y, model1=model1, balance_data='no', model='classification', parallel = TRUE) ## in MrTidymodels. balance_data='up' upsamples and 'down' downsamples to create a balanced set; for regression, 'no' has to be selected.
#-------------------------------------------------------------------
# Visualization of model tuning and performance
#-------------------------------------------------------------------
## GM add here (a quick MCC-per-response plot is sketched after mrIMLperformance below)
#we can now assess model performance from the best tuned model
# MCC is useful (higher numbers = better fit)
#save(yhats, file='rf_yhats')
load('rf_model_sim')
ModelPerf <- mrIMLperformance(yhats, model1, X=X, model='classification')
ModelPerf[[1]] #predictive performance for individual responses
ModelPerf[[2]] #overall predictive performance: R2 for regression and MCC for classification
modelPerfD <- ModelPerf[[1]]
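# Hedged sketch of a quick performance plot for this step: MCC per response
# (column names 'response' and 'mcc' assumed, as used in the plots further below).
modelPerfD %>%
  drop_na() %>%
  ggplot(aes(x = response, y = mcc)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "MCC (higher = better fit)")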
save(modelPerfD, file = 'simRF_performance')
#-------------------------------------------------------------------
# Visualization of individual and global feature importance
#-------------------------------------------------------------------
## GM add here
#we can look at variable importance. The co-infection data only have one feature, so this is of limited use there.
VI <- mrVip(yhats, Y=Y)
#plot model similarity
#plot variable importance
#for interpretation, group the features. Annoying, but I can't see an easier way. Features are in alphabetical order.
groupCov <- c(rep ("Host_characteristics", 1),rep("Urbanisation", 3), rep("Vegetation", 2), rep("Urbanisation",1), rep("Spatial", 2),
rep('Host_relatedness', 6),rep ("Host_characteristics", 1),rep("Vegetation", 2), rep("Urbanisation",1))
#note that GLMs in particular won't produce coefficients for features that are strongly collinear and will drop them from the model;
#in this case groupCov will have to be changed to reflect the features actually included in the model.
plot_vi(VI=VI, X=X, Y=Y, modelPerf=ModelPerf, cutoff= 0.6, plot.pca='no') #note there are two plots here. The PCA is hard to read with > 50 response variables
#if you don't want/need to group covariates:
#The first plot is overall importance, the second is a PCA showing responses with similar importance scores, and the third shows the individual SNP models.
#warnings are about x-axis labels and can be ignored
# we can build a flashlight object for individual responses
fl <- mrFlashlight(yhats, X, Y, response = "single", index=1, model='classification')
plot(light_performance(fl), fill = "orange", rotate_x = TRUE) +
labs(x = element_blank()) +
theme(axis.text.x = element_text(size = 8))
#plot(light_breakdown(fl , new_obs = cbind(X, Y)[1, ]),by = X, v=Y) #prints all responses - need to fix but could be quite handy.
#int <- light_interaction(fl, pairwise=TRUE) #not working, but possible!
#------------------------------------------------------------
#Multiple response
flashlightObj <- mrFlashlight(yhats, X, Y, response = "multi", model='classification')
#plots
plot(light_profile(flashlightObj, v = "simple", type = "ale"))
plot(light_profile(flashlightObj, v = "bio_1", type = "ale"))
#this will plot all responses. See mrALEplots below
#another way to assess performance for each response. Many responses make this hard to read
plot(light_performance(flashlightObj), rotate_x = TRUE, fill = "orange") +
labs(x = element_blank())
#plot prediction scatter for all responses. Gets busy with many responses
plot(light_scatter(flashlightObj, v = "Grassland", type = "predicted"))
#plots everything on one plot (partial dependency, ALE, scatter)
plot(light_effects(flashlightObj, v = "Grassland"), use = "all")
profileData_pd <- light_profile(flashlightObj, v = "simple") #partial dependencies
profileData_ale <- light_profile(flashlightObj, v = "simple", type = "ale") #accumulated local effects
#Plot the global ALE plot. This plots the smoothed average ALE value. Responses that don't respond have to be removed.
mrProfileplot(profileData_pd , sdthresh =0.07)
mrProfileplot(profileData_ale , sdthresh =0.07)
#-------------------------------------------------------------------------------------------------
#calculate interactions - this is quite slow and memory intensive
interactions <- mrInteractions(yhats, X, Y, mod='classification') #this is computationally intensive, so multiple cores are needed. If stopped prematurely, objects have to be reloaded.
mrPlot_interactions(interactions, X,Y, top_ranking = 5, top_response=5)
save(interactions, file='Fitzpatrick2016interactions')
#-------------------------------------------------------------------------------------------------
#adding other response variables to see if this improves predictions. We could say that it has only been tested
#on 3 algorithms (GAMs, XGBoost - an improved GBM - and linear models) and user beware otherwise.
covariance_mod <-
boost_tree() %>%
set_engine("xgboost") %>%
set_mode("regression")
#or
covariance_mod <-
rand_forest(trees = 100, mode = "regression") %>%
set_engine("ranger", importance = "permutation") %>%
set_mode("regression")
#rhats <- StackPredictions(yhats, data.test, covariance_mod = 'gam')
#calculate covariance predictions. Nick - I just split this function up here, as I imagine it will be handy.
rhats_2 <- response_covariance(yhats, covariance_mod)
#stack everything together
finalPred <- stacked_preds(rhats_2, yhats)
#add similarity in predictions to look for SNP patterns.
#at some point it would be good to add interactions/Shapley values here too.
#---------------------------------------------------------------------------------------------------------------
#Extra importance plots:
#another way to look at it
ModelPerf_p<-do.call(rbind.data.frame, ModelPerf)
## performance by outcome
ModelPerf_p%>%
drop_na()%>%
ggplot(aes(sensitivity, specificity, shape=response, colour=response , fill=response)) +
geom_smooth(method="lm") +
geom_point(size=3)
## also
ModelPerf_p%>%
drop_na()%>%
ggplot(aes(mcc, specificity, shape=response, colour=response, fill=response )) +
geom_smooth(method="lm") +
geom_point(size=3)
### Not sure we want to have this
ModelPerf_p[3] #summary mcc for all response variables