-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlib.r
executable file
·202 lines (163 loc) · 9.07 KB
/
lib.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
##
## This function retrieves runinfos and instrument infos from
## mzfile given in argument
##
getInfos <- function(mzdatafile) {
# Get informations about instruments used and run
ms <- openMSfile(mzdatafile)
runInfo <- t(runInfo(ms))
instrumentInfo <- t(instrumentInfo(ms))
infos <- list("runInfo" = runInfo, "instrumentInfo" = instrumentInfo)
return(infos)
}
##
## This function launches IPO functions to get the best parameters for xcmsSet
## 5% but at least 10 files of the whole dataset is used to save processing time
##
ipo4xcmsSet <- function(directory, parametersOutput, listArguments) {
setwd(directory)
files <- list.files(".", recursive = T)
# Check if there are blank files
# TODO Change the method retrieving blank samples, with a more "official way" then "grep" maybe ?
blank.files <- grep("blan(k|c)", files, ignore.case = TRUE, value = TRUE)
# Keep only QCs and/or pool files if possible since they are more representative of the experimental study
representative.files <- grep("(QC)|(pool)", files, ignore.case = TRUE, value = TRUE)
if (length(representative.files) != 0) { # If pools or QC, keep only them
mzfiles <- representative.files
if (length(blank.files) != 0) { # Keep also blanks if there are
mzfiles <- c(mzfiles, blank.files)
}
} else {
# To reduce processing time, keep 5% but at least 10 raw data files of the assay
if (length(files) <= 10) {
mzfiles <- files
} else if (ceiling((5 * length(files)) / 100) < 10) {
mzfiles <- sample(files, 10)
} else {
mzfiles <- sample(files, ceiling((5 * length(files)) / 100))
}
}
cat("\t\tSamples used:\n")
print(mzfiles)
peakpickingParameters <- getDefaultXcmsSetStartingParams(listArguments[["method"]]) # get default parameters of IPO
# filter listArguments to only get releavant parameters and complete with those that are not declared
peakpickingParametersUser <- c(listArguments[names(listArguments) %in% names(peakpickingParameters)], peakpickingParameters[!(names(peakpickingParameters) %in% names(listArguments))])
peakpickingParametersUser$verbose.columns <- TRUE
# allow range for min and max peakwidth and ppm if given in arguments
if (!is.null(listArguments[["minPeakWidth"]])) {
peakpickingParametersUser$min_peakwidth <- as.vector(as.numeric(unlist(strsplit(listArguments[["minPeakWidth"]], split = ","))))
listArguments[["minPeakWidth"]] <- NULL
}
if (!is.null(listArguments[["maxPeakWidth"]])) {
peakpickingParametersUser$max_peakwidth <- as.vector(as.numeric(unlist(strsplit(listArguments[["maxPeakWidth"]], split = ","))))
listArguments[["maxPeakWidth"]] <- NULL
}
if (!is.null(listArguments[["ppm"]])) {
if (grepl(",", listArguments[["ppm"]])) {
peakpickingParametersUser$ppm <- as.vector(as.numeric(unlist(strsplit(listArguments[["ppm"]], split = ","))))
} else {
peakpickingParametersUser$ppm <- listArguments[["ppm"]]
}
listArguments[["ppm"]] <- NULL
}
# peakpickingParametersUser$profparam <- list(step=0.005) #not yet used by IPO have to think of it for futur improvement
resultPeakpicking <- optimizeXcmsSet(mzfiles, peakpickingParametersUser, nSlaves = peakpickingParametersUser$nSlaves, subdir = "../IPO_results") # some images generated by IPO
# Export results
resultPeakpicking_best_settings_parameters <- resultPeakpicking$best_settings$parameters[!(names(resultPeakpicking$best_settings$parameters) %in% c("nSlaves", "verbose.columns"))]
# Export:
# - TSV output for run / instrument informations of all files in the dataset
# - TSV output for best settings parameters
table <- NULL
for (filename in mzfiles) {
one.file.infos <- getInfos(filename)
infos <- cbind(filename, one.file.infos$instrumentInfo, one.file.infos$runInfo)
table <- rbind(table, infos)
}
# Export run and instrument infos
write.table(table, file = "../run_instrument_infos.tsv", sep = "\t", row.names = F, col.names = T, quote = F)
# Export best parameters of peak picking
write.table(as.matrix(resultPeakpicking_best_settings_parameters), file = parametersOutput, sep = "\t", row.names = T, col.names = F, quote = F)
# Returns best settings containing among others:
# - Best Xset (xcmsSet object)
# - Best Xset parameters
# - PeakPickingScore (PPS)
return(resultPeakpicking)
}
##
## This function launches IPO functions to get the best parameters for group and retcor
## Factors are extracted from the sample metadata file to fill the phenoData of the xcmsSet object
## This ensures that the experimental design factors are taken into account
##
ipo4retgroup <- function(xset, sample.metadata.file, directory, parametersOutput, listArguments) {
setwd(directory)
files <- list.files(".", recursive = T)
# Read the sample.metadata (W4M) file to have access to factors of the assay
sample.metadata <- read.table(sample.metadata.file, header = TRUE)
# Get all factors columns
factors <- sample.metadata[, grep("Factor.Value.*", colnames(sample.metadata), value = TRUE, ignore.case = TRUE)]
if (length(factors) != 0) {
# Link all files to their factors
factors <- cbind(levels(sample.metadata$Raw.Spectral.Data.File)[sample.metadata$Raw.Spectral.Data.File], factors)
# Get indexes of files that are in the xcmsSet object
files.index <- which(levels(sample.metadata$Raw.Spectral.Data.File)[sample.metadata$Raw.Spectral.Data.File] %in% basename(xset@filepaths))
# Get the factors of these files respectively
factors.of.files <- factors[files.index, 2:ncol(factors)]
# Fill the phenoData of xset: indicate the experimental design factors
# for grouping and retention time correction
xset <- `sampclass<-`(xset, factors.of.files)
} else {
cat("\n\nWARNING: no factors found, check that the sample metadata file is correct.\nNo sample class is set for the xcmsSet object.\n\n")
}
# Retrieve default parameters
retcorGroupParameters <- getDefaultRetGroupStartingParams(listArguments[["retcorMethod"]]) # get default parameters of IPO
print(retcorGroupParameters)
# filter listArguments to only get releavant parameters and complete with those that are not declared
retcorGroupParametersUser <- c(listArguments[names(listArguments) %in% names(retcorGroupParameters)], retcorGroupParameters[!(names(retcorGroupParameters) %in% names(listArguments))])
print("retcorGroupParametersUser")
print(retcorGroupParametersUser)
# Do the job !
resultRetcorGroup <- optimizeRetGroup(xset, retcorGroupParametersUser, nSlaves = listArguments[["nSlaves"]], subdir = "../IPO_results") # some images generated by IPO
# Export best retCor + grouping parameters
write.table(t(as.data.frame(resultRetcorGroup$best_settings)), file = parametersOutput, sep = "\t", row.names = T, col.names = F, quote = F) # can be read by user
}
##
## This function checks if xcms will find all the files
##
# @author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
checkFilesCompatibilityWithXcms <- function(directory) {
cat("Checking files filenames compatibilities with xmcs...\n")
# WHAT XCMS WILL FIND
filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]", "[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
filepattern <- paste(paste("\\.", filepattern, "$", sep = ""), collapse = "|")
info <- file.info(directory)
listed <- list.files(directory[info$isdir], pattern = filepattern, recursive = TRUE, full.names = TRUE)
files <- c(directory[!info$isdir], listed)
files_abs <- file.path(getwd(), files)
exists <- file.exists(files_abs)
files[exists] <- files_abs[exists]
files[exists] <- sub("//", "/", files[exists])
# WHAT IS ON THE FILESYSTEM
filesystem_filepaths <- system(paste("find $PWD/", directory, " -not -name '\\.*' -not -path '*conda-env*' -type f -name \"*\"", sep = ""), intern = T)
filesystem_filepaths <- filesystem_filepaths[grep(filepattern, filesystem_filepaths, perl = T)]
# COMPARISON
if (!is.na(table(filesystem_filepaths %in% files)["FALSE"])) {
write("\n\nERROR: List of the files which will not be imported by xcmsSet", stderr())
write(filesystem_filepaths[!(filesystem_filepaths %in% files)], stderr())
stop("\n\nERROR: One or more of your files will not be imported by xcmsSet. It may be due to bad characters in their filenames.")
}
}
##
## This function checks if XML contains special caracters. It also checks integrity and completness.
##
# @author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
checkXmlStructure <- function(directory) {
cat("Checking XML structure...\n")
cmd <- paste("IFS=$'\n'; for xml in $(find", directory, "-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'); do if [ $(xmllint --nonet --noout \"$xml\" 2> /dev/null; echo $?) -gt 0 ]; then echo $xml;fi; done;")
capture <- system(cmd, intern = TRUE)
if (length(capture) > 0) {
# message=paste("The following mzXML or mzML file is incorrect, please check these files first:",capture)
write("\n\nERROR: The following mzXML or mzML file(s) are incorrect, please check these files first:", stderr())
write(capture, stderr())
stop("ERROR: xcmsSet cannot continue with incorrect mzXML or mzML files")
}
}