-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathconfig.yaml
executable file
·332 lines (254 loc) · 11.5 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
########################################################################################################################
# WORKFLOW CONFIGURATION
########################################################################################################################
# INPUT DATASET
##############################
# Reference alignment (fasta format) and reference tree (newick format).
# Exactly one dataset_align / dataset_tree pair should be left uncommented.
# PAC
# D140 (amino acids)
#dataset_align: /home/nikolai/dev/rappas/data/test/D140/140.phy.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D140/RAxML_bipartitions.140.BEST.WITH
# D155 (full virus genome, 9k positions, 2% gap rate)
#dataset_align: /home/nikolai/dev/rappas/data/test/D155/reference.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D155/tree.newick
# D218
#dataset_align: /home/nikolai/dev/rappas/data/test/D218/reference.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D218/tree.newick
# D500
#dataset_align: /home/nikolai/dev/rappas/data/DATASETS/EPA_datasets/500.phy.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/DATASETS/EPA_datasets/RAxML_result.optimization500
# D652
#dataset_align: /home/nikolai/dev/rappas/data/test/D652/bv_refs_aln_stripped_99.5.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D652/RAxML_result.bv_refs_aln
# LAC
# NOTE(review): unlike every commented-out alternative above, this path has no
# file name (it ends at the D652 directory) - confirm that this is intended.
dataset_align: /home/nikolai/dev/test/D652
dataset_tree: /home/nikolai/dev/rappas/data/test/D652/RAxML_result.bv_refs_aln
#working directory
#workdir: /home/nikolai/dev/pewo/D140
#workdir: /home/nikolai/dev/pewo/D218
#workdir: /home/nikolai/dev/pewo/D500
#workdir: /home/nikolai/dev/pewo/D652
#workdir: /home/nikolai/dev/pewo/bernoulli_500
#workdir: /home/nikolai/dev/pewo/bernoulli_652
#workdir: /home/nikolai/dev/pewo/bernoulli_155
#workdir: /home/nikolai/dev/pewo/debug_filters/D652/
# Sequence alphabet used in the analysis: 0 = nucleotides, 1 = amino acids.
states: 0
# Software to test; list at least one of:
# epa, epang, pplacer, rappas, rappas2, apples, appspam
# test_soft: [epa, rappas, rappas2]
test_soft:
  - epa
  - rappas2
# READ GENERATION
# Read lengths to generate (one entry per length).
read_length:
  - 300
# Number of random prunings to compute.
pruning_count: 25
## IF "ACCURACY" IS EVALUATED
#############################
### this section matters only when you run the "eval_accuracy.smk" workflow
## IF "RESOURCES" ARE EVALUATED
##############################
### this section matters only when you run the "eval_ressources.smk" workflow
# number of identical runs to launch when evaluating RAM/CPU consumption
# final measurements are reported as the mean of the runs.
repeats: 3
#defines the source of the queries, one of the following:
# user: query sequences are loaded from the file targeted by parameter "query_user"
# simulate: queries are simulated from the input alignment (reserved for future upgrades, currently not implemented)
query_type: user
# queries used in resource evaluation, >10000 sequences recommended
query_user: examples/6_placement_likelihood/EMP_92_studies_100.fas
########################################################################################################################
# PER SOFTWARE CONFIGURATION
########################################################################################################################
# The following section allows you to set the parameter combinations that will be tested by the workflow.
# For each software/parameter, set a list of values.
### EPA
###############################
config_epa:
  # EPA is alignment-based and uses a ML evaluation of the placement.
  # It uses a 2-step heuristic:
  #   1) rapid ML evaluation after insertion in the midpoint of each branch
  #   2) full optimization for the top scoring branches selected at step 1.
  # (Berger et al, 2011 ; doi: 10.1093/sysbio/syr010)

  # proportion of top scoring branches for which full optimization is computed
  # float in ]0,1]
  G: [0.01]
### PPLACER
###############################
config_pplacer:
  # PPLACER is alignment-based and uses a ML evaluation of the placement.
  # It uses a 2-step heuristic similar to EPA, called the "baseball" heuristic.
  # (Matsen et al, 2012 ; doi: 10.1186/1471-2105-11-538)
  max-strikes: [6, 12]
  strike-box: [3, 6]
  max-pitches: [40, 80]
  # pre-masking, 1=yes, 0=no
  premask: 1
### EPA-ng
###############################
config_epang:
  # EPA-NG is alignment-based and uses a ML evaluation of the placement.
  # Different heuristics can be tested:
  #   h1: program default, heuristic developed for EPA-ng, fastest heuristic
  #   h2: heuristic equivalent to old EPA, slow
  #   h3: heuristic equivalent to pplacer defaults, fast
  #   h4: no heuristic, very very slow but should produce the best accuracy
  # (Barbera et al, 2019 ; doi: 10.1093/sysbio/syy054)
  heuristics: ["h1"]
  # heuristic-specific parameters can be set up in the following sub-sections
  h1:
    g: [0.999, 0.99999]
  h2:
    G: [0.01, 0.1]
  h3:
    options: none  # reserved if any option appears in future versions
  h4:
    options: none  # reserved if any option appears in future versions
  # pre-masking, 1=yes, 0=no
  premask: 1
### RAPPAS
###############################
config_rappas:
  # RAPPAS uses an alignment-free approach which is completely different from the
  # alignment-based approaches of EPA, EPA-ng and PPlacer. It does not use a
  # "heuristic" per se to accelerate placements, but a 2-step approach
  # (DB build, then placements) based on the phylo-kmer idea.
  # (Linard et al, 2019 ; doi: 10.1093/bioinformatics/btz068)

  # panel of k that is tested
  # integer in [2,16] (8~10 recommended, >12 often produces too long computations)
  k: [7, 8, 9]

  # panel of omega that is tested, rappas probability threshold is Thr=(omega/#states)^k
  # number in ]0,#states] with #states=4 for nucleotides and 20 for amino acids
  # For DNA, values in [1,2] recommended. For amino acids, values in [5,15] recommended.
  omega: [1.5]

  # reduction setup, e.g. gap/non-gap ratio above which a site of the input
  # alignment is ignored during phylo-kmer computations
  # float in ]0,1], a value close to 1.00 is recommended
  # (with 1.00, only gap-only columns are filtered).
  reduction: [0.99]

  # external software used to compute ancestral states probabilities (ancestral reconstruction)
  # all software should compute approximately the same values (as same model and model parameters are called)
  # putting more than one software in the list is useful only when 'ressources' consumption is explored
  #
  # following values can be currently set (software supported by RAPPAS):
  #  - PAML
  #  - PHYML
  #  - RAXMLNG
  #
  # overall, the following patterns are expected (oct 2019):
  #  speed: paml < phyml < raxml-ng
  #  ram  : paml < phyml < raxml-ng
  #
  # if you do not care about testing the behaviour of these external dependencies,
  # set arsoft to 'RAXMLNG' and arthreads with 2 to 8 CPUs for faster computations
  # currently, only raxml-ng can use multiple threads
  #
  # !!! warning, be sure to set arsoft VALUES as UPPER CASE !!!
  # arsoft: [PHYML,RAXMLNG]
  arsoft: [PHYML]
  arthreads: 2

  # maximum amount of memory available to the rappas process
  # this has no influence on placement accuracy but it will impact "resource" evaluation
  # in particular, testing very large trees will be faster with more memory (due to JVM garbage collector behaviour)
  # set as an integer value, which represents the maximum amount of Gb allocatable to the JVM (memory: 8 => 8 Gb of RAM)
  memory: 8
### RAPPAS2
###############################
config_rappas2:
  # Parameter panels for rappas2. The first five keys share their names with the
  # keys of config_rappas, so they must stay nested here to avoid overriding them.
  k: [7, 8, 9]
  omega: [1.5]
  reduction: [0.99]
  arsoft: [PHYML]
  arthreads: 1
  # filter: "NO-FILTER"
  filter: ["ENTROPY", "RANDOM"]
  mu: [0.0625, 0.125, 0.25, 0.5, 1.0]
  # mu: [0.1]
  # mu: [1.0]
  # f: [0.01, 0.25, 0.5, 0.75, 0.99]
  # model: ["BERNOULLI"]
  # NOTE(review): f is a scalar here while the commented-out alternative above is
  # a list - confirm which form the workflow expects.
  f: 1.0
  # model: ["DEFAULT", "MULTINOMIAL"]
  model: ["DEFAULT"]
### APPLES
###############################
config_apples:
  # apples placements are based on distance computations between the query and the reference tree
  # it allows different "methods" to compute these distances and different "criteria" to select the best placement.
  # (Balaban et al, 2019 ; doi: 10.1093/sysbio/syz063)

  # List of weighted least squares methods to test.
  # Possible values are:
  #   OLS: k=0 ordinary least square (Cavalli-Sforza and Edwards 1967)
  #   FM : k=2 (Fitch and Margoliash, 1967)
  #   BE : k=1 (Beyer et al., 1974)
  # methods: ["OLS","FM","BE"]
  # !warning, be sure to set methods VALUES as UPPER CASE
  methods: [OLS, BE]

  # List of placement criteria to test.
  # Possible values are:
  #   MLSE: Least Squares Phylogenetic Placement
  #   ME : Minimum Evolution
  #   HYBRID : MLSE then ME
  # criteria: ["MLSE","ME","HYBRID"]
  # !warning, be sure to set criteria VALUES as UPPER CASE
  criteria: [MLSE, ME]
### APP-SPAM
###############################
config_appspam:
  # appspam calculates phylogenetic distances between all query and reference sequences based on
  # filtered spaced word matches. The placement position is determined with different heuristics (mode).
  # (Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)

  # List of placement heuristics to test.
  # Possible values are:
  #   MINDIST : Above reference with smallest phylogenetic distance.
  #   SPAMCOUNT : Above reference with most filtered spaced word matches.
  #   LCADIST : LCA of two leaves with smallest phylogenetic distances.
  #   LCACOUNT : LCA of two leaves with most filtered spaced word matches.
  #   APPLES : Our calculated distances are used as input matrix for APPLES.
  mode: [LCACOUNT]

  # List of weights for the pattern to be tested (number of match positions).
  # Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
  w: [8, 12]

  # Number of patterns from which spaced words are generated.
  # At the moment 1 is heavily recommended.
  pattern: [1]
########################################################################################################################
# OPTIONS COMMON TO ALL SOFTWARE
########################################################################################################################
### jplace output formatting
#maximum number of placements kept per query (minimum is 1)
#equivalent to options :
# --epa-keep-placements (EPA)
# --keep-at-most (PPLACER)
# --filter-max (EPANG)
# --keep-at-most (RAPPAS)
maxplacements: 7
#minimum likelihood weight ratio below which placements are not output
#equivalent to options :
# --epa-prob-thresholds (EPA)
# --keep-factor (PPLACER)
# --filter-min-lwr (EPANG)
# --keep-factor (RAPPAS)
minlwr: 0.01
########################################################################################################################
# EVOLUTIONARY MODEL
########################################################################################################################
# By default, the workflow re-optimises all pruned trees using the model defined below.
# Updated model parameters are then loaded and transferred to placement software using ML approaches.
# Currently, you can choose one of the following models :
# - GTR+G (nucleotides)
# - JTT+G (amino acids)
# - WAG+G (amino acids)
# - LG+G (amino acids)
# Warning: in the present configuration file, be sure you set the "states:" field accordingly.
phylo_params:
  model: "GTR+G"
  categories: 4
# NOTE(review): nesting restored on the assumption that `optimization` is a
# child of `lac` - confirm against the workflow that reads this key.
# The value stays quoted so that it is read as the string "OFF", not a boolean.
lac:
  optimization: "OFF"
########################################################################################################################
# DEBUG OPTIONS
########################################################################################################################
# if 1, prints some debug lines
debug: 0
# path to the PEWO java archive (compiled at installation)
pewo_jar: scripts/java/PEWO_java/dist/PEWO.jar