# config.yaml

########################################################################################################################
# WORKFLOW CONFIGURATION
########################################################################################################################

# INPUT DATASET
##############################
#alignment in fasta format and tree in newick format


# PAC

# D140 (amino acids)
#dataset_align: /home/nikolai/dev/rappas/data/test/D140/140.phy.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D140/RAxML_bipartitions.140.BEST.WITH

# D155 (full virus genome, 9k positions, 2% gap rate)
#dataset_align: /home/nikolai/dev/rappas/data/test/D155/reference.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D155/tree.newick

# D218
#dataset_align: /home/nikolai/dev/rappas/data/test/D218/reference.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D218/tree.newick

# D500
#dataset_align: /home/nikolai/dev/rappas/data/DATASETS/EPA_datasets/500.phy.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/DATASETS/EPA_datasets/RAxML_result.optimization500

# D652
#dataset_align: /home/nikolai/dev/rappas/data/test/D652/bv_refs_aln_stripped_99.5.fasta
#dataset_tree: /home/nikolai/dev/rappas/data/test/D652/RAxML_result.bv_refs_aln

# LAC
# active input dataset: reference alignment and reference tree (see the format note at the top of this section)
# NOTE(review): unlike the commented-out datasets above, dataset_align does not end with a fasta file name,
# while dataset_tree is the D652 tree -- confirm that this path resolves to the intended fasta alignment
dataset_align: /home/nikolai/dev/test/D652
dataset_tree: /home/nikolai/dev/rappas/data/test/D652/RAxML_result.bv_refs_aln


#working directory
#workdir: /home/nikolai/dev/pewo/D140
#workdir: /home/nikolai/dev/pewo/D218
#workdir: /home/nikolai/dev/pewo/D500
#workdir: /home/nikolai/dev/pewo/D652
#workdir: /home/nikolai/dev/pewo/bernoulli_500
#workdir: /home/nikolai/dev/pewo/bernoulli_652
#workdir: /home/nikolai/dev/pewo/bernoulli_155
#workdir: /home/nikolai/dev/pewo/debug_filters/D652/


#states used in analysis, either '0' for nucleotides or '1' for amino acids
#keep this value in agreement with the model set in "phylo_params" (see the warning in the EVOLUTIONARY MODEL section)
states: 0

#which software to test, at least one of : epa, epang, pplacer, rappas, rappas2, apples, appspam
#each software has its own "config_<software>" section in PER SOFTWARE CONFIGURATION below
#test_soft: [epa, rappas, rappas2]
test_soft: [epa, rappas2]

# READ GENERATION
# Read lengths to generate (list, one entry per length)
read_length: [300]
# Number of random prunings to compute
pruning_count: 25


## IF "ACCURACY" IS EVALUATED
#############################
### this section matters only when you run the "eval_accuracy.smk" workflow


## IF "RESOURCES" ARE EVALUATED
##############################
### this section matters only when you run the "eval_ressources.smk" workflow

# number of identical runs to launch when evaluating RAM/CPU consumption
# final measurements are reported as the mean of the runs.
repeats: 3
#defines queries source, one of the following:
# user: query sequences are loaded from the file given by parameter "query_user"
# simulate: queries are simulated from the input alignment (reserved for future upgrades, currently not implemented)
query_type: user

# queries used in resource evaluation, >10000 sequences recommended
# (read when query_type is "user")
query_user: examples/6_placement_likelihood/EMP_92_studies_100.fas


########################################################################################################################
# PER SOFTWARE CONFIGURATION
########################################################################################################################

# The following section allows you to set the parameter combinations that will be tested by the workflow
# For each software/parameter, set a list of values.

### EPA
###############################

config_epa:

  #EPA is alignment-based and uses a ML evaluation of the placement.
  #it uses a 2-step heuristic:
  # 1) rapid ML evaluation after insertion in the midpoint of each branch
  # 2) full optimization for the top scoring branches selected at step 1.
  #(Berger et al, 2011 ; doi: 10.1093/sysbio/syr010)

  #proportion of top scoring branches for which full optimization is computed
  #list of values to test, each a float in ]0,1]
  G: [0.01]

### PPLACER
###############################

config_pplacer:

  #PPLACER is alignment-based and uses a ML evaluation of the placement.
  #it uses a 2-step heuristic similar to EPA but called the "baseball" heuristic
  #(Matsen et al, 2012 ; doi: 10.1186/1471-2105-11-538)

  #lists of values to test for each parameter
  #NOTE(review): presumably these map to the pplacer options of the same names
  #that tune the "baseball" heuristic -- confirm against the workflow rules
  max-strikes: [6,12]
  strike-box: [3,6]
  max-pitches: [40,80]

  #pre-masking, 1=yes, 0=no
  premask: 1

### EPA-ng
###############################

config_epang:

  #EPA-NG is alignment-based and uses a ML evaluation of the placement.
  #different heuristics can be tested:
  # h1: program default, heuristic developed for EPA-ng, fastest heuristic
  # h2: heuristic equivalent to old EPA, slow
  # h3: heuristic equivalent to pplacer defaults, fast
  # h4: no heuristic, very very slow but should produce the best accuracy
  #(Barbera et al, 2019 ; doi: 10.1093/sysbio/syy054)
  #list of heuristics to test, using the names h1 to h4 described above
  heuristics: ["h1"]

  #heuristic-specific parameters can be set up in the following lines
  #NOTE(review): presumably only the sub-sections of heuristics listed in "heuristics" are read -- confirm
  h1:
    g: [0.999,0.99999]
  h2:
    G: [0.01,0.1]
  #for h3 and h4, "none" is a plain string for YAML (it is not the null value)
  h3:
    options: none    #reserved if any option appears in future versions
  h4:
    options: none    #reserved if any option appears in future versions

  #pre-masking, 1=yes, 0=no
  premask: 1

### RAPPAS
###############################

config_rappas:

  # RAPPAS uses an alignment-free approach which is completely different from the alignment-based approaches
  # of EPA, EPA-ng and PPlacer. It does not use a "heuristic" per se to accelerate placements,
  # but a 2-step approach (DB build, then placements) based on the phylo-kmer idea.
  #(Linard et al, 2019 ; doi: 10.1093/bioinformatics/btz068)

  #panel of k that is tested
  #integer in [2,16] (8~10 recommended, >12 often produces too long computations)
  k: [7,8,9]

  #panel of omega that is tested, rappas probability threshold is Thr=(omega/#states)^k
  #number in ]0,#states] with #states=4 for nucleotides and 20 for amino acids (non-integer values such as 1.5 are accepted)
  #For DNA, values in [1,2] recommended. For amino acids, values in [5,15] recommended.
  omega: [1.5]

  #reduction setup, i.e. gap/non-gap ratio
  #above which a site of the input alignment
  #is ignored during phylo-kmer computations
  #float in ]0,1], a value close to 1.00 is recommended (with 1.00, only gap-only columns are filtered).
  reduction: [0.99]

  #external software used to compute ancestral states probabilities (ancestral reconstruction)
  #all software should compute approximately the same values (as same model and model parameters are called)
  #putting more than one software in the list is useful only when 'ressources' consumption is explored
  #
  #following values can be currently set (software supported by RAPPAS):
  # - PAML
  # - PHYML
  # - RAXMLNG
  #
  #overall, the following patterns are expected (oct 2019):
  # speed: paml < phyml < raxml-ng
  # ram  : paml < phyml < raxml-ng
  #
  #if you do not care about testing the behaviour of these external dependencies,
  #set arsoft to 'RAXMLNG' and arthreads with 2 to 8 CPUs for faster computations
  #currently, only raxml-ng can use multiple threads
  #
  #!!! warning, be sure to set arsoft VALUES as UPPER CASE !!!
  #arsoft: [PHYML,RAXMLNG]
  arsoft: [PHYML]
  #NOTE(review): arsoft is PHYML here, so according to the note above arthreads presumably has no effect -- confirm
  arthreads: 2

  #maximum amount of memory available to rappas process
  #this has no influence on placement accuracy but it will impact "resource" evaluation
  #in particular, testing very large trees will be faster with more memory (due to JVM garbage collector behaviour)
  #set as an integer value, which represents the maximum amount of Gb allocatable to the JVM (memory: 8 => 8 Gb of RAM)

  memory: 8

### RAPPAS2
###############################

config_rappas2:
  #k, omega, reduction, arsoft and arthreads reuse the key names documented in "config_rappas" above
  #NOTE(review): presumably they carry the same meaning for rappas2 -- confirm
  k: [7,8,9]
  omega: [1.5]
  reduction: [0.99]
  arsoft: [PHYML]
  arthreads: 1

  #list of filters to test; values seen in this file: "NO-FILTER", "ENTROPY", "RANDOM"
  #filter: "NO-FILTER"
  filter: ["ENTROPY", "RANDOM"]

  #panel of mu values that is tested
  #NOTE(review): the meaning of mu is not documented in this file -- add a description
  mu: [0.0625, 0.125, 0.25, 0.5, 1.0]
  #mu: [0.1]
  #mu: [1.0]

  #alternative setup kept for reference: panel of f values with the "BERNOULLI" model
  #f: [0.01, 0.25, 0.5, 0.75, 0.99]
  #model: ["BERNOULLI"]

  #NOTE(review): f is a single value here while the commented alternative above is a list -- confirm
  #that the workflow accepts both forms
  f: 1.0
  #list of models to test; values seen in this file: "DEFAULT", "MULTINOMIAL", "BERNOULLI"
  #model: ["DEFAULT", "MULTINOMIAL"]
  model: ["DEFAULT"]

### APPLES
###############################

config_apples:

  #apples placements are based on distance computations between the query and the reference tree
  #it allows different "methods" to compute these distances and different "criteria" to select the best placement.
  #(Balaban et al, 2019 ; doi: 10.1093/sysbio/syz063)

  #List of weighted least squares methods to test.
  #Possible values are:
  # OLS: k=0 ordinary least square (Cavalli-Sforza and Edwards 1967)
  # FM : k=2 (Fitch and Margoliash, 1967)
  # BE : k=1 (Beyer et al., 1974)
  #methods: ["OLS","FM","BE"]
  #!warning, be sure to set methods VALUES as UPPER CASE
  methods: [OLS,BE]

  #List of placement criteria to test.
  #Possible values are:
  # MLSE: Least Squares Phylogenetic Placement
  # ME : Minimum Evolution
  # HYBRID : MLSE then ME
  #criteria: ["MLSE","ME","HYBRID"]
  #!warning, be sure to set criteria VALUES as UPPER CASE
  criteria: [MLSE,ME]

### APP-SPAM
###############################

config_appspam:

  #appspam calculates phylogenetic distances between all query and reference sequences based on
  #filtered spaced word matches. The placement position is determined with different heuristics (mode).
  #(Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)

  #List of placement heuristics to test.
  #Possible values are:
  # MINDIST   : Above reference with smallest phylogenetic distance.
  # SPAMCOUNT : Above reference with most filtered spaced word matches.
  # LCADIST   : LCA of two leaves with smallest phylogenetic distances.
  # LCACOUNT  : LCA of two leaves with most filtered spaced word matches.
  # APPLES    : Our calculated distances are used as input matrix for APPLES.
  mode: [LCACOUNT]

  #List of weights for the pattern to be tested (number of match positions).
  #Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
  w: [8, 12]

  #Number of patterns from which spaced words are generated.
  #At the moment 1 is heavily recommended.
  pattern: [1]


########################################################################################################################
# OPTIONS COMMON TO ALL SOFTWARE
########################################################################################################################

### jplace output formatting

#maximum number of placements kept per query (minimum is 1)
#equivalent to options :
# --epa-keep-placements (EPA)
# --keep-at-most (PPLACER)
# --filter-max (EPANG)
# --keep-at-most (RAPPAS)
maxplacements: 7

#minimum likelihood weight ratio below which placements are not output
#equivalent to options :
# --epa-prob-thresholds (EPA)
# --keep-factor (PPLACER)
# --filter-min-lwr (EPANG)
# --keep-factor (RAPPAS)
minlwr: 0.01


########################################################################################################################
# EVOLUTIONARY MODEL
########################################################################################################################

# By default, the workflow re-optimises all pruned trees using the model defined below.
# Updated model parameters are then loaded and transferred to placement software using ML approaches.
# Currently, you can choose one of the following models :
#   - GTR+G (nucleotides)
#   - JTT+G (amino acids)
#   - WAG+G (amino acids)
#   - LG+G  (amino acids)
# Warning: in the present configuration file, be sure you set the "states:" field accordingly.
# model used to re-optimise the pruned trees; one of the models listed above
phylo_params:
  model: "GTR+G"
  # NOTE(review): presumably the number of rate categories of the "+G" part of the model -- confirm
  categories: 4

# NOTE(review): the purpose of the "lac" section and the accepted values of "optimization"
# are not documented in this file -- add a description
# "OFF" is kept quoted so that it stays a string (unquoted, YAML 1.1 parsers read OFF as boolean false)
lac:
  optimization: "OFF"


########################################################################################################################
# DEBUG OPTIONS
########################################################################################################################

#if 1, prints some debug lines (0 keeps them off)
debug: 0
#path to the jar of the PEWO java scripts (compiled at installation)
#NOTE(review): relative path, presumably resolved from the directory the workflow is launched in -- confirm
pewo_jar: scripts/java/PEWO_java/dist/PEWO.jar