From 8ce5ad6700e655c08673aa627460824a84766353 Mon Sep 17 00:00:00 2001 From: Cedric Chauve Date: Wed, 26 Apr 2023 19:09:24 -0700 Subject: [PATCH 1/6] ecceTERA rformating --- scripts/DeCoSTAR_ecceTERA_reformat.py | 136 ++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 scripts/DeCoSTAR_ecceTERA_reformat.py diff --git a/scripts/DeCoSTAR_ecceTERA_reformat.py b/scripts/DeCoSTAR_ecceTERA_reformat.py new file mode 100644 index 0000000..0369e8f --- /dev/null +++ b/scripts/DeCoSTAR_ecceTERA_reformat.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +""" Reformat reconciliations computed by ecceTERA within DeCoSTAR """ + +__author__ = "Cedric Chauve" +__email__ = "cedric.chauve@sfu.ca" +__version__ = "1.0" +__status__ = "Released" + +import sys +import os +import xml.etree.ElementTree as ET + + +from recPhyloXML_utils import xml_rename_species, + +''' +Reads DeCoSTAR original and reformatted genes +Creates dict(ecceTERA species -> data species) +Creates dict(ecceTERA family -> data family +''' +def eccetera_read_results(in_genes_file1, in_genes_file2, sep='|'): + ''' + input: + - original and reformatted genes file + output: + - dict(ecceTERA species -> data species) + - dict(ecceTERA family -> data family) + ''' + species_dict,families_dict = {},{} + # Reading original genes + data,data_idx = {},0 + with open(in_genes_file1, 'r') as in_genes: + for gene_data in in_genes.readlines(): + gene_split = gene_data.rstrip().split() + species_id,gene = gene_split[0:2] + if len(gene_split)>2: + family_id = gene.split(sep)[0] + data[data_idx] = (species_id,family_id) + else: + data[data_idx] = (species_id,family_id) + data_idx += 1 + # Reading reformated genes + data_idx = 0 + with open(in_genes_file2, 'r') as in_genes: + for gene_data in in_genes.readlines(): + gene_split = gene_data.rstrip().split() + species_id,gene = gene_split[0:2] + family_id,gene_id = gene.split(sep,1)[0:2] + (sp,f) = data[data_idx] + species_dict[sp] = species_id + 
species_dict[species_id] = species_id + families_dict[f] = family_id + data_idx += 1 + return species_dict,families_dict + +''' Read the recPhyloXML species tree created by DeCoSTAR ''' +def eccetera_read_species_tree(in_species_tree_file): + ''' + input: recPhyloXML species tree of DeCoSTAR + output: XML string for encoding the species tree + ''' + species_tree_str = '' + append_lines = False + with open(in_species_tree_file, 'r') as in_sp_tree: + for xml_line in in_sp_tree.readlines(): + if xml_line.lstrip().startswith(' path to corresponding temporary recPhyloXML file + ''' + family_idx = 0 + out_reconciliations_files = {} + with open(in_reconciliations_file, 'r') as in_reconciliations: + for xml_line in in_reconciliations.readlines(): + if xml_line.lstrip().startswith(' 0: + reconciliation_str += f'\n{xml_line.rstrip()}' + return out_reconciliations_files + +def main(): + in_genes_file1 = sys.argv[1] # DeCoSTAR genes file + in_genes_file2 = sys.argv[2] # Reformatted genes file + in_sp_xml_file = sys.argv[3] # Path to DeCoSTAR species tree XML file + in_reconciliations_file = sys.argv[4] # Path to DeCoSTAR reconciliations file + out_dir = sys.argv[5] # Directory where to write reconciliations files + out_xml_ext = sys.argv[6] # Extension to recPhyloXML files + out_reconciliations_file = sys.argv[7] # Data set reconciliations file + + species_map,families_map = eccetera_read_results(in_genes_file1, in_genes_file2) + xml_sp_str = eccetera_read_species_tree(in_sp_xml_file) + tmp_reconciliations_files = eccetera_read_reconciliations( + in_reconciliations_file, xml_sp_str, families_map, out_dir + ) + + with open(out_reconciliations_file, 'w') as out_file: + for family_idx,(family_id,tmp_rec_file) in tmp_reconciliations_files.items(): + tree = ET.parse(tmp_rec_file) + xml_rename_species(tree, species_map) + out_reconciliation_file = os.path.join(out_dir, f'{family_id}{out_xml_ext}') + tree.write(out_reconciliation_file) + 
out_file.write(f'{family_id}\t{out_reconciliation_file}\n') + +if __name__ == "__main__": + main() + From ff3879660cd3b5144a40523d4f5bafcb809f7a09 Mon Sep 17 00:00:00 2001 From: Cedric Chauve Date: Thu, 27 Apr 2023 10:16:04 -0700 Subject: [PATCH 2/6] Creating intermediate files --- parameters/ecceTERA_DeCoSTAR.yaml | 126 ++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 parameters/ecceTERA_DeCoSTAR.yaml diff --git a/parameters/ecceTERA_DeCoSTAR.yaml b/parameters/ecceTERA_DeCoSTAR.yaml new file mode 100644 index 0000000..0c52a24 --- /dev/null +++ b/parameters/ecceTERA_DeCoSTAR.yaml @@ -0,0 +1,126 @@ + # DO NOT EDIT + # DeCoSTAR: computing ancestral adjacencies + ecceTERA_DeCoSTAR: + name: &decostar_name 'ecceTERA_DeCoSTAR' + input: + dirs: + - &decostar_input_dir !join [*dir_aux, *decostar_name] + files: + - &decostar_input_file_trees !join [*decostar_input_dir, 'gene_trees.txt'] + - &decostar_input_file_adjacencies !join [*decostar_input_dir, 'adjacencies.txt'] + script: + - !join [*run_dir_scripts, 'DeCoSTAR_create_input_files.py'] + - !ref [*data_gene_orders_path] + - !ref [*decostar_input_path] + - !ref [*data_families_path] + - !ref [*decostar_input_file_adjacencies] + - !ref [*decostar_input_file_trees] + output: + file: &decostar_output_file !ref [*data_adjacencies_path] + slurm: + options: !ref [*decostar_slurm_options] + modules: !ref [*decostar_modules] + array: + results: + file: &decostar_slurm_array_results_file !ref [*data_species_path] + field: &decostar_slurm_array_results_field 1 + var: &decostar_slurm_array_results_var 'SPECIES' + name: &decostar_slurm_array_results_name !concat ['${', *decostar_slurm_array_results_var, '}'] + ext: &decostar_slurm_array_results_ext !ref [*data_adjacencies_ext] + results: + dirs: + - &decostar_results_dir !join [*dir_results, *decostar_name] + names: + - &decostar_slurm_results_species_adjacencies_name !concat [*decostar_slurm_array_results_name, *decostar_slurm_array_results_ext] + 
files: + - &decostar_slurm_results_species_adjacencies !join [*decostar_results_dir, *decostar_slurm_results_species_adjacencies_name] + other: + - &decostar_slurm_results_adjacencies_file !join [*decostar_results_dir, 'adjacencies.txt'] + - &decostar_slurm_results_genes_file_1 !join [*decostar_results_dir, 'genes.txt'] + - &decostar_slurm_results_genes_file_2 !join [*decostar_results_dir, 'genes_reformatted.txt'] + - &decostar_slurm_results_species_file !join [*decostar_results_dir, 'species.txt'] + - &decostar_slurm_results_species_tree_file !join [*decostar_results_dir, 'speciesTree.phyloxml'] + - &decostar_slurm_results_reconciliations_file !join [*decostar_results_dir, 'reconciliations.xml'] + cmd: + - !concat [*decostar_exec, ' \'] + # Input files + - !concat [' species.file=', *data_species_tree_path, ' \'] + - !concat [' adjacencies.file=', *decostar_input_file_adjacencies, ' \'] + - !concat [' gene.distribution.file=', *decostar_input_file_trees, ' \'] + # Output + - !concat [' output.dir=', *decostar_results_dir, ' \'] + - !concat [' write.newick=', *decostar_write_newick, ' \'] + - !concat [' write.adjacency.trees=', *decostar_write_adjacency_trees, ' \'] + # Model + - !concat [' already.reconciled=', *decostar_already_reconciled, ' \'] + - !concat [' rooted=', *decostar_rooted, ' \'] + - !concat [' nb.sample=', *decostar_nb_sample, ' \'] + - !concat [' dupli.cost=', *decostar_dupli_cost, ' \'] + - !concat [' loss.cost=', *decostar_loss_cost, ' \'] + - !concat [' AGain.cost=', *decostar_again_cost, ' \'] + - !concat [' ABreak.cost=', *decostar_abreak_cost, ' \'] + - !concat [' Loss.aware=', *decostar_loss_aware, ' \'] + - !concat [' Loss.iteration=', *decostar_loss_iteration, ' \'] + - !concat [' C1.Advantage=', *decostar_c1_advantage, ' \'] + - !concat [' all.pair.equivalence.class=', *decostar_all_pairs, ' \'] + - !concat [' always.AGain=', *decostar_always_again, ' \'] + - !concat [' absence.penalty=', *decostar_absence_penalty, ' \'] + - !concat [' 
all.pair.equivalence.class=', *decostar_all_pairs, ' \'] + - !concat [' boltzmann.temperature=', *decostar_boltzmann_temperature, ' \'] + - ' write.adjacencies=true \' + - ' write.genes=true \' + - ' use.boltzmann=true \' + - ' char.sep="|" \' + - ' with.transfer=false \' + - ' verbose=2' + - ' ' + - !concat [!join [*run_dir_scripts, 'DeCoSTAR_reformat.py'], ' \'] + - !concat [' ', *data_species_path, ' \'] + - !concat [' ', *decostar_slurm_results_species_file, ' \'] + - !concat [' ', *decostar_already_reconciled, ' \'] + - !concat [' ', *data_families_path, ' \'] + - !concat [' ', *decostar_input_path, ' \'] + - !concat [' ', *decostar_input_file_trees, ' \'] + - !concat [' ', *decostar_slurm_results_genes_file_1, ' \'] + - !concat [' ', *decostar_slurm_results_adjacencies_file, ' \'] + - !concat [' ', *decostar_slurm_results_genes_file_2, ' \'] + - !concat [' ', *decostar_results_dir] + - ' ' + - !concat [!join [*run_dir_scripts, 'DeCoSTAR_ecceTERA_reformat.py'], ' \'] + - !concat [' ', *decostar_slurm_results_genes_file_1, ' \'] + - !concat [' ', *decostar_slurm_results_genes_file_2, ' \'] + - !concat [' ', *decostar_slurm_results_species_tree_file, ' \'] + - !concat [' ', *decostar_slurm_results_reconciliations_file, ' \'] + - !concat [' ', *decostar_results_dir, ' \'] + - !concat [' ', *data_reconciliations_ext, ' \'] + - !concat [' ', *data_reconciliations_path] + stats: + names: + - &decostar_stats_file_adj_species_name !concat [*decostar_name, '_adjacencies_species', *log_ext_stat] + - &decostar_stats_file_adj_components_name !concat [*decostar_name, '_adjacencies_components', *log_ext_stat] + - &decostar_stats_file_rec_species_name !concat [*decostar_name, '_reconciliations_species', *log_ext_stat] + - &decostar_stats_file_rec_families_name !concat [*decostar_name, '_reconciliations_families', *log_ext_stat] + files: + - &decostar_stats_file_adj_species !join [*dir_stats, *decostar_name, *decostar_stats_file_adj_species_name] + - 
&decostar_stats_file_adj_components !join [*dir_stats, *decostar_name, *decostar_stats_file_adj_components_name] + - &decostar_stats_file_rec_species !join [*dir_stats, *decostar_name, *decostar_stats_file_rec_species_name] + - &decostar_stats_file_rec_families !join [*dir_stats, *decostar_name, *decostar_stats_file_rec_families_name] + script: + - !join [*run_dir_scripts, 'recPhyloXML_statistics.py'] + - !ref [*data_reconciliations_path] + - !ref [*decostar_stats_file_rec_species] + - !ref [*decostar_stats_file_rec_families] + - !concat [';'] + - !join [*run_dir_scripts, 'DeCoSTAR_statistics.py'] + - !ref [*data_species_path] + - !ref [*decostar_slurm_results_genes_file_2] + - !ref [*data_adjacencies_path] + - !ref [*decostar_stats_thresholds] + - !ref [*decostar_stats_file_adj_species] + - !concat [';'] + - !join [*run_dir_scripts, 'gene_orders_utils.py'] + - 'stats' + - !ref [*decostar_slurm_results_genes_file_2] + - !ref [*data_adjacencies_path] + - !ref [*decostar_results_dir] + - !ref [*decostar_stats_file_adj_components] From 4e59064be41722976842ffea440adfca78cad6f2 Mon Sep 17 00:00:00 2001 From: Cedric Chauve Date: Thu, 27 Apr 2023 10:16:31 -0700 Subject: [PATCH 3/6] Creating intermediate files --- parameters/GeneRax.yaml | 3 +- scripts/DeCoSTAR_ecceTERA_reformat.py | 27 ++++++++----- scripts/GeneRax_reformat.py | 57 ++++++++++++++++++--------- 3 files changed, 57 insertions(+), 30 deletions(-) mode change 100644 => 100755 scripts/DeCoSTAR_ecceTERA_reformat.py diff --git a/parameters/GeneRax.yaml b/parameters/GeneRax.yaml index 785ae2a..73a022f 100644 --- a/parameters/GeneRax.yaml +++ b/parameters/GeneRax.yaml @@ -44,7 +44,8 @@ - !concat [!join [*run_dir_scripts, 'GeneRax_reformat.py'], ' \'] - !concat [' ', *generax_input_file, ' \'] - !concat [' ', *generax_results_dir, ' \'] - - !concat [' ', *data_reconciliations_ext] + - !concat [' ', *data_reconciliations_ext, ' \'] + - !concat [' ', *data_gene_trees_path] stats: names: - 
&generax_stats_file_species_name !concat [*generax_name, '_species', *log_ext_stat] diff --git a/scripts/DeCoSTAR_ecceTERA_reformat.py b/scripts/DeCoSTAR_ecceTERA_reformat.py old mode 100644 new mode 100755 index 0369e8f..0568aac --- a/scripts/DeCoSTAR_ecceTERA_reformat.py +++ b/scripts/DeCoSTAR_ecceTERA_reformat.py @@ -13,7 +13,7 @@ import xml.etree.ElementTree as ET -from recPhyloXML_utils import xml_rename_species, +from recPhyloXML_utils import xml_rename_species ''' Reads DeCoSTAR original and reformatted genes @@ -108,6 +108,18 @@ def eccetera_read_reconciliations(in_reconciliations_file, in_sp_xml_str, famili reconciliation_str += f'\n{xml_line.rstrip()}' return out_reconciliations_files +''' Writing reconciliations files ''' +def eccetera_write_reconciliations(in_reconciliations_files, species_map, out_dir, out_xml_ext, out_reconciliations_file): + with open(out_reconciliations_file, 'w') as out_file: + for family_idx,(family_id,tmp_rec_file) in in_reconciliations_files.items(): + tree = ET.parse(tmp_rec_file) + xml_rename_species(tree, species_map) + out_reconciliation_file = os.path.join(out_dir, f'{family_id}{out_xml_ext}') + tree.write(out_reconciliation_file) + out_file.write(f'{family_id}\t{out_reconciliation_file}\n') + + + def main(): in_genes_file1 = sys.argv[1] # DeCoSTAR genes file in_genes_file2 = sys.argv[2] # Reformatted genes file @@ -116,20 +128,15 @@ def main(): out_dir = sys.argv[5] # Directory where to write reconciliations files out_xml_ext = sys.argv[6] # Extension to recPhyloXML files out_reconciliations_file = sys.argv[7] # Data set reconciliations file - + species_map,families_map = eccetera_read_results(in_genes_file1, in_genes_file2) xml_sp_str = eccetera_read_species_tree(in_sp_xml_file) tmp_reconciliations_files = eccetera_read_reconciliations( in_reconciliations_file, xml_sp_str, families_map, out_dir ) - - with open(out_reconciliations_file, 'w') as out_file: - for family_idx,(family_id,tmp_rec_file) in 
tmp_reconciliations_files.items(): - tree = ET.parse(tmp_rec_file) - xml_rename_species(tree, species_map) - out_reconciliation_file = os.path.join(out_dir, f'{family_id}{out_xml_ext}') - tree.write(out_reconciliation_file) - out_file.write(f'{family_id}\t{out_reconciliation_file}\n') + eccetera_write_reconciliations( + tmp_reconciliations_files, species_map, out_dir, out_xml_ext, out_reconciliations_file + ) if __name__ == "__main__": main() diff --git a/scripts/GeneRax_reformat.py b/scripts/GeneRax_reformat.py index 8eeb02f..08bb320 100755 --- a/scripts/GeneRax_reformat.py +++ b/scripts/GeneRax_reformat.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # coding: utf-8 -""" Reformat GeneRax recPhyloXML files to be read by DeCoSTAR """ +""" Reformat GeneRax recPhyloXML files to be read by DeCoSTAR and creates a gene trees file """ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" @@ -13,32 +13,51 @@ from recPhyloXML_utils import xml_reformat_file -def main(): - in_GeneRax_families_file = sys.argv[1] - results_dir = sys.argv[2] - suffix = sys.argv[3] - - with open(in_GeneRax_families_file, 'r') as families: - current_gene_id = 0 # current gene number +def get_families_from_input(in_GeneRax_families_file): + with open(in_GeneRax_families_file, 'r') as in_file: families = [ f.rstrip()[2:] - for f in families.readlines() + for f in in_file.readlines() if f.startswith('- ') ] + return families + +def reformat_reconciliations(families, results_dir, suffix): + current_gene_id = 0 # current gene number + for fam_id in families: + in_file = os.path.join( + results_dir, 'reconciliations', + f'{fam_id}_reconciliated.xml' + ) + out_file = os.path.join( + results_dir, 'reconciliations', + f'{fam_id}{suffix}' + ) + if os.path.isfile(in_file): + current_gene_id = xml_reformat_file( + in_file, out_file, + start_id=current_gene_id + ) + +def create_gene_trees_file(families, results_dir, out_gene_trees_file): + with open(out_gene_trees_file, 'w') as out_file: for fam_id in 
families: in_file = os.path.join( - results_dir, 'reconciliations', - f'{fam_id}_reconciliated.xml' - ) - out_file = os.path.join( - results_dir, 'reconciliations', - f'{fam_id}{suffix}' + results_dir, 'results', fam_id, + 'geneTree.newick' ) if os.path.isfile(in_file): - current_gene_id = xml_reformat_file( - in_file, out_file, - start_id=current_gene_id - ) + out_file.write(f'{fam_id}\t{in_file}\n') + +def main(): + in_GeneRax_families_file = sys.argv[1] + results_dir = sys.argv[2] + rec_ext = sys.argv[3] + out_gene_trees_file = sys.argv[4] + + families = get_families_from_input(in_GeneRax_families_file) + reformat_reconciliations(families, results_dir, rec_ext) + create_gene_trees_file(families, results_dir, out_gene_trees_file) if __name__ == "__main__": main() From 6fcfb11ee6769e3c3418b5affe4ff4a5aff18e95 Mon Sep 17 00:00:00 2001 From: Cedric Chauve Date: Thu, 27 Apr 2023 10:23:42 -0700 Subject: [PATCH 4/6] Creating intermediate files --- example/anopheles_X_ecceTERA_header.yaml | 3 +++ parameters/README.md | 1 + scripts/README.md | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/example/anopheles_X_ecceTERA_header.yaml b/example/anopheles_X_ecceTERA_header.yaml index c2f87e7..ff10382 100644 --- a/example/anopheles_X_ecceTERA_header.yaml +++ b/example/anopheles_X_ecceTERA_header.yaml @@ -40,6 +40,9 @@ data: gene_trees: path: &data_gene_trees_path !join [*run_dir_root, 'example', 'anopheles_X_3_ALE', 'data', 'gene_trees_X_3.txt'] # Paths to computed output files (generated data, no reconciliation) + reconciliations: + path: &data_reconciliations_path !join [*dir_data, 'reconciliations_X_3.txt'] + ext: &data_reconciliations_ext '.recphyloxml' adjacencies: path: &data_adjacencies_path !join [*dir_data, 'adjacencies_X_3.txt'] ext: &data_adjacencies_ext '_adjacencies.txt' diff --git a/parameters/README.md b/parameters/README.md index 10f37a3..0e7a8e9 100644 --- a/parameters/README.md +++ b/parameters/README.md @@ -5,6 +5,7 @@ This directory contains 
the files required to create an AGO pipeline YAML parame - `MACSE.yaml`: paramaters file block to run MACSE; **do not edit**. - `ALE.yaml`: paramaters file block to run ALE; **do not edit**. - `DeCoSTAR.yaml`: paramaters file block to run DeCoSTAR; **do not edit**. +- `ecceTERA_DeCoSTAR.yaml`: parameters file block to run ecceTERA+DeCoSTAR; **do not edit**. - `GeneRax.yaml`: paramaters file block to run GeneRax; **do not edit**. - `IQ-TREE.yaml`: paramaters file block to run IQ-TREE; **do not edit**. - `SPPDCJ.yaml`: paramaters file block to run spp_dcj; **do not edit**. diff --git a/scripts/README.md b/scripts/README.md index 6edcddc..c4c75de 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -73,6 +73,16 @@ python DeCoSTAR_reformat.py ``` +`DeCoSTAR_ecceTERA_reformat.py`: +- Reformats ecceTERA+DeCoSTAR results files to be consistent with AGO data. +- USAGE: +``` +python DeCoSTAR_ecceTERA_reformat.py +  <DeCoSTAR genes file> <reformatted genes file> +  <DeCoSTAR species tree XML file> <DeCoSTAR reconciliations file> +  <output directory> <recPhyloXML files extension> <data set reconciliations file> +``` + `DeCoSTAR_statistics.py`: - Creates CSV files about gene adjacencies generated by DeCoSTAR. 
- USAGE: `python DeCoSTAR_statistics.py ` From e4598619a613f7d31f3e7c6d24e5d0565cdaf494 Mon Sep 17 00:00:00 2001 From: Cedric Chauve Date: Thu, 27 Apr 2023 11:04:11 -0700 Subject: [PATCH 5/6] Experiments --- example/README.md | 134 +++++++++++++----------- example/anopheles_X_GeneRax.yaml | 5 +- example/anopheles_X_GeneRax_header.yaml | 2 + 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/example/README.md b/example/README.md index dde6614..be090ec 100644 --- a/example/README.md +++ b/example/README.md @@ -100,19 +100,20 @@ SUCCESS SEQUENCES Then we create the pipeline parameters file: ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml create example/anopheles_X_3_GeneRax_header.yaml parameters MACSE GeneRax DeCoSTAR SPPDCJ - example/anopheles_X_3_GeneRax.yaml +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml create example/anopheles_X_GeneRax_header.yaml parameters MACSE GeneRax DeCoSTAR SPPDCJ + example/anopheles_X_GeneRax.yaml ``` The next step was to initialize the pipeline. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml init +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml init /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/species_tree_4.newick -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/species_tree_4.newick. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/species_4.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/species_4.txt. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/families_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/families_X_3.txt. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/gene_orders_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/gene_orders_X_3.txt. 
/home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/sequences_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/sequences_X_3.txt. + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/gene_trees_X_3.txt will be computed. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/alignments_X_3.txt will be computed. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/reconciliations_X_3.txt will be computed. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/adjacencies_X_3.txt will be computed. @@ -123,10 +124,10 @@ Then we run in sequence the pipeline tools. First we run `MACSE` to compute an MSA for each family. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml slurm MACSE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml slurm MACSE (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/MACSE/MACSE.sh ... wait for the slurm processes to complete ... -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml check MACSE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml check MACSE ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/MACSE.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/alignments_X_3.txt @@ -136,15 +137,15 @@ This shows that `MACSE` succeeded to compute an MSA for all gene families. The next step is `GeneRax` to compute reconciled gene trees. 
``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml slurm GeneRax +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml slurm GeneRax /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/GeneRax/GeneRax.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/GeneRax/GeneRax.sh ... wait for the slurm processes to complete ... -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml check GeneRax +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml check GeneRax ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/GeneRax.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/reconciliations_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml stats GeneRax +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml stats GeneRax /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/GeneRax/GeneRax_species.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/GeneRax/GeneRax_families.csv (AGO-pipeline) > cat /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/GeneRax/GeneRax_species.csv @@ -171,15 +172,15 @@ either through `slurm` or a `bash` script, we run `DeCoSTAR` as a bash script, redirecting the standard output and error output into specific files. 
``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml bash DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml bash DeCoSTAR /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/DeCoSTAR/DeCoSTAR.sh (AGO-pipeline) > chmod 755 /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/DeCoSTAR/DeCoSTAR.sh (AGO-pipeline) > /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/DeCoSTAR/DeCoSTAR.sh 2> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/DeCoSTAR/DeCoSTAR.err 1> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/DeCoSTAR/DeCoSTAR.log -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml check DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml check DeCoSTAR ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/DeCoSTAR.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/adjacencies_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml stats DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml stats DeCoSTAR /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/DeCoSTAR/DeCoSTAR_species.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/DeCoSTAR/DeCoSTAR_components.csv (AGO-pipeline) > head -1 /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/DeCoSTAR/DeCoSTAR_species.csv; grep -P ':0.5\t' /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/DeCoSTAR/DeCoSTAR_species.csv | sort @@ -219,26 +220,26 @@ Finally, we clear sytenic conflicts in ancestral adjacencies using Program (MILP) and then to solve it using `gurobi`. 
``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml slurm SPPDCJ_ILP +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml slurm SPPDCJ_ILP /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/SPPDCJ_ILP/SPPDCJ_ILP.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/SPPDCJ_ILP/SPPDCJ_ILP.sh ... wait for the slurm processes to complete ... -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml check SPPDCJ_ILP +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml check SPPDCJ_ILP ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/SPPDCJ_ILP.log OUTPUT: No output file is created ``` We solve the MILP. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml slurm SPPDCJ +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml slurm SPPDCJ /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/SPPDCJ/SPPDCJ.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/aux/SPPDCJ/SPPDCJ.sh ... wait for the slurm processes to complete ... 
-(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml check SPPDCJ +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml check SPPDCJ ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/SPPDCJ.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/adjacencies_ago_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_GeneRax.yaml stats SPPDCJ +(AGO-pipeline) > python src/AGO.py example/anopheles_X_GeneRax.yaml stats SPPDCJ /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/SPPDCJ/SPPDCJ_species.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/statistics/SPPDCJ/SPPDCJ_components.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/CARs.txt @@ -308,12 +309,12 @@ SUCCESS ALIGNMENTS ``` ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml create example/anopheles_X_3_ALE_header.yaml parameters IQ-TREE ALE DeCoSTAR SPPDCJ - example/anopheles_X_3_ALE.yaml +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml create example/anopheles_X_ALE_header.yaml parameters IQ-TREE ALE DeCoSTAR SPPDCJ + example/anopheles_X_ALE.yaml ``` Next we initialize he pipeline, and we can observe that indeed the MSA files will not be recomputed. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml init +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml init /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/species_tree_4.newick -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/species_tree_4.newick. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/species_4.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/species_4.txt. 
/home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/families_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/families_X_3.txt. @@ -328,19 +329,19 @@ Next we initialize he pipeline, and we can observe that indeed the MSA files wil Next we run `IQ-TREE` through `slurm`. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml slurm IQ-TREE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml slurm IQ-TREE /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/aux/IQ-TREE/IQ-TREE.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/aux/IQ-TREE/IQ-TREE.sh ... wait for the slurm processes to complete ... -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml check IQ-TREE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml check IQ-TREE ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/log/IQ-TREE.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/gene_trees_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml slurm ALE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml slurm ALE /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/aux/ALE/ALE.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/aux/ALE/ALE.sh ... wait for the slurm processes to complete ... 
-(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml check ALE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml check ALE ERRORS: 120 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/log/ALE.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/reconciliations_X_3.txt @@ -351,7 +352,7 @@ discarded from further steps without the need to do anything, as this is handled automatically by the AGO pipeline. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml stats ALE +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml stats ALE /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/ALE/ALE_species.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/ALE/ALE_families.csv (AGO-pipeline) > cat /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/ALE/ALE_species.csv @@ -375,15 +376,15 @@ reconciled gene tree included a lateral gene transfer. Next we run `DeCoSTAR` through `slurm`. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml slurm DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml slurm DeCoSTAR /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/aux/DeCoSTAR/DeCoSTAR.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/aux/DeCoSTAR/DeCoSTAR.sh ... wait for the slurm processes to complete ... 
-(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml check DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml check DeCoSTAR ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/log/DeCoSTAR.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/adjacencies_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ALE.yaml stats DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ALE.yaml stats DeCoSTAR /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/DeCoSTAR/DeCoSTAR_species.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/DeCoSTAR/DeCoSTAR_components.csv (AGO-pipeline) > head -1 /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/DeCoSTAR/DeCoSTAR_species.csv; grep -P ':0.5\t' /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/statistics/DeCoSTAR/DeCoSTAR_species.csv | sort @@ -447,93 +448,106 @@ SUCCESS GENE TREES Next we create the pipeline parameters file and initialize the pipeline. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml create example/anopheles_X_3_ecceTERA_header.yaml parameters DeCoSTAR SPPDCJ - example/anopheles_X_3_ecceTERA.yaml -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml init +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml create example/anopheles_X_ecceTERA_header.yaml parameters ecceTERA_DeCoSTAR SPPDCJ + example/anopheles_X_ecceTERA.yaml +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml init /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/species_tree_4.newick -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/species_tree_4.newick. 
/home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/species_4.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/species_4.txt. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/families_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/families_X_3.txt. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/data/VectorBase/gene_orders_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/gene_orders_X_3.txt. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ALE/data/gene_trees_X_3.txt -> /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/gene_trees_X_3.txt. + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/reconciliations_X_3.txt will be computed. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/adjacencies_X_3.txt will be computed. /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/adjacencies_ago_X_3.txt will be computed. ``` Next, we run `DeCoSTAR` to compute reconciled gene trees and ancestral adjacencies. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml slurm DeCoSTAR - /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/DeCoSTAR/DeCoSTAR.sh -(AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/DeCoSTAR/DeCoSTAR.sh +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml slurm ecceTERA_DeCoSTAR + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR.sh +(AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR.sh ... wait for the slurm processes to complete ... 
-(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml check DeCoSTAR +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml check ecceTERA_DeCoSTAR ERRORS: 0 - LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/log/DeCoSTAR.log + LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/log/ecceTERA_DeCoSTAR.log OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/adjacencies_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml stats DeCoSTAR - /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/DeCoSTAR/DeCoSTAR_species.csv - /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/DeCoSTAR/DeCoSTAR_components.csv -(AGO-pipeline) > head -1 /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/DeCoSTAR/DeCoSTAR_species.csv; grep -P ':0.5\t' /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/DeCoSTAR/DeCoSTAR_species.csv | sort +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml stats ecceTERA_DeCoSTAR + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_adjacencies_species.csv + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_adjacencies_components.csv + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_reconciliations_species.csv + /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_reconciliations_families.csv +(AGO-pipeline) > cat 
/home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_reconciliations_species.csv +#species:genes:duplications:losses:transfers +node2:534:34:0:0 +AnophelesgambiaePEST:564:7:23:0 +AnophelesfunestusFUMOZ:579:6:7:0 +node0:550:5:32:0 +AnophelesatroparvusEBRO:577:20:50:0 +node1:525:20:1:0 +AnophelesalbimanusSTECLA:545:10:3:0 +(AGO-pipeline) > head -1 /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_adjacencies_species.csv; grep -P ':0.5\t' /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_adjacencies_species.csv | sort #species:nb_genes_in_adj:min_weight nb_genes_in_adj:nb_adjacencies:nb_ext_in_conflict:nb_free_ext AnophelesalbimanusSTECLA:545:0.5 545:544:0:2 AnophelesatroparvusEBRO:577:0.5 577:576:0:2 AnophelesfunestusFUMOZ:579:0.5 579:578:0:2 AnophelesgambiaePEST:564:0.5 564:563:0:2 -node0:580:0.5 559:524:7:119 -node1:607:0.5 534:430:8:362 -node2:538:0.5 362:245:0:586 -(AGO-pipeline) > cat /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/DeCoSTAR/DeCoSTAR_components.csv +node0:580:0.5 562:524:7:119 +node1:607:0.5 532:426:7:369 +node2:538:0.5 362:243:0:590 +(AGO-pipeline) > cat /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/ecceTERA_DeCoSTAR/ecceTERA_DeCoSTAR_adjacencies_components.csv #species nb_comp:nb_lin_comp:nb_circ_comp:list(nb_genes.nb_adj.nb_comp) node2 292:292:0:11.10.1,10.9.1,9.8.1,8.7.1,7.6.3,6.5.5,5.4.4,4.3.14,3.2.24,2.1.63,1.0.175 -node1 172:164:7:29.31.1,19.18.1,18.17.1,15.14.1,14.13.1,10.10.3,10.9.1,9.9.2,9.8.4,8.7.8,7.6.5,6.6.2,6.5.6,5.4.8,4.3.13,3.2.18,2.1.28,1.0.69 +node1 171:163:7:29.31.1,19.18.1,18.17.1,15.14.1,14.13.1,10.10.3,10.9.1,9.9.2,9.8.4,8.7.8,7.6.5,6.6.2,6.5.7,5.4.8,4.3.12,3.2.18,2.1.27,1.0.69 AnophelesalbimanusSTECLA 1:1:0:545.544.1 
-node0 55:51:3:68.67.1,60.59.1,47.46.1,42.41.1,35.35.1,31.31.1,26.25.1,22.21.1,21.22.1,17.16.1,16.15.1,15.14.1,14.13.1,12.11.1,11.11.1,11.10.2,10.9.1,9.8.3,8.7.2,7.6.1,6.5.1,5.4.3,4.3.1,3.2.2,2.1.6,1.0.18 +node0 54:50:3:68.67.1,60.59.1,47.46.1,43.42.1,35.35.1,31.31.1,26.25.1,22.21.1,21.22.1,17.16.1,16.15.1,15.14.1,14.13.1,12.11.1,11.11.1,11.10.2,10.9.1,9.8.3,8.7.2,7.6.1,6.5.1,5.4.3,4.3.1,3.2.2,2.1.6,1.0.17 AnophelesatroparvusEBRO 1:1:0:577.576.1 AnophelesgambiaePEST 1:1:0:564.563.1 AnophelesfunestusFUMOZ 1:1:0:579.578.1 ``` -As in previous experiments, there is a low level of conflict and a -high number of free gene extremities and adjacencies component +The ancestral gene content is more reasonable than for the GeneRax pipeline. +There is a very low level of conflict and a +high number of free gene extremities and adjacency components, indicating the CARs will be highly fragmented. We clean conflicts using `spp_dcj` and generate CARs files. ``` -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml slurm SPPDCJ_ILP +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml slurm SPPDCJ_ILP /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/SPPDCJ_ILP/SPPDCJ_ILP.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/SPPDCJ_ILP/SPPDCJ_ILP.sh ... wait for the slurm processes to complete ... 
-(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml check SPPDCJ_ILP +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml check SPPDCJ_ILP ERRORS: 0 LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/log/SPPDCJ_ILP.log OUTPUT: No output file is created -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml slurm SPPDCJ +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml slurm SPPDCJ /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/SPPDCJ/SPPDCJ.sh (AGO-pipeline) > sbatch /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/aux/SPPDCJ/SPPDCJ.sh ... wait for the slurm processes to complete ... -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml check SPPDCJ +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml check SPPDCJ ERRORS: 0 - LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/log/SPPDCJ.log - OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_GeneRax/data/adjacencies_ago_X_3.txt -(AGO-pipeline) > python src/AGO.py example/anopheles_X_3_ecceTERA.yaml stats SPPDCJ + LOG: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/log/SPPDCJ.log + OUTPUT: /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/adjacencies_ago_X_3.txt +(AGO-pipeline) > python src/AGO.py example/anopheles_X_ecceTERA.yaml stats SPPDCJ /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/SPPDCJ/SPPDCJ_species.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/SPPDCJ/SPPDCJ_CARs.csv /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/data/CARs.txt (AGO-pipeline) > cat 
/home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/SPPDCJ/SPPDCJ_species.csv #species number of adjacencies:total weight:kept adjacencies:kept weight -node2 246:244.32:245:244.01 -node1 446:428.68:422:417.51 +node2 246:244.28:243:243.0 +node1 447:428.52:419:414.97 AnophelesalbimanusSTECLA 544:544.0:544:544.0 -node0 530:521.07:518:514.8 +node0 531:521.17:518:515.04 AnophelesatroparvusEBRO 576:576.0:576:576.0 AnophelesgambiaePEST 563:563.0:563:563.0 AnophelesfunestusFUMOZ 578:578.0:578:578.0 (AGO-pipeline) > cat /home/chauvec/projects/ctb-chauvec/AGO-pipeline/example/anopheles_X_3_ecceTERA/statistics/SPPDCJ/SPPDCJ_CARs.csv #species nb_comp:nb_lin_comp:nb_circ_comp:list(nb_genes.nb_adj.nb_comp) -node2 293:293:0:10.9.2,9.8.1,8.7.1,7.6.3,6.5.5,5.4.4,4.3.14,3.2.24,2.1.63,1.0.176 -node1 185:185:0:21.20.1,18.17.1,15.14.1,14.13.1,11.10.1,10.9.1,9.8.5,8.7.11,7.6.5,6.5.9,5.4.9,4.3.14,3.2.19,2.1.31,1.0.76 +node2 295:295:0:10.9.2,9.8.1,8.7.1,7.6.2,6.5.4,5.4.4,4.3.16,3.2.25,2.1.64,1.0.176 +node1 188:188:0:23.22.1,18.17.1,15.14.1,14.13.1,11.10.1,10.9.1,9.8.6,8.7.10,7.6.6,6.5.6,5.4.10,4.3.13,3.2.20,2.1.31,1.0.80 AnophelesalbimanusSTECLA 1:1:0:545.544.1 -node0 62:62:0:68.67.1,57.56.1,47.46.1,42.41.1,35.34.1,31.30.1,26.25.1,22.21.1,17.16.1,16.15.1,15.14.1,14.13.1,12.11.1,11.10.3,10.9.2,9.8.3,8.7.1,7.6.2,6.5.2,5.4.2,4.3.2,3.2.3,2.1.8,1.0.21 +node0 62:62:0:68.67.1,47.46.1,42.41.1,41.40.1,35.34.1,31.30.1,26.25.1,22.21.1,17.16.1,16.15.2,15.14.1,14.13.1,12.11.2,11.10.2,10.9.2,9.8.3,8.7.1,7.6.3,6.5.1,5.4.2,4.3.1,3.2.4,2.1.8,1.0.20 AnophelesatroparvusEBRO 1:1:0:577.576.1 AnophelesgambiaePEST 1:1:0:564.563.1 AnophelesfunestusFUMOZ 1:1:0:579.578.1 diff --git a/example/anopheles_X_GeneRax.yaml b/example/anopheles_X_GeneRax.yaml index 2d6a51f..528932f 100644 --- a/example/anopheles_X_GeneRax.yaml +++ b/example/anopheles_X_GeneRax.yaml @@ -45,6 +45,8 @@ data: NT_ext: &data_alignments_NT_ext '_NT.fasta' AA_ext: &data_alignments_AA_ext '_AA.fasta' ext: 
&data_alignments_ext !ref [*data_alignments_NT_ext] + gene_trees: + path: &data_gene_trees_path !join [*dir_data, 'gene_trees_X_3.txt'] reconciliations: path: &data_reconciliations_path !join [*dir_data, 'reconciliations_X_3.txt'] ext: &data_reconciliations_ext '.recphyloxml' @@ -234,7 +236,8 @@ tools: - !concat [!join [*run_dir_scripts, 'GeneRax_reformat.py'], ' \'] - !concat [' ', *generax_input_file, ' \'] - !concat [' ', *generax_results_dir, ' \'] - - !concat [' ', *data_reconciliations_ext] + - !concat [' ', *data_reconciliations_ext, ' \'] + - !concat [' ', *data_gene_trees_path] stats: names: - &generax_stats_file_species_name !concat [*generax_name, '_species', *log_ext_stat] diff --git a/example/anopheles_X_GeneRax_header.yaml b/example/anopheles_X_GeneRax_header.yaml index 65a2f53..c4cdfa3 100644 --- a/example/anopheles_X_GeneRax_header.yaml +++ b/example/anopheles_X_GeneRax_header.yaml @@ -45,6 +45,8 @@ data: NT_ext: &data_alignments_NT_ext '_NT.fasta' AA_ext: &data_alignments_AA_ext '_AA.fasta' ext: &data_alignments_ext !ref [*data_alignments_NT_ext] + gene_trees: + path: &data_gene_trees_path !join [*dir_data, 'gene_trees_X_3.txt'] reconciliations: path: &data_reconciliations_path !join [*dir_data, 'reconciliations_X_3.txt'] ext: &data_reconciliations_ext '.recphyloxml' From 7b494e5da7f9dd82cc8fc07fb400c9ccdb0f612a Mon Sep 17 00:00:00 2001 From: Cedric Chauve Date: Thu, 27 Apr 2023 11:04:28 -0700 Subject: [PATCH 6/6] Experiments --- example/anopheles_X_ecceTERA.yaml | 47 +++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/example/anopheles_X_ecceTERA.yaml b/example/anopheles_X_ecceTERA.yaml index 8737495..ab0ce91 100644 --- a/example/anopheles_X_ecceTERA.yaml +++ b/example/anopheles_X_ecceTERA.yaml @@ -40,6 +40,9 @@ data: gene_trees: path: &data_gene_trees_path !join [*run_dir_root, 'example', 'anopheles_X_3_ALE', 'data', 'gene_trees_X_3.txt'] # Paths to computed output files (generated data, no 
reconciliation) + reconciliations: + path: &data_reconciliations_path !join [*dir_data, 'reconciliations_X_3.txt'] + ext: &data_reconciliations_ext '.recphyloxml' adjacencies: path: &data_adjacencies_path !join [*dir_data, 'adjacencies_X_3.txt'] ext: &data_adjacencies_ext '_adjacencies.txt' @@ -121,8 +124,8 @@ log: tools: # DO NOT EDIT # DeCoSTAR: computing ancestral adjacencies - DeCoSTAR: - name: &decostar_name 'DeCoSTAR' + ecceTERA_DeCoSTAR: + name: &decostar_name 'ecceTERA_DeCoSTAR' input: dirs: - &decostar_input_dir !join [*dir_aux, *decostar_name] @@ -156,10 +159,12 @@ tools: files: - &decostar_slurm_results_species_adjacencies !join [*decostar_results_dir, *decostar_slurm_results_species_adjacencies_name] other: - - &decostar_slurm_results_adjacencies_file !join [*decostar_results_dir, 'adjacencies.txt'] - - &decostar_slurm_results_genes_file_1 !join [*decostar_results_dir, 'genes.txt'] - - &decostar_slurm_results_genes_file_2 !join [*decostar_results_dir, 'genes_reformatted.txt'] - - &decostar_slurm_results_species_file !join [*decostar_results_dir, 'species.txt'] + - &decostar_slurm_results_adjacencies_file !join [*decostar_results_dir, 'adjacencies.txt'] + - &decostar_slurm_results_genes_file_1 !join [*decostar_results_dir, 'genes.txt'] + - &decostar_slurm_results_genes_file_2 !join [*decostar_results_dir, 'genes_reformatted.txt'] + - &decostar_slurm_results_species_file !join [*decostar_results_dir, 'species.txt'] + - &decostar_slurm_results_species_tree_file !join [*decostar_results_dir, 'speciesTree.phyloxml'] + - &decostar_slurm_results_reconciliations_file !join [*decostar_results_dir, 'reconciliations.xml'] cmd: - !concat [*decostar_exec, ' \'] # Input files @@ -204,27 +209,45 @@ tools: - !concat [' ', *decostar_slurm_results_adjacencies_file, ' \'] - !concat [' ', *decostar_slurm_results_genes_file_2, ' \'] - !concat [' ', *decostar_results_dir] + - ' ' + - !concat [!join [*run_dir_scripts, 'DeCoSTAR_ecceTERA_reformat.py'], ' \'] + - !concat [' ', 
*decostar_slurm_results_genes_file_1, ' \'] + - !concat [' ', *decostar_slurm_results_genes_file_2, ' \'] + - !concat [' ', *decostar_slurm_results_species_tree_file, ' \'] + - !concat [' ', *decostar_slurm_results_reconciliations_file, ' \'] + - !concat [' ', *decostar_results_dir, ' \'] + - !concat [' ', *data_reconciliations_ext, ' \'] + - !concat [' ', *data_reconciliations_path] stats: names: - - &decostar_stats_file_species_name !concat [*decostar_name, '_species', *log_ext_stat] - - &decostar_stats_file_components_name !concat [*decostar_name, '_components', *log_ext_stat] + - &decostar_stats_file_adj_species_name !concat [*decostar_name, '_adjacencies_species', *log_ext_stat] + - &decostar_stats_file_adj_components_name !concat [*decostar_name, '_adjacencies_components', *log_ext_stat] + - &decostar_stats_file_rec_species_name !concat [*decostar_name, '_reconciliations_species', *log_ext_stat] + - &decostar_stats_file_rec_families_name !concat [*decostar_name, '_reconciliations_families', *log_ext_stat] files: - - &decostar_stats_file_species !join [*dir_stats, *decostar_name, *decostar_stats_file_species_name] - - &decostar_stats_file_components !join [*dir_stats, *decostar_name, *decostar_stats_file_components_name] + - &decostar_stats_file_adj_species !join [*dir_stats, *decostar_name, *decostar_stats_file_adj_species_name] + - &decostar_stats_file_adj_components !join [*dir_stats, *decostar_name, *decostar_stats_file_adj_components_name] + - &decostar_stats_file_rec_species !join [*dir_stats, *decostar_name, *decostar_stats_file_rec_species_name] + - &decostar_stats_file_rec_families !join [*dir_stats, *decostar_name, *decostar_stats_file_rec_families_name] script: + - !join [*run_dir_scripts, 'recPhyloXML_statistics.py'] + - !ref [*data_reconciliations_path] + - !ref [*decostar_stats_file_rec_species] + - !ref [*decostar_stats_file_rec_families] + - !concat [';'] - !join [*run_dir_scripts, 'DeCoSTAR_statistics.py'] - !ref [*data_species_path] - !ref 
[*decostar_slurm_results_genes_file_2] - !ref [*data_adjacencies_path] - !ref [*decostar_stats_thresholds] - - !ref [*decostar_stats_file_species] + - !ref [*decostar_stats_file_adj_species] - !concat [';'] - !join [*run_dir_scripts, 'gene_orders_utils.py'] - 'stats' - !ref [*decostar_slurm_results_genes_file_2] - !ref [*data_adjacencies_path] - !ref [*decostar_results_dir] - - !ref [*decostar_stats_file_components] + - !ref [*decostar_stats_file_adj_components] # DO NOT EDIT # SPPDCJ_ILP: Writing the SPPDCJ ILP file SPPDCJ_ILP: