diff --git a/scripts/ALE_reformat.py b/scripts/ALE_reformat.py index 43f57de..0847f3c 100755 --- a/scripts/ALE_reformat.py +++ b/scripts/ALE_reformat.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys @@ -38,16 +38,25 @@ def main(): in_rec_xml_file = sys.argv[3] out_rec_xml_file = sys.argv[4] + # Creates a map from ALE species names to original species names species_map = newick_create_species_map(in_ale_species_tree, in_data_species_tree) + # Read an ALE recPhyloXML file tree = ET.parse(in_rec_xml_file) + # If the file includes an HGT, we do not create a reformatted file if not xml_check_transfer(tree, xml_ALE_identify_transfer): + # Rename species xml_rename_species(tree, species_map) + # Reformat losses xml_rename_losses(tree, xml_ALE_identify_loss, 'loss') - _ = xml_rename_ancestral_genes(tree, xml_ALE_identify_ancestral_gene, start_id=1) - out_rec_xml_file_tmp = f'{out_rec_xml_file}_tmp' - tree.write(out_rec_xml_file_tmp) - _ = xml_reformat_file(out_rec_xml_file_tmp, out_rec_xml_file) - os.remove(out_rec_xml_file_tmp) + # Reformat ancestral gene names with integers ID starting at 1 + _ = xml_rename_ancestral_genes(tree, xml_ALE_identify_ancestral_gene, start_id=1) + # Create a temporary recPhyloXML file + tmp_rec_file = f'{out_rec_xml_file}_tmp' + tree.write(tmp_rec_file) + # Reformat the temporary recPhyloXML file int the final recPhyloXML file + _ = xml_reformat_file(tmp_rec_file, out_rec_xml_file) + # Delete the temporary recPhyloXML file + os.remove(tmp_rec_file) if __name__ == "__main__": main() diff --git a/scripts/DeCoSTAR_create_input_files.py b/scripts/DeCoSTAR_create_input_files.py index 3742b00..f0a9922 100755 --- a/scripts/DeCoSTAR_create_input_files.py +++ b/scripts/DeCoSTAR_create_input_files.py @@ -5,35 +5,34 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys from data_utils import ( data_gene2family, - data_species2gene_order_path, - data_family2reconciliation_path, + data_index2path, data_read_gene_order_file ) ''' Read a gene order file and returns an order list of adjacencies strings ''' def decostar_gene_order2adjacencies_str( - in_gene_order_file, in_gene2family_map, in_reconciled_families + in_gene_order_file, in_gene2family_map, in_families ): ''' input: - path to gene order file - map of gene to family ID - - list of reconciled families to consider - (genes from other families are not accounted for) + - list of families to consider (genes from other families are not accounted for) output: list(str) ''' orientation = {'1': '+', '0': '-'} adjacencies = [] gene_order = data_read_gene_order_file(in_gene_order_file) + in_gene2family_keys = list(in_gene2family_map.keys()) prev_gene = None for (gene_name,gene_chr,_,_,gene_sign) in gene_order: - if in_gene2family_map[gene_name] in in_reconciled_families: + if gene_name in in_gene2family_keys and in_gene2family_map[gene_name] in in_families: gene_orientation = orientation[gene_sign] if prev_gene is not None and prev_gene[1] == gene_chr: adjacency = [ @@ -45,46 +44,44 @@ def decostar_gene_order2adjacencies_str( return adjacencies ''' -Creates the DeCoSTAR input gene trees distribution file -Compute a list of families for which the reconciled gene tree is available +Creates the DeCoSTAR input (reconciled) gene trees distribution file +Compute a list of families for which the (reconciled) gene tree is provided ''' -def create_gene_distribution_file(in_reconciliations_file, out_trees_file): +def decostar_create_gene_distribution_file(in_trees_file, out_trees_file): ''' input: - - dataset file with link from family to reconciled gene tree + - dataset file family(reconciled) gene trees output: - creates out_trees_file - - list(str) of families for which a reonciliation is available + - list(str) of families for which a (reconciled) gene tree is available ''' - family2reconciliation_path = data_family2reconciliation_path( - in_reconciliations_file - ) - reconciled_families = [] - with open(out_trees_file, 'w') as trees: - for fam_id,reconciliation_file in family2reconciliation_path.items(): - trees.write(f'{reconciliation_file}\n') - reconciled_families.append(fam_id) - return reconciled_families + family2trees_path = data_index2path(in_trees_file) + families = [] + with open(out_trees_file, 'w') as out_trees: + for fam_id,trees_file in family2trees_path.items(): + out_trees.write(f'{trees_file}\n') + families.append(fam_id) + return families ''' Creates the DeCoSTAR input adjacencies file ''' -def create_adjacencies_file( - in_gene_orders_file, in_families_file, in_reconciled_families, +def decostar_create_adjacencies_file( + in_gene_orders_file, in_families_file, in_trees_families, out_adjacencies_file ): ''' input: - dataset file with link from species to gene order file - dataset families file - - list of reconciled families + - list of families with a (reconciled) tree provided as input output: creates out_adjacencies_file ''' - gene_order_files= data_species2gene_order_path(in_gene_orders_file) + gene_order_files= data_index2path(in_gene_orders_file) gene2family_map = data_gene2family(in_families_file) with open(out_adjacencies_file, 'w') as out_adjacencies: for species,gene_order_file in gene_order_files.items(): species_adjacencies = decostar_gene_order2adjacencies_str( - gene_order_file, gene2family_map, in_reconciled_families + gene_order_file, gene2family_map, in_trees_families ) for adjacency in species_adjacencies: out_adjacencies.write(f'{adjacency}\n') @@ -93,13 +90,17 @@ def create_adjacencies_file( def main(): in_gene_orders_file = sys.argv[1] - in_reconciliations_file = sys.argv[2] + in_trees_file = sys.argv[2] in_families_file = sys.argv[3] out_adjacencies_file = sys.argv[4] out_trees_file = sys.argv[5] - reconciled_families = create_gene_distribution_file(in_reconciliations_file, out_trees_file) - create_adjacencies_file(in_gene_orders_file, in_families_file, reconciled_families, out_adjacencies_file) + # Creates the gene trees distribution file (reconciliations or samples of gene trees) + trees_families = decostar_create_gene_distribution_file(in_trees_file, out_trees_file) + # Creates the adjacencies file + decostar_create_adjacencies_file( + in_gene_orders_file, in_families_file, trees_families, out_adjacencies_file + ) if __name__ == "__main__": main() diff --git a/scripts/DeCoSTAR_ecceTERA_reformat.py b/scripts/DeCoSTAR_ecceTERA_reformat.py index 0568aac..8ccf18f 100755 --- a/scripts/DeCoSTAR_ecceTERA_reformat.py +++ b/scripts/DeCoSTAR_ecceTERA_reformat.py @@ -5,14 +5,13 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys import os import xml.etree.ElementTree as ET - from recPhyloXML_utils import xml_rename_species ''' @@ -25,11 +24,11 @@ def eccetera_read_results(in_genes_file1, in_genes_file2, sep='|'): input: - original and reformatted genes file output: - - dict(ecceTERA species -> data species) - - dict(ecceTERA family -> data family) + - dict(ecceTERA species -> original species) + - dict(ecceTERA family -> original family) ''' species_dict,families_dict = {},{} - # Reading original genes + # Reading original DeCoSTAR genes data,data_idx = {},0 with open(in_genes_file1, 'r') as in_genes: for gene_data in in_genes.readlines(): @@ -117,9 +116,9 @@ def eccetera_write_reconciliations(in_reconciliations_files, species_map, out_di out_reconciliation_file = os.path.join(out_dir, f'{family_id}{out_xml_ext}') tree.write(out_reconciliation_file) out_file.write(f'{family_id}\t{out_reconciliation_file}\n') + os.remove(tmp_rec_file) - def main(): in_genes_file1 = sys.argv[1] # DeCoSTAR genes file in_genes_file2 = sys.argv[2] # Reformatted genes file @@ -129,11 +128,17 @@ def main(): out_xml_ext = sys.argv[6] # Extension to recPhyloXML files out_reconciliations_file = sys.argv[7] # Data set reconciliations file + # Creates a map from DeCoSTAR species names to original species names species_map,families_map = eccetera_read_results(in_genes_file1, in_genes_file2) + # Read the species tree in XML format created by DeCoSTAR xml_sp_str = eccetera_read_species_tree(in_sp_xml_file) + # Creates one temporary recPhyloXML file per family extracted from the + # DeCoSTAR reocnciliations file tmp_reconciliations_files = eccetera_read_reconciliations( in_reconciliations_file, xml_sp_str, families_map, out_dir ) + # Reformat the temporary rcPhyloXML files and writes a reconciliations file + # () eccetera_write_reconciliations( tmp_reconciliations_files, species_map, out_dir, out_xml_ext, out_reconciliations_file ) diff --git a/scripts/DeCoSTAR_reformat.py b/scripts/DeCoSTAR_reformat.py index 977e920..df97367 100755 --- a/scripts/DeCoSTAR_reformat.py +++ b/scripts/DeCoSTAR_reformat.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys @@ -16,7 +16,7 @@ data_create_equivalence_map, data_species_map, data_gene2family, - data_reconciliation_path2family + data_path2index ) from recPhyloXML_utils import ( xml_get_gene_tree_root, @@ -71,7 +71,7 @@ def decostar_family_map( output: dict(str->str) from DeCoSTAR family ID to original family ID ''' - reconciliation2family = data_reconciliation_path2family( + reconciliation2family = data_path2index( in_input_file ) family_map = {} @@ -326,9 +326,11 @@ def main(): out_genes_file = sys.argv[9] out_adjacencies_dir = sys.argv[10] + # Creates a map from DeCoSTAR species names to original species names species_map = decostar_species_map( in_species_file, in_decostar_species_file ) + # Reformat the genes file created by DeCoSTAR to rename genes, families and species genes_map = decostar_reformat_genes( species_map, in_families_file, in_reconciliations_file, @@ -336,6 +338,7 @@ def main(): out_genes_file, already_reconciled = in_already_reconciled ) + # Reformat the adjacencies file created by DeCoSTAR to rename genes, families and species decostar_reformat_adjacencies_file( species_map, genes_map, in_adjacencies_file, out_adjacencies_dir diff --git a/scripts/DeCoSTAR_statistics.py b/scripts/DeCoSTAR_statistics.py index 9cd4235..59d5567 100755 --- a/scripts/DeCoSTAR_statistics.py +++ b/scripts/DeCoSTAR_statistics.py @@ -5,14 +5,14 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys from data_utils import ( data_species_list, - data_species2adjacencies_path + data_index2path ) from DeCoSTAR_reformat import decostar_read_adjacencies @@ -60,7 +60,7 @@ def decostar_read_results(in_genes_file, in_adjacencies_file, in_species_list): for (species,gene) in species_gene: adjacencies_dicts[species][gene] = {'h': [], 't': []} ## Populate dictionaries from species to adjacency tabulated file - species2adjacencies = data_species2adjacencies_path(in_adjacencies_file) + species2adjacencies = data_index2path(in_adjacencies_file) for species,in_adjacencies_file in species2adjacencies.items(): in_adjacencies = decostar_read_adjacencies( in_adjacencies_file, species=species diff --git a/scripts/GeneRax_create_input_files.py b/scripts/GeneRax_create_input_files.py index 2fb8025..fab1c88 100755 --- a/scripts/GeneRax_create_input_files.py +++ b/scripts/GeneRax_create_input_files.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys @@ -14,11 +14,11 @@ from data_utils import ( data_family2genes, data_gene2species, - data_family2alignment_path + data_index2path ) ''' Write the GeneRax families file from paths to alignments ''' -def GeneRax_write_families_file( +def generax_write_families_file( in_family2genes, in_gene2species, in_family2alignment, in_subst_model, out_map_files_dir, out_families_file @@ -58,11 +58,9 @@ def main(): # Read all genes gene2species = data_gene2species(in_gene_orders_file) # Read alignmed families - family2alignment = data_family2alignment_path( - in_alignments_file, in_suffix - ) + family2alignment = data_index2path(in_alignments_file) # Create GeneRax input files - GeneRax_write_families_file( + generax_write_families_file( family2genes, gene2species, family2alignment, in_subst_model, out_map_files_dir, out_families_file ) diff --git a/scripts/GeneRax_reformat.py b/scripts/GeneRax_reformat.py index 08bb320..0282015 100755 --- a/scripts/GeneRax_reformat.py +++ b/scripts/GeneRax_reformat.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys @@ -13,51 +13,85 @@ from recPhyloXML_utils import xml_reformat_file -def get_families_from_input(in_GeneRax_families_file): - with open(in_GeneRax_families_file, 'r') as in_file: - families = [ +# Suffix of reconciliation files created by GeneRax +GENERAX_XML_SUFFIX = '_reconciliated.xml' +# Nam of a GeneRax gene tree in Newick format +GENERAX_GT_NEWICK = 'geneTree.newick' + +''' Read families ID from a GeneRax families file ''' +def generax_get_families_from_input(in_generax_families_file): + ''' + input: path to GeneRax families file + output: list of families ID + ''' + with open(in_generax_families_file, 'r') as in_file: + generax_families = [ f.rstrip()[2:] for f in in_file.readlines() if f.startswith('- ') ] - return families + return generax_families -def reformat_reconciliations(families, results_dir, suffix): +''' Reformat the recPhyloXML files created by GeneRax ''' +def generax_reformat_reconciliations(generax_families, results_dir, suffix): + ''' + input: + - list of GeneRax families ID + - directory containing GeneRax results + - suffix of refortmated reconciliations files + output: + - creates one reformatted file for every GeneRax reconciliation + - labels all ancestral genes by integers in increasing order (starting at 0) + ''' current_gene_id = 0 # current gene number - for fam_id in families: + for fam_id in generax_families: in_file = os.path.join( - results_dir, 'reconciliations', - f'{fam_id}_reconciliated.xml' + results_dir, 'reconciliations', f'{fam_id}{GENERAX_XML_SUFFIX}' ) out_file = os.path.join( - results_dir, 'reconciliations', - f'{fam_id}{suffix}' + results_dir, 'reconciliations', f'{fam_id}{suffix}' ) if os.path.isfile(in_file): current_gene_id = xml_reformat_file( - in_file, out_file, - start_id=current_gene_id + in_file, out_file, start_id=current_gene_id ) -def create_gene_trees_file(families, results_dir, out_gene_trees_file): +''' Creates a tabulated file ''' +def generax_create_gene_trees_file(generax_families, results_dir, out_gene_trees_file): + ''' + input: + - list of families ID + - directory containing GeneRax results + - tabulated file to create + output: + - creates out_gene_trees_file + ''' with open(out_gene_trees_file, 'w') as out_file: - for fam_id in families: - in_file = os.path.join( - results_dir, 'results', fam_id, - 'geneTree.newick' + for fam_id in generax_families: + gt_file = os.path.join( + results_dir, 'results', fam_id, GENERAX_GT_NEWICK ) - if os.path.isfile(in_file): - out_file.write(f'{fam_id}\t{in_file}\n') + if os.path.isfile(gt_file): + out_file.write(f'{fam_id}\t{gt_file}\n') def main(): - in_GeneRax_families_file = sys.argv[1] + in_generax_families_file = sys.argv[1] results_dir = sys.argv[2] rec_ext = sys.argv[3] out_gene_trees_file = sys.argv[4] - families = get_families_from_input(in_GeneRax_families_file) - reformat_reconciliations(families, results_dir, rec_ext) - create_gene_trees_file(families, results_dir, out_gene_trees_file) + # Read GeneRax families ID from GenRax families file + generax_families = generax_get_families_from_input( + in_generax_families_file + ) + # Reformat GeneRax reconciliations files + generax_reformat_reconciliations( + generax_families, results_dir, rec_ext + ) + # Creates a gene trees file from GeneRax gene trees + generax_create_gene_trees_file( + generax_families, results_dir, out_gene_trees_file + ) if __name__ == "__main__": main() diff --git a/scripts/SPPDCJ_create_input_files.py b/scripts/SPPDCJ_create_input_files.py index 1d9578c..0d621e5 100755 --- a/scripts/SPPDCJ_create_input_files.py +++ b/scripts/SPPDCJ_create_input_files.py @@ -5,13 +5,13 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "0.99" -__status__ = "Development" +__version__ = "1.0.3" +__status__ = "Released" import sys import ete3 -from data_utils import data_species2adjacencies_path +from data_utils import data_index2path from newick_utils import ( newick_get_children_map, newick_get_lca_species @@ -63,7 +63,7 @@ def split_gene(gene): species2adjacencies_file = [ (species,adj_path) - for (species,adj_path) in data_species2adjacencies_path( + for (species,adj_path) in data_index2path( in_adjacencies_file ).items() if (species_list is None or species in species_list) @@ -105,18 +105,20 @@ def main(): out_species_tree = sys.argv[5] out_adjacencies_file = sys.argv[6] - if in_extant_species == 'all': + # Define species to consider + if in_extant_species == 'all':# Consider all species species_list = None - else: + else:# Consider species covered by the LCA of in_extant_species species_list = newick_get_lca_species( in_species_tree, in_extant_species.split() ) - + # Creates an SPP-DCJ species tree sppdcj_species_trees( in_species_tree, out_species_tree, species_list=species_list ) + # Creates an SPP-DCJ adjacencies file sppdcj_adjacencies( in_adjacencies_file, in_weight_threshold, out_adjacencies_file, diff --git a/scripts/SPPDCJ_reformat.py b/scripts/SPPDCJ_reformat.py index ac4b17d..ed1c1a9 100755 --- a/scripts/SPPDCJ_reformat.py +++ b/scripts/SPPDCJ_reformat.py @@ -5,13 +5,13 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys import os -from data_utils import data_species2adjacencies_path +from data_utils import data_index2path from DeCoSTAR_reformat import ( decostar_sep, @@ -21,7 +21,7 @@ from DeCoSTAR_statistics import decostar_sign2extremity ''' Read the SPPDCJ adajcencies file and create a species-indexed dictionary ''' -def read_sppdcj_adjacencies(in_adjacencies_file): +def sppdcj_read_sppdcj_adjacencies(in_adjacencies_file): ''' input: SPPDCJ adjacencies file output: dict(species -> dict((gene1,ext1,gene2,ext2)->weight) @@ -48,12 +48,12 @@ def read_sppdcj_adjacencies(in_adjacencies_file): return adjacencies_dict ''' Read the DeCoSTAR adajcencies file and create a species-indexed dictionary ''' -def read_decostar_adjacencies(in_data_adjacencies_file): +def sppdcj_read_decostar_adjacencies(in_data_adjacencies_file): ''' input: data adjacencies file (speciesDeCoSTAR adjacencies file output: dict(species -> dict((gene1,ext1,gene2,ext2)->(gene1,sign1,gene2,sign2,weight1,weight2)) ''' - species2adjacencies_files = data_species2adjacencies_path(in_data_adjacencies_file) + species2adjacencies_files = data_index2path(in_data_adjacencies_file) species2adjacencies_aux = {} for species,adjacencies_file in species2adjacencies_files.items(): species2adjacencies_aux[species] = decostar_read_adjacencies(adjacencies_file, species=species) @@ -72,7 +72,7 @@ def read_decostar_adjacencies(in_data_adjacencies_file): return adjacencies_dict ''' Filter DeCoSTAR adjacencies discarded by SPPDCJ ''' -def filter_adjacencies(in_decostar_adjacencies, in_sppdcj_adjacencies): +def sppdcj_filter_adjacencies(in_decostar_adjacencies, in_sppdcj_adjacencies): ''' input: dictionaries dict(species -> dict((gene1,gene2,ext1,ext2)->weight) @@ -90,7 +90,7 @@ def filter_adjacencies(in_decostar_adjacencies, in_sppdcj_adjacencies): return out_adjacencies ''' Write the filtered adjacncies ''' -def write_adjacencies(adjacencies_dict,out_dir): +def sppdcj_write_adjacencies(adjacencies_dict,out_dir): for species,adjacencies_dict in adjacencies_dict.items(): out_adjacencies_path = os.path.join(out_dir, f'{species}_adjacencies.txt') with open(out_adjacencies_path,'w') as out_adjacencies_file: @@ -102,13 +102,20 @@ def main(): in_data_adjacencies_file = sys.argv[1] in_sppdcj_adjacencies_file = sys.argv[2] out_dir = sys.argv[3] + # Reading SPPDCJ adjacencies - sppdcj_adjacencies = read_sppdcj_adjacencies(in_sppdcj_adjacencies_file) + sppdcj_adjacencies = sppdcj_read_sppdcj_adjacencies( + in_sppdcj_adjacencies_file + ) # Reading DeCoSTAR adjacencies - decostar_adjacencies = read_decostar_adjacencies(in_data_adjacencies_file) + decostar_adjacencies = sppdcj_read_decostar_adjacencies( + in_data_adjacencies_file + ) # Filter DeCoSTAR adjacencies - filtered_adjacencies = filter_adjacencies(decostar_adjacencies,sppdcj_adjacencies) - write_adjacencies(filtered_adjacencies,out_dir) + filtered_adjacencies = sppdcj_filter_adjacencies( + decostar_adjacencies,sppdcj_adjacencies + ) + sppdcj_write_adjacencies(filtered_adjacencies,out_dir) if __name__ == "__main__": main() diff --git a/scripts/SPPDCJ_statistics.py b/scripts/SPPDCJ_statistics.py index 36fbeb3..e5c0061 100755 --- a/scripts/SPPDCJ_statistics.py +++ b/scripts/SPPDCJ_statistics.py @@ -5,19 +5,19 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys import os from SPPDCJ_reformat import ( - read_sppdcj_adjacencies, - read_decostar_adjacencies, - filter_adjacencies + sppdcj_read_sppdcj_adjacencies, + sppdcj_read_decostar_adjacencies, + sppdcj_filter_adjacencies ) -def SPPDCJ_statistics(decostar_adjacencies, filtered_adjacencies): +def sppdcj_statistics(decostar_adjacencies, filtered_adjacencies): statistics = {} for species,adjacencies in decostar_adjacencies.items(): statistics[species] = {} @@ -32,7 +32,7 @@ def SPPDCJ_statistics(decostar_adjacencies, filtered_adjacencies): statistics[species]['weight2'] = sum_weights return statistics -def write_statistics(statistics, out_statistics_file): +def sppdcj_write_statistics(statistics, out_statistics_file): with open(out_statistics_file,'w') as out_file: out_file.write('#species\tnumber of adjacencies:total weight:kept adjacencies:kept weight') for species,stats in statistics.items(): @@ -45,14 +45,14 @@ def main(): in_sppdcj_adjacencies_file = sys.argv[2] out_statistics_file = sys.argv[3] # Reading SPPDCJ adjacencies - sppdcj_adjacencies = read_sppdcj_adjacencies(in_sppdcj_adjacencies_file) + sppdcj_adjacencies = sppdcj_read_sppdcj_adjacencies(in_sppdcj_adjacencies_file) # Reading DeCoSTAR adjacencies - decostar_adjacencies = read_decostar_adjacencies(in_data_adjacencies_file) + decostar_adjacencies = sppdcj_read_decostar_adjacencies(in_data_adjacencies_file) # Filter DeCoSTAR adjacencies - filtered_adjacencies = filter_adjacencies(decostar_adjacencies,sppdcj_adjacencies) + filtered_adjacencies = sppdcj_filter_adjacencies(decostar_adjacencies,sppdcj_adjacencies) # Statistics - statistics = SPPDCJ_statistics(decostar_adjacencies, filtered_adjacencies) - write_statistics(statistics, out_statistics_file) + statistics = sppdcj_statistics(decostar_adjacencies, filtered_adjacencies) + sppdcj_write_statistics(statistics, out_statistics_file) if __name__ == "__main__": main() diff --git a/scripts/data_utils.py b/scripts/data_utils.py index 1731188..0219840 100644 --- a/scripts/data_utils.py +++ b/scripts/data_utils.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys @@ -87,68 +87,14 @@ def data_family2genes(in_families_file): output: dict(family ID -> list(genes)) ''' return _data_create_map(in_families_file, sep2=' ') - -''' Creates a map from species to gene order file ''' -def data_species2gene_order_path(in_gene_orders_file): - ''' - input: path to dataset gene orders file - output: dict(species -> gene order file path) - ''' - return _data_create_map(in_gene_orders_file) - -''' Creates a map from species to adjacencies file ''' -def data_species2adjacencies_path(in_adjacencies_file): - ''' - input: path to dataset gene orders file - output: dict(species -> adjacencies file path) - ''' - return _data_create_map(in_adjacencies_file) -''' Creates a map from family to sequences file ''' -def data_family2sequences_path(in_sequences_file): - ''' - input: path to dataset sequences file - output: dict(family -> sequences path) - ''' - return _data_create_map(in_reconciliations_file) +''' Creates a map from index to files path ''' +def data_index2path(in_file): + return _data_create_map(in_file) -''' Creates a map from family to alignment file ''' -def data_family2alignment_path(in_alignments_file, in_suffix): - ''' - input: path to dataset alignments file - output: dict(family -> alignment path) if path ends by in_suffix - ''' - family2alignment_path = {} - with open(in_alignments_file, 'r') as alignments: - for family in alignments.readlines(): - fam_id,alignment_file = family.rstrip().split('\t') - if alignment_file.endswith(in_suffix): - family2alignment_path[fam_id] = alignment_file - return family2alignment_path - -''' Creates a map from family to gene tree(s) file ''' -def data_family2genetree_path(in_gene_trees_file): - ''' - input: path to dataset gene trees file - output: dict(family -> gene tree(s) path) - ''' - return _data_create_map(in_gene_trees_file) - -''' Creates a map from family to reconciliation file ''' -def data_family2reconciliation_path(in_reconciliations_file): - ''' - input: path to dataset reconciliations file - output: dict(family -> reconciliation path) - ''' - return _data_create_map(in_reconciliations_file) - -''' Creates a map from reconciliation file to family ''' -def data_reconciliation_path2family(in_reconciliations_file): - ''' - input: path to dataset reconciliations file - output: dict(family -> reconciliation path) - ''' - return _data_create_inverse_map(in_reconciliations_file) +''' Creates a map from files path to index ''' +def data_path2index(in_file): + return _data_create_inverse_map(in_file) ''' Creates a list of species ''' def data_species_list(in_file): @@ -324,7 +270,7 @@ def data_check_gene_orders_file(in_gene_orders_file, species_list, genes_list): - genes errors: 3,[list of genes in gene orders files not in genes list,list of genes in genes list not in gene orders files] - missing files: 4,[missing files] ''' - species2gene_order_path = data_species2gene_order_path(in_gene_orders_file) + species2gene_order_path = data_index2path(in_gene_orders_file) go_species_list = species2gene_order_path.keys() # Checking species lists species_diff = _data_compare_lists(go_species_list,species_list) diff --git a/scripts/fasta_utils.py b/scripts/fasta_utils.py index 509eaaa..287b711 100644 --- a/scripts/fasta_utils.py +++ b/scripts/fasta_utils.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys diff --git a/scripts/gene_orders_utils.py b/scripts/gene_orders_utils.py index 99a7060..602b85c 100755 --- a/scripts/gene_orders_utils.py +++ b/scripts/gene_orders_utils.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import os @@ -13,12 +13,14 @@ from collections import defaultdict import networkx as nx -from data_utils import data_species2adjacencies_path +from data_utils import data_index2path from DeCoSTAR_reformat import decostar_sep from DeCoSTAR_statistics import decostar_sign2extremity +""" Reading input data to create CARs: DeCoSTAR-format genes and adjacencies """ + ''' Read DeCoSTAR genes for all species ''' -def read_DeCoSTAR_genes(in_genes_file): +def go_read_decostar_genes(in_genes_file): ''' input: DeCoSTAR genes file, for all species output: dict(species -> list of genes in format familygene) @@ -32,7 +34,7 @@ def read_DeCoSTAR_genes(in_genes_file): return genes_dict ''' Read adjacencies for a given species ''' -def read_DeCoSTAR_adjacencies_species(in_adjacencies_file): +def go_read_decostar_adjacencies_species(in_adjacencies_file): ''' input: DeCoSTAR adjacencies file, for one species output: list((gene1,sign1,gene2,sign2,weight)) @@ -44,23 +46,33 @@ def read_DeCoSTAR_adjacencies_species(in_adjacencies_file): gene1,sign1 = adjacency_data[0],adjacency_data[2] gene2,sign2 = adjacency_data[1],adjacency_data[3] weight = adjacency_data[5] - adjacencies_list.append((gene1,gene2,sign1,sign2,weight)) + adjacencies_list.append( + (gene1,gene2,sign1,sign2,weight) + ) return adjacencies_list ''' Read adjacencies for all species ''' -def read_DeCoSTAR_adjacencies(in_adjacencies_file): +def go_read_decostar_adjacencies(in_adjacencies_file): ''' input: adjacencies file output: dict(species -> list((gene1,sign1,gene2,sign2,weight))) ''' adjacencies_dict = {} - species2adjacencies_file = data_species2adjacencies_path(in_adjacencies_file) + species2adjacencies_file = data_index2path(in_adjacencies_file) for species,adjacencies_file_path in species2adjacencies_file.items(): - adjacencies_dict[species] = read_DeCoSTAR_adjacencies_species(adjacencies_file_path) + adjacencies_dict[species] = go_read_decostar_adjacencies_species( + adjacencies_file_path + ) return adjacencies_dict -''' Create a graph from genes and adjacencies ''' -def create_adjacencies_graph(in_genes_list, in_adjacencies): +""" Graph creation and manipulation """ + +''' +Create graphs from genes and adjacencies, on per species +Vertices = gene extremities +Edges = adjacencies, each with a weight +''' +def go_create_adjacencies_graph(in_genes_list, in_adjacencies): ''' input: - dict(species -> list of genes list(familygene)) @@ -76,56 +88,80 @@ def create_adjacencies_graph(in_genes_list, in_adjacencies): graphs[species].add_edge((gene,'h'),(gene,'t'),weight=0) for (gene1,gene2,sign1,sign2,weight) in in_adjacencies_list: exts = decostar_sign2extremity[(sign1,sign2)] - graphs[species].add_edge((gene1,exts[0]),(gene2,exts[1]), weight=weight) + graphs[species].add_edge( + (gene1,exts[0]),(gene2,exts[1]), + weight=weight + ) return graphs -''' Graph to FASTA string ''' - -def check_linear(C): - ''' Check if a component is linear ''' +def go_check_linear(C): + ''' Check if a component C (networkx graph) is linear ''' return C.number_of_nodes() == C.number_of_edges()+1 -def get_start_node(C): - ''' Return the first node for a traversal of a component ''' +def go_get_start_node(C): + ''' Return the first node for a traversal of a component C ''' nodes = list(C.nodes) - if check_linear(C): + if go_check_linear(C): return [x for x in nodes if C.degree(x)==1][0] else: return nodes[0] -def signed_gene(edge): +def go_signed_gene(edge): + ''' + Add a "-" before a the gene defined by the first gene extremity + of an edge if this extremity is a tail + ''' if edge[0][1] == 't': return f'{edge[0][0]}' else: return f'-{edge[0][0]}' + + +''' Graph to CARs in FASTA-like format ''' -def traversal2FASTA(C, start_node, species, C_id): - ''' DFS traversal of a component ''' +def go_component2str(C, start_node, species, C_id): + ''' + DFS traversal of a component C (assumed to be linear or circular) + given the first node for the traversal and the ID of the + component. + species is needed to remove the gene name + Creates a FASTA-like string describing the oriented gene order + of the traversal + ''' edges_list = list(nx.dfs_edges(C, source=start_node)) - nodes_list = [signed_gene(edge).replace(f'{species}|','') for edge in edges_list if edge[0][0]==edge[1][0]] + nodes_list = [ + go_signed_gene(edge).replace(f'{species}|','') + for edge in edges_list + if edge[0][0]==edge[1][0] + ] nodes_str = ' '.join(nodes_list) C_str = f'>{C_id}\n{nodes_str}\n' return C_str -def graph2FASTA(in_graph, species): +def go_graph2str(in_graph, species): ''' input: graph, species - output: FASTA str + output: string in FASTA-like format for all components of the graph ''' components = list(nx.connected_components(in_graph)) component_id = 1 - FASTA_str = '' + result = '' for component in components: - subgraph = nx.induced_subgraph(in_graph, component) - start_node = get_start_node(subgraph) - FASTA_str += traversal2FASTA(subgraph, start_node, species, component_id) + C = nx.induced_subgraph(in_graph, component) + start_node = go_get_start_node(C) + result += go_component2str( + C, start_node, species, component_id + ) component_id += 1 - return FASTA_str + return result -def write_FASTA(in_graphs, out_dir, out_file_path): +def go_write_CARs(in_graphs, out_dir, out_file_path): + ''' + Write CARs + ''' with open(out_file_path,'w') as out_data_file: for species,graph in in_graphs.items(): - species_str = graph2FASTA(graph,species) + species_str = go_graph2str(graph,species) species_out_file = os.path.join(out_dir,f'{species}_CARs.txt') out_data_file.write(f'{species}\t{species_out_file}\n') with open(species_out_file,'w') as out_file: @@ -133,12 +169,10 @@ def write_FASTA(in_graphs, out_dir, out_file_path): ''' Statistics about a graph ''' -def graph2stats(in_graph): +def go_graph2stats(in_graph): ''' - input: graph, species - output: - - dict(nb_nodes -> dict(nb_edges -> number of components)) - + input: graph + output: dict(nb_nodes -> dict(nb_edges -> number of components)) ''' components = list(nx.connected_components(in_graph)) components_stats = defaultdict(lambda: defaultdict(int)) @@ -151,7 +185,7 @@ def graph2stats(in_graph): components_stats[nb_genes][nb_adjacencies] += 1 return components_stats -def write_stats(in_stats, out_file_path): +def go_write_stats(in_stats, out_file_path): ''' input: dict(species -> dict(nb_nodes -> dict(nb_edges -> number of components)), stats file ''' @@ -178,20 +212,22 @@ def main(): in_data_adjacencies_file = sys.argv[3] out_dir = sys.argv[4] out_file = sys.argv[5] - # Reading genes - genes = read_DeCoSTAR_genes(in_genes_file) + + # Reading DeCoSTAR genes + genes = go_read_decostar_genes(in_genes_file) # Reading DeCoSTAR adjacencies - adjacencies = read_DeCoSTAR_adjacencies(in_data_adjacencies_file) + adjacencies = go_read_decostar_adjacencies(in_data_adjacencies_file) # Create graphs - graphs = create_adjacencies_graph(genes, adjacencies) + graphs = go_create_adjacencies_graph(genes, adjacencies) if command == 'build': - # Read graphs and write into FASTA - write_FASTA(graphs, out_dir, out_file) + # Read graphs and write CARs + go_write_CARs(graphs, out_dir, out_file) elif command == 'stats': + # Compute and write statistics species_stats = {} for species,graph in graphs.items(): - species_stats[species] = graph2stats(graphs[species]) - write_stats(species_stats, out_file) + species_stats[species] = go_graph2stats(graphs[species]) + go_write_stats(species_stats, out_file) if __name__ == "__main__": main() diff --git a/scripts/newick_utils.py b/scripts/newick_utils.py index e6c4585..b52de04 100755 --- a/scripts/newick_utils.py +++ b/scripts/newick_utils.py @@ -5,16 +5,13 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys from collections import defaultdict import ete3 -NEWICK_EXTANT='extant' -NEWICK_ANCESTRAL='ancestral' - ''' Check that the tree is rooted ''' def newick_check_rooted(species_tree_file): ''' @@ -95,21 +92,6 @@ def newick_remove_internal_names(in_species_tree_file, out_species_tree_file): tree = ete3.Tree(in_species_tree_file, format=1) tree.write(format=5, outfile=out_species_tree_file) -''' Creates a map from species names to species status ''' -def newick_get_species_status(tree_file): - ''' - input: paths to a Newick tree file with internal nodes named - output: dic(str->str) node (species) name -> 'extant', 'ancestral' - ''' - species = {} - tree = ete3.Tree(tree_file, format=1) - for node in tree.traverse(): - if node.is_leaf(): - species[node.name] = NEWICK_EXTANT - else: - species[node.name] = NEWICK_ANCESTRAL - return(species) - ''' Creates a map from node names to children ''' def newick_get_children_map(tree_file): ''' @@ -164,10 +146,12 @@ def main(): command = sys.argv[1] if command == 'species': + # Creates a species -> descendants file from a species tree in_species_tree_file = sys.argv[2] out_species_file = sys.argv[3] newick_create_species_file(in_species_tree_file, out_species_file) elif command == 'unlabel': + # Creates a new file with internal nodes unlabeled in_species_tree_file = sys.argv[2] out_species_tree_file = sys.argv[3] newick_remove_internal_names(in_species_tree_file, out_species_tree_file) diff --git a/scripts/recPhyloXML_statistics.py b/scripts/recPhyloXML_statistics.py index b1d6d8c..1ef9d6a 100755 --- a/scripts/recPhyloXML_statistics.py +++ b/scripts/recPhyloXML_statistics.py @@ -5,12 +5,12 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys -from data_utils import data_family2reconciliation_path +from data_utils import data_index2path from recPhyloXML_utils import ( xml_get_tag, xml_get_prefix, @@ -64,7 +64,7 @@ def parse_clade_recursive(node, result): parse_clade_recursive(root, result) return result -def recPhyloXML_read_events(in_file): +def xml_read_events(in_file): ''' Read a recPhyloXML file and returns a dictionary indexed by species and for each containing a dictionary recording number of genes, @@ -118,20 +118,20 @@ def parse_clade_recursive(node, tag_pref, stats): ''' Collects statistics from a dataset reconciliations file ''' -def collect_statistics(in_reconciliations_file): +def xml_collect_statistics(in_reconciliations_file): ''' input: dataset path family ID -> path to reconciliation file output: - dict(species -> dict(key: value for key in STATS_keys)) - dict(family ID -> dict(species -> dict(key: value for key in STATS_keys))) ''' - family2reconciliation = data_family2reconciliation_path( + family2reconciliation = data_index2path( in_reconciliations_file ) stats_species = {} stats_families = {} for fam_id,reconciliation_path in family2reconciliation.items(): - events = recPhyloXML_read_events(reconciliation_path) + events = xml_read_events(reconciliation_path) stats_families[fam_id] = events for species,stats in stats_families[fam_id].items(): if species not in stats_species.keys(): @@ -151,7 +151,7 @@ def _stats_str(stats, sp, sep=':'): ] ) -def write_statistics_species( +def xml_write_statistics_species( in_statistics_species, out_stats_file_species, sep1=':', sep2='\t', sep3=' ' @@ -165,7 +165,7 @@ def write_statistics_species( ) out_stats_file.write('\n') -def write_statistics_families( +def xml_write_statistics_families( in_statistics_families, out_stats_file_families, sep1=':', sep2='\t', sep3=' ' @@ -199,15 +199,15 @@ def main(): out_stats_file_families = sys.argv[3] # Reading reconciliations and collecting statistics - (statistics_families,statistics_species) = collect_statistics( + (statistics_families,statistics_species) = xml_collect_statistics( in_reconciliations_file ) # Writting species statistics - write_statistics_species( + xml_write_statistics_species( statistics_species, out_stats_file_species ) # Writting families statistics - write_statistics_families( + xml_write_statistics_families( statistics_families, out_stats_file_families ) diff --git a/scripts/recPhyloXML_utils.py b/scripts/recPhyloXML_utils.py index b68d2f7..5a5aae6 100644 --- a/scripts/recPhyloXML_utils.py +++ b/scripts/recPhyloXML_utils.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys diff --git a/src/AGO.py b/src/AGO.py index 7665e15..5cf3110 100644 --- a/src/AGO.py +++ b/src/AGO.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys diff --git a/src/AGO_parameters.py b/src/AGO_parameters.py index c3e5a04..3a7d0f2 100644 --- a/src/AGO_parameters.py +++ b/src/AGO_parameters.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import sys diff --git a/src/AGO_utils.py b/src/AGO_utils.py index cc4205a..8bf19fc 100644 --- a/src/AGO_utils.py +++ b/src/AGO_utils.py @@ -5,7 +5,7 @@ __author__ = "Cedric Chauve" __email__ = "cedric.chauve@sfu.ca" -__version__ = "1.0" +__version__ = "1.0.3" __status__ = "Released" import os @@ -69,7 +69,7 @@ def create_bash_script(parameters, tool): script.write('#!/bin/bash\n\n') bash_modules = parameters.get_slurm_modules(tool) if bash_modules is not None: - script.write(f'\n\nmodule load {bash_modules}') + script.write(f'\n\nmodule load {bash_modules}\n\n') if parameters.check_slurm_array_input(tool): TASK_ID = '${TASK_ID}' array_size = parameters.get_slurm_array_input_len(tool)