Merge pull request #19 from cchauve/code_cleaning

Code cleaning
cchauve · Apr 29, 2023 · 8d54534 · 8d54534
2 parents ee815c7 + 4fd28e3
commit 8d54534
Show file tree

Hide file tree

Showing 19 changed files with 275 additions and 250 deletions.
diff --git a/scripts/ALE_reformat.py b/scripts/ALE_reformat.py
@@ -5,7 +5,7 @@
 
 __author__    = "Cedric Chauve"
 __email__     = "cedric.chauve@sfu.ca"
-__version__   = "1.0"
+__version__   = "1.0.3"
 __status__    = "Released"
 
 import sys
@@ -38,16 +38,25 @@ def main():
     in_rec_xml_file = sys.argv[3]
     out_rec_xml_file = sys.argv[4]
 
+    # Creates a map from ALE species names to original species names
     species_map = newick_create_species_map(in_ale_species_tree, in_data_species_tree)
+    # Read an ALE recPhyloXML file
     tree = ET.parse(in_rec_xml_file)
+    # If the file includes an HGT, we do not create a reformatted file
     if not xml_check_transfer(tree, xml_ALE_identify_transfer):
+        # Rename species
         xml_rename_species(tree, species_map)
+        # Reformat losses
         xml_rename_losses(tree, xml_ALE_identify_loss, 'loss')
-        _ = xml_rename_ancestral_genes(tree, xml_ALE_identify_ancestral_gene, start_id=1)    
-        out_rec_xml_file_tmp = f'{out_rec_xml_file}_tmp'
-        tree.write(out_rec_xml_file_tmp)
-        _ = xml_reformat_file(out_rec_xml_file_tmp, out_rec_xml_file)
-        os.remove(out_rec_xml_file_tmp)
+        # Reformat ancestral gene names with integer IDs starting at 1
+        _ = xml_rename_ancestral_genes(tree, xml_ALE_identify_ancestral_gene, start_id=1)
+        # Create a temporary recPhyloXML file
+        tmp_rec_file = f'{out_rec_xml_file}_tmp'
+        tree.write(tmp_rec_file)
+        # Reformat the temporary recPhyloXML file into the final recPhyloXML file
+        _ = xml_reformat_file(tmp_rec_file, out_rec_xml_file)
+        # Delete the temporary recPhyloXML file
+        os.remove(tmp_rec_file)
 
 if __name__ == "__main__":
     main()

diff --git a/scripts/DeCoSTAR_create_input_files.py b/scripts/DeCoSTAR_create_input_files.py
@@ -5,35 +5,34 @@
 
 __author__    = "Cedric Chauve"
 __email__     = "cedric.chauve@sfu.ca"
-__version__   = "1.0"
+__version__   = "1.0.3"
 __status__    = "Released"
 
 import sys
 from data_utils import (
     data_gene2family,
-    data_species2gene_order_path,
-    data_family2reconciliation_path,
+    data_index2path,
     data_read_gene_order_file
 )
 
 ''' Read a gene order file and returns an order list of adjacencies strings '''
 def decostar_gene_order2adjacencies_str(
-        in_gene_order_file, in_gene2family_map, in_reconciled_families
+        in_gene_order_file, in_gene2family_map, in_families
 ):
     '''
     input: 
     - path to gene order file
     - map of gene to family ID
-    - list of reconciled families to consider 
-    (genes from other families are not accounted for)
+    - list of families to consider (genes from other families are not accounted for)
     output: list(str)
     '''
     orientation = {'1': '+', '0': '-'}
     adjacencies = []
     gene_order = data_read_gene_order_file(in_gene_order_file)
+    in_gene2family_keys = list(in_gene2family_map.keys())
     prev_gene = None
     for (gene_name,gene_chr,_,_,gene_sign) in gene_order:
-        if in_gene2family_map[gene_name] in in_reconciled_families:
+        if gene_name in in_gene2family_keys and in_gene2family_map[gene_name] in in_families:
             gene_orientation = orientation[gene_sign]
             if prev_gene is not None and prev_gene[1] == gene_chr:
                 adjacency = [
@@ -45,46 +44,44 @@ def decostar_gene_order2adjacencies_str(
     return adjacencies
 
 ''' 
-Creates the DeCoSTAR input gene trees distribution file 
-Compute a list of families for which the reconciled gene tree is available
+Creates the DeCoSTAR input (reconciled) gene trees distribution file 
+Compute a list of families for which the (reconciled) gene tree is provided
 '''
-def create_gene_distribution_file(in_reconciliations_file, out_trees_file):
+def decostar_create_gene_distribution_file(in_trees_file, out_trees_file):
     '''
     input: 
-    - dataset file with link from family to reconciled gene tree
+    - dataset file family<TAB>(reconciled) gene trees
     output: 
     - creates out_trees_file
-    - list(str) of families for which a reonciliation is available
+    - list(str) of families for which a (reconciled) gene tree is available
     '''
-    family2reconciliation_path = data_family2reconciliation_path(
-        in_reconciliations_file
-    )
-    reconciled_families = []
-    with open(out_trees_file, 'w') as trees:
-        for fam_id,reconciliation_file in family2reconciliation_path.items():
-            trees.write(f'{reconciliation_file}\n')
-            reconciled_families.append(fam_id)
-    return reconciled_families
+    family2trees_path = data_index2path(in_trees_file)
+    families = []
+    with open(out_trees_file, 'w') as out_trees:
+        for fam_id,trees_file in family2trees_path.items():
+            out_trees.write(f'{trees_file}\n')
+            families.append(fam_id)
+    return families
 
 ''' Creates the DeCoSTAR input adjacencies file '''
-def create_adjacencies_file(
-        in_gene_orders_file, in_families_file, in_reconciled_families,
+def decostar_create_adjacencies_file(
+        in_gene_orders_file, in_families_file, in_trees_families,
         out_adjacencies_file
 ):
     '''
     input: 
     - dataset file with link from species to gene order file
     - dataset families file
-    - list of reconciled families
+    - list of families with a (reconciled) tree provided as input
     output: 
     creates out_adjacencies_file
     '''
-    gene_order_files= data_species2gene_order_path(in_gene_orders_file)
+    gene_order_files= data_index2path(in_gene_orders_file)
     gene2family_map = data_gene2family(in_families_file)
     with open(out_adjacencies_file, 'w') as out_adjacencies:
         for species,gene_order_file in gene_order_files.items():
             species_adjacencies = decostar_gene_order2adjacencies_str(
-                gene_order_file, gene2family_map, in_reconciled_families            
+                gene_order_file, gene2family_map, in_trees_families            
             )
             for adjacency in species_adjacencies:
                 out_adjacencies.write(f'{adjacency}\n')
@@ -93,13 +90,17 @@ def create_adjacencies_file(
 
 def main():
     in_gene_orders_file = sys.argv[1]
-    in_reconciliations_file = sys.argv[2]
+    in_trees_file = sys.argv[2]
     in_families_file = sys.argv[3]
     out_adjacencies_file = sys.argv[4]
     out_trees_file = sys.argv[5]
 
-    reconciled_families = create_gene_distribution_file(in_reconciliations_file, out_trees_file)
-    create_adjacencies_file(in_gene_orders_file, in_families_file, reconciled_families, out_adjacencies_file)
+    # Creates the gene trees distribution file (reconciliations or samples of gene trees)
+    trees_families = decostar_create_gene_distribution_file(in_trees_file, out_trees_file)
+    # Creates the adjacencies file
+    decostar_create_adjacencies_file(
+        in_gene_orders_file, in_families_file, trees_families, out_adjacencies_file
+    )
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/DeCoSTAR_ecceTERA_reformat.py b/scripts/DeCoSTAR_ecceTERA_reformat.py
@@ -5,14 +5,13 @@
 
 __author__    = "Cedric Chauve"
 __email__     = "cedric.chauve@sfu.ca"
-__version__   = "1.0"
+__version__   = "1.0.3"
 __status__    = "Released"
 
 import sys
 import os
 import xml.etree.ElementTree as ET
 
-
 from recPhyloXML_utils import xml_rename_species
 
 '''
@@ -25,11 +24,11 @@ def eccetera_read_results(in_genes_file1, in_genes_file2, sep='|'):
     input:
     - original and reformatted genes file
     output:
-    - dict(ecceTERA species -> data species)
-    - dict(ecceTERA family -> data family)
+    - dict(ecceTERA species -> original species)
+    - dict(ecceTERA family -> original family)
     '''
     species_dict,families_dict = {},{}
-    # Reading original genes
+    # Reading original DeCoSTAR genes
     data,data_idx = {},0
     with open(in_genes_file1, 'r') as in_genes:
         for gene_data in in_genes.readlines():
@@ -117,9 +116,9 @@ def eccetera_write_reconciliations(in_reconciliations_files, species_map, out_di
          out_reconciliation_file = os.path.join(out_dir, f'{family_id}{out_xml_ext}')
          tree.write(out_reconciliation_file)
          out_file.write(f'{family_id}\t{out_reconciliation_file}\n')
+         os.remove(tmp_rec_file)
 
 
-
 def main():
     in_genes_file1 = sys.argv[1] # DeCoSTAR genes file
     in_genes_file2 = sys.argv[2] # Reformatted genes file
@@ -129,11 +128,17 @@ def main():
     out_xml_ext = sys.argv[6] # Extension to recPhyloXML files
     out_reconciliations_file = sys.argv[7] # Data set reconciliations file
 
+    # Creates a map from DeCoSTAR species names to original species names
     species_map,families_map = eccetera_read_results(in_genes_file1, in_genes_file2)
+    # Read the species tree in XML format created by DeCoSTAR
     xml_sp_str = eccetera_read_species_tree(in_sp_xml_file)
+    # Creates one temporary recPhyloXML file per family extracted from the
+    # DeCoSTAR reconciliations file
     tmp_reconciliations_files = eccetera_read_reconciliations(
         in_reconciliations_file, xml_sp_str, families_map, out_dir
     )
+    # Reformat the temporary recPhyloXML files and write a reconciliations file
+    # (<family><TAB><path to reconciliation file>)
     eccetera_write_reconciliations(
         tmp_reconciliations_files, species_map, out_dir, out_xml_ext, out_reconciliations_file
     )

diff --git a/scripts/DeCoSTAR_reformat.py b/scripts/DeCoSTAR_reformat.py
@@ -5,7 +5,7 @@
 
 __author__    = "Cedric Chauve"
 __email__     = "cedric.chauve@sfu.ca"
-__version__   = "1.0"
+__version__   = "1.0.3"
 __status__    = "Released"
 
 import sys
@@ -16,7 +16,7 @@
     data_create_equivalence_map,
     data_species_map,
     data_gene2family,
-    data_reconciliation_path2family
+    data_path2index
 )
 from recPhyloXML_utils import (
     xml_get_gene_tree_root,
@@ -71,7 +71,7 @@ def decostar_family_map(
     output:
     dict(str->str) from DeCoSTAR family ID to original family ID
     '''
-    reconciliation2family = data_reconciliation_path2family(
+    reconciliation2family = data_path2index(
         in_input_file
     )
     family_map = {}
@@ -326,16 +326,19 @@ def main():
     out_genes_file = sys.argv[9]
     out_adjacencies_dir = sys.argv[10]    
 
+    # Creates a map from DeCoSTAR species names to original species names
     species_map = decostar_species_map(
         in_species_file, in_decostar_species_file
     )
+    # Reformat the genes file created by DeCoSTAR to rename genes, families and species
     genes_map = decostar_reformat_genes(
         species_map,
         in_families_file, in_reconciliations_file,
         in_gene_trees_file, in_genes_file,
         out_genes_file,
         already_reconciled = in_already_reconciled
     )
+    # Reformat the adjacencies file created by DeCoSTAR to rename genes, families and species
     decostar_reformat_adjacencies_file(
         species_map, genes_map, in_adjacencies_file,
         out_adjacencies_dir

diff --git a/scripts/DeCoSTAR_statistics.py b/scripts/DeCoSTAR_statistics.py
@@ -5,14 +5,14 @@
 
 __author__    = "Cedric Chauve"
 __email__     = "cedric.chauve@sfu.ca"
-__version__   = "1.0"
+__version__   = "1.0.3"
 __status__    = "Released"
 
 import sys
 
 from data_utils import (
     data_species_list,
-    data_species2adjacencies_path
+    data_index2path
 )
 from DeCoSTAR_reformat import decostar_read_adjacencies
 
@@ -60,7 +60,7 @@ def decostar_read_results(in_genes_file, in_adjacencies_file, in_species_list):
     for (species,gene) in species_gene:
         adjacencies_dicts[species][gene] = {'h': [], 't': []}
     ## Populate dictionaries from species to adjacency tabulated file
-    species2adjacencies = data_species2adjacencies_path(in_adjacencies_file)
+    species2adjacencies = data_index2path(in_adjacencies_file)
     for species,in_adjacencies_file in species2adjacencies.items():
         in_adjacencies = decostar_read_adjacencies(
             in_adjacencies_file, species=species

diff --git a/scripts/GeneRax_create_input_files.py b/scripts/GeneRax_create_input_files.py
@@ -5,7 +5,7 @@
 
 __author__    = "Cedric Chauve"
 __email__     = "cedric.chauve@sfu.ca"
-__version__   = "1.0"
+__version__   = "1.0.3"
 __status__    = "Released"
 
 import sys
@@ -14,11 +14,11 @@
 from data_utils import (
     data_family2genes,
     data_gene2species,
-    data_family2alignment_path
+    data_index2path
 )
 
 ''' Write the GeneRax families file from paths to alignments '''
-def GeneRax_write_families_file(
+def generax_write_families_file(
         in_family2genes, in_gene2species, in_family2alignment,
         in_subst_model,
         out_map_files_dir, out_families_file
@@ -58,11 +58,9 @@ def main():
     # Read all genes
     gene2species = data_gene2species(in_gene_orders_file)
     # Read alignmed families
-    family2alignment = data_family2alignment_path(
-        in_alignments_file, in_suffix
-    )
+    family2alignment = data_index2path(in_alignments_file)
     # Create GeneRax input files
-    GeneRax_write_families_file(
+    generax_write_families_file(
         family2genes, gene2species, family2alignment, in_subst_model,
         out_map_files_dir, out_families_file
     )