Skip to content

Commit

Permalink
Merge pull request #19 from cchauve/code_cleaning
Browse files Browse the repository at this point in the history
Code cleaning
  • Loading branch information
cchauve authored Apr 29, 2023
2 parents ee815c7 + 4fd28e3 commit 8d54534
Show file tree
Hide file tree
Showing 19 changed files with 275 additions and 250 deletions.
21 changes: 15 additions & 6 deletions scripts/ALE_reformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

__author__ = "Cedric Chauve"
__email__ = "cedric.chauve@sfu.ca"
__version__ = "1.0"
__version__ = "1.0.3"
__status__ = "Released"

import sys
Expand Down Expand Up @@ -38,16 +38,25 @@ def main():
in_rec_xml_file = sys.argv[3]
out_rec_xml_file = sys.argv[4]

# Creates a map from ALE species names to original species names
species_map = newick_create_species_map(in_ale_species_tree, in_data_species_tree)
# Read an ALE recPhyloXML file
tree = ET.parse(in_rec_xml_file)
# If the file includes an HGT, we do not create a reformatted file
if not xml_check_transfer(tree, xml_ALE_identify_transfer):
# Rename species
xml_rename_species(tree, species_map)
# Reformat losses
xml_rename_losses(tree, xml_ALE_identify_loss, 'loss')
_ = xml_rename_ancestral_genes(tree, xml_ALE_identify_ancestral_gene, start_id=1)
out_rec_xml_file_tmp = f'{out_rec_xml_file}_tmp'
tree.write(out_rec_xml_file_tmp)
_ = xml_reformat_file(out_rec_xml_file_tmp, out_rec_xml_file)
os.remove(out_rec_xml_file_tmp)
# Reformat ancestral gene names with integer IDs starting at 1
_ = xml_rename_ancestral_genes(tree, xml_ALE_identify_ancestral_gene, start_id=1)
# Create a temporary recPhyloXML file
tmp_rec_file = f'{out_rec_xml_file}_tmp'
tree.write(tmp_rec_file)
# Reformat the temporary recPhyloXML file into the final recPhyloXML file
_ = xml_reformat_file(tmp_rec_file, out_rec_xml_file)
# Delete the temporary recPhyloXML file
os.remove(tmp_rec_file)

if __name__ == "__main__":
main()
Expand Down
59 changes: 30 additions & 29 deletions scripts/DeCoSTAR_create_input_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,34 @@

__author__ = "Cedric Chauve"
__email__ = "cedric.chauve@sfu.ca"
__version__ = "1.0"
__version__ = "1.0.3"
__status__ = "Released"

import sys
from data_utils import (
data_gene2family,
data_species2gene_order_path,
data_family2reconciliation_path,
data_index2path,
data_read_gene_order_file
)

''' Reads a gene order file and returns an ordered list of adjacency strings '''
def decostar_gene_order2adjacencies_str(
in_gene_order_file, in_gene2family_map, in_reconciled_families
in_gene_order_file, in_gene2family_map, in_families
):
'''
input:
- path to gene order file
- map of gene to family ID
- list of reconciled families to consider
(genes from other families are not accounted for)
- list of families to consider (genes from other families are not accounted for)
output: list(str)
'''
orientation = {'1': '+', '0': '-'}
adjacencies = []
gene_order = data_read_gene_order_file(in_gene_order_file)
in_gene2family_keys = list(in_gene2family_map.keys())
prev_gene = None
for (gene_name,gene_chr,_,_,gene_sign) in gene_order:
if in_gene2family_map[gene_name] in in_reconciled_families:
if gene_name in in_gene2family_keys and in_gene2family_map[gene_name] in in_families:
gene_orientation = orientation[gene_sign]
if prev_gene is not None and prev_gene[1] == gene_chr:
adjacency = [
Expand All @@ -45,46 +44,44 @@ def decostar_gene_order2adjacencies_str(
return adjacencies

'''
Creates the DeCoSTAR input gene trees distribution file
Compute a list of families for which the reconciled gene tree is available
Creates the DeCoSTAR input (reconciled) gene trees distribution file
Compute a list of families for which the (reconciled) gene tree is provided
'''
def create_gene_distribution_file(in_reconciliations_file, out_trees_file):
def decostar_create_gene_distribution_file(in_trees_file, out_trees_file):
'''
input:
- dataset file with link from family to reconciled gene tree
- dataset file family<TAB>(reconciled) gene trees
output:
- creates out_trees_file
- list(str) of families for which a reonciliation is available
- list(str) of families for which a (reconciled) gene tree is available
'''
family2reconciliation_path = data_family2reconciliation_path(
in_reconciliations_file
)
reconciled_families = []
with open(out_trees_file, 'w') as trees:
for fam_id,reconciliation_file in family2reconciliation_path.items():
trees.write(f'{reconciliation_file}\n')
reconciled_families.append(fam_id)
return reconciled_families
family2trees_path = data_index2path(in_trees_file)
families = []
with open(out_trees_file, 'w') as out_trees:
for fam_id,trees_file in family2trees_path.items():
out_trees.write(f'{trees_file}\n')
families.append(fam_id)
return families

''' Creates the DeCoSTAR input adjacencies file '''
def create_adjacencies_file(
in_gene_orders_file, in_families_file, in_reconciled_families,
def decostar_create_adjacencies_file(
in_gene_orders_file, in_families_file, in_trees_families,
out_adjacencies_file
):
'''
input:
- dataset file with link from species to gene order file
- dataset families file
- list of reconciled families
- list of families with a (reconciled) tree provided as input
output:
creates out_adjacencies_file
'''
gene_order_files= data_species2gene_order_path(in_gene_orders_file)
gene_order_files= data_index2path(in_gene_orders_file)
gene2family_map = data_gene2family(in_families_file)
with open(out_adjacencies_file, 'w') as out_adjacencies:
for species,gene_order_file in gene_order_files.items():
species_adjacencies = decostar_gene_order2adjacencies_str(
gene_order_file, gene2family_map, in_reconciled_families
gene_order_file, gene2family_map, in_trees_families
)
for adjacency in species_adjacencies:
out_adjacencies.write(f'{adjacency}\n')
Expand All @@ -93,13 +90,17 @@ def create_adjacencies_file(

def main():
in_gene_orders_file = sys.argv[1]
in_reconciliations_file = sys.argv[2]
in_trees_file = sys.argv[2]
in_families_file = sys.argv[3]
out_adjacencies_file = sys.argv[4]
out_trees_file = sys.argv[5]

reconciled_families = create_gene_distribution_file(in_reconciliations_file, out_trees_file)
create_adjacencies_file(in_gene_orders_file, in_families_file, reconciled_families, out_adjacencies_file)
# Creates the gene trees distribution file (reconciliations or samples of gene trees)
trees_families = decostar_create_gene_distribution_file(in_trees_file, out_trees_file)
# Creates the adjacencies file
decostar_create_adjacencies_file(
in_gene_orders_file, in_families_file, trees_families, out_adjacencies_file
)

if __name__ == "__main__":
main()
17 changes: 11 additions & 6 deletions scripts/DeCoSTAR_ecceTERA_reformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@

__author__ = "Cedric Chauve"
__email__ = "cedric.chauve@sfu.ca"
__version__ = "1.0"
__version__ = "1.0.3"
__status__ = "Released"

import sys
import os
import xml.etree.ElementTree as ET


from recPhyloXML_utils import xml_rename_species

'''
Expand All @@ -25,11 +24,11 @@ def eccetera_read_results(in_genes_file1, in_genes_file2, sep='|'):
input:
- original and reformatted genes file
output:
- dict(ecceTERA species -> data species)
- dict(ecceTERA family -> data family)
- dict(ecceTERA species -> original species)
- dict(ecceTERA family -> original family)
'''
species_dict,families_dict = {},{}
# Reading original genes
# Reading original DeCoSTAR genes
data,data_idx = {},0
with open(in_genes_file1, 'r') as in_genes:
for gene_data in in_genes.readlines():
Expand Down Expand Up @@ -117,9 +116,9 @@ def eccetera_write_reconciliations(in_reconciliations_files, species_map, out_di
out_reconciliation_file = os.path.join(out_dir, f'{family_id}{out_xml_ext}')
tree.write(out_reconciliation_file)
out_file.write(f'{family_id}\t{out_reconciliation_file}\n')
os.remove(tmp_rec_file)



def main():
in_genes_file1 = sys.argv[1] # DeCoSTAR genes file
in_genes_file2 = sys.argv[2] # Reformatted genes file
Expand All @@ -129,11 +128,17 @@ def main():
out_xml_ext = sys.argv[6] # Extension to recPhyloXML files
out_reconciliations_file = sys.argv[7] # Data set reconciliations file

# Creates a map from DeCoSTAR species names to original species names
species_map,families_map = eccetera_read_results(in_genes_file1, in_genes_file2)
# Read the species tree in XML format created by DeCoSTAR
xml_sp_str = eccetera_read_species_tree(in_sp_xml_file)
# Creates one temporary recPhyloXML file per family extracted from the
# DeCoSTAR reconciliations file
tmp_reconciliations_files = eccetera_read_reconciliations(
in_reconciliations_file, xml_sp_str, families_map, out_dir
)
# Reformats the temporary recPhyloXML files and writes a reconciliations file
# (<family><TAB><path to reconciliation file>)
eccetera_write_reconciliations(
tmp_reconciliations_files, species_map, out_dir, out_xml_ext, out_reconciliations_file
)
Expand Down
9 changes: 6 additions & 3 deletions scripts/DeCoSTAR_reformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

__author__ = "Cedric Chauve"
__email__ = "cedric.chauve@sfu.ca"
__version__ = "1.0"
__version__ = "1.0.3"
__status__ = "Released"

import sys
Expand All @@ -16,7 +16,7 @@
data_create_equivalence_map,
data_species_map,
data_gene2family,
data_reconciliation_path2family
data_path2index
)
from recPhyloXML_utils import (
xml_get_gene_tree_root,
Expand Down Expand Up @@ -71,7 +71,7 @@ def decostar_family_map(
output:
dict(str->str) from DeCoSTAR family ID to original family ID
'''
reconciliation2family = data_reconciliation_path2family(
reconciliation2family = data_path2index(
in_input_file
)
family_map = {}
Expand Down Expand Up @@ -326,16 +326,19 @@ def main():
out_genes_file = sys.argv[9]
out_adjacencies_dir = sys.argv[10]

# Creates a map from DeCoSTAR species names to original species names
species_map = decostar_species_map(
in_species_file, in_decostar_species_file
)
# Reformat the genes file created by DeCoSTAR to rename genes, families and species
genes_map = decostar_reformat_genes(
species_map,
in_families_file, in_reconciliations_file,
in_gene_trees_file, in_genes_file,
out_genes_file,
already_reconciled = in_already_reconciled
)
# Reformat the adjacencies file created by DeCoSTAR to rename genes, families and species
decostar_reformat_adjacencies_file(
species_map, genes_map, in_adjacencies_file,
out_adjacencies_dir
Expand Down
6 changes: 3 additions & 3 deletions scripts/DeCoSTAR_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

__author__ = "Cedric Chauve"
__email__ = "cedric.chauve@sfu.ca"
__version__ = "1.0"
__version__ = "1.0.3"
__status__ = "Released"

import sys

from data_utils import (
data_species_list,
data_species2adjacencies_path
data_index2path
)
from DeCoSTAR_reformat import decostar_read_adjacencies

Expand Down Expand Up @@ -60,7 +60,7 @@ def decostar_read_results(in_genes_file, in_adjacencies_file, in_species_list):
for (species,gene) in species_gene:
adjacencies_dicts[species][gene] = {'h': [], 't': []}
## Populate dictionaries from species to adjacency tabulated file
species2adjacencies = data_species2adjacencies_path(in_adjacencies_file)
species2adjacencies = data_index2path(in_adjacencies_file)
for species,in_adjacencies_file in species2adjacencies.items():
in_adjacencies = decostar_read_adjacencies(
in_adjacencies_file, species=species
Expand Down
12 changes: 5 additions & 7 deletions scripts/GeneRax_create_input_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

__author__ = "Cedric Chauve"
__email__ = "cedric.chauve@sfu.ca"
__version__ = "1.0"
__version__ = "1.0.3"
__status__ = "Released"

import sys
Expand All @@ -14,11 +14,11 @@
from data_utils import (
data_family2genes,
data_gene2species,
data_family2alignment_path
data_index2path
)

''' Write the GeneRax families file from paths to alignments '''
def GeneRax_write_families_file(
def generax_write_families_file(
in_family2genes, in_gene2species, in_family2alignment,
in_subst_model,
out_map_files_dir, out_families_file
Expand Down Expand Up @@ -58,11 +58,9 @@ def main():
# Read all genes
gene2species = data_gene2species(in_gene_orders_file)
# Read aligned families
family2alignment = data_family2alignment_path(
in_alignments_file, in_suffix
)
family2alignment = data_index2path(in_alignments_file)
# Create GeneRax input files
GeneRax_write_families_file(
generax_write_families_file(
family2genes, gene2species, family2alignment, in_subst_model,
out_map_files_dir, out_families_file
)
Expand Down
Loading

0 comments on commit 8d54534

Please sign in to comment.