-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbacktranslate.py
executable file
·110 lines (101 loc) · 3.89 KB
/
backtranslate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import subprocess
import argparse
# Parsing arguments
parser = argparse.ArgumentParser(
prog="backtranslate.py",
description="Takes a Nucleotide Fasta as input, translates into protein, performs protein alignment with MAFFT and returns the nucleotide alignment.",
epilog="",
)
parser.add_argument(
"--stop_codon",
help="Add this flag if there are stop codons in the CDS sequences.",
action="store_true",
default=False,
)
parser.add_argument("input_fasta", type=str)
parser.add_argument(
"-p",
"--prefix",
type=str,
default="out",
help="Name of output file. Default value: out",
)
args = parser.parse_args()
def translate_dna(dna_seq):
protein_seq = dna_seq.translate(to_stop=True)
return protein_seq
def backtranslate_protein(protein_alg, nucleotide_fasta):
prot_char_dict = {}
# Reading protein alignment
for record in SeqIO.parse(protein_alg, "fasta"):
char_array = np.array(list(str(record.seq)), dtype="U1")
prot_char_dict[record.description] = char_array
# Reading nucleotides
nucl_char_dict = {}
for record in SeqIO.parse(nucleotide_fasta, "fasta"):
char_array = np.array(
["".join(record[i: i + 3]) for i in range(0, len(record), 3)], dtype="U3"
)
nucl_char_dict[record.description] = char_array
# insert '---' elements in nucl array at the same indices
for fasta_entry in nucl_char_dict.keys():
# find the indices where prot array has '-'
insert_indices = np.where(prot_char_dict[fasta_entry] == "-")[0]
for index in insert_indices:
nucl_char_dict[fasta_entry] = np.insert(
nucl_char_dict[fasta_entry], index, ["---"]
)
# Verify that each triplet encodes for the correct amino acid
for i, (codon, aa) in enumerate(
zip(nucl_char_dict[fasta_entry], prot_char_dict[fasta_entry])
):
if Seq(codon).translate() != aa:
sys.exit(
f"Error: codon {i//3} ({codon}) in nucl array does not encode for {prot_char_dict[fasta_entry][i]} amino acid in prot array. Exiting."
)
if args.stop_codon == True:
nucl_char_dict[fasta_entry] = "".join(nucl_char_dict[fasta_entry])[:-3]
else:
nucl_char_dict[fasta_entry] = "".join(nucl_char_dict[fasta_entry])
backtranslated_dna_seq = [
SeqRecord(Seq(seq), id=head, description="")
for head, seq in nucl_char_dict.items()
]
return backtranslated_dna_seq
def main():
# Making a protein FASTA from the nucleotide FASTA
dna_sequences = list(SeqIO.parse(args.input_fasta, "fasta"))
protein_sequences = []
for dna_seq in dna_sequences:
protein_seq = translate_dna(dna_seq.seq)
protein_record = SeqRecord(
protein_seq, id=dna_seq.id, description=dna_seq.description
)
protein_sequences.append(protein_record)
SeqIO.write(protein_sequences, f"{args.prefix}_protein_sequences.fasta", "fasta")
# Running protein alignment
subprocess.run(
["mafft-linsi", f"{args.prefix}_protein_sequences.fasta"],
stdout=open(f"{args.prefix}_protein_alignment.fasta", "w"),
)
# Backtranslating protein alignment
dna_alignment = backtranslate_protein(
f"{args.prefix}_protein_alignment.fasta", args.input_fasta
)
# Write output
SeqIO.write(dna_alignment, args.prefix + "_bck_tr.fa", "fasta")
# Clean intermediate files
if os.path.exists(f"{args.prefix}_protein_sequences.fasta"):
os.remove(f"{args.prefix}_protein_sequences.fasta")
if os.path.exists(f"{args.prefix}_protein_alignment.fasta"):
os.remove(f"{args.prefix}_protein_alignment.fasta")
if __name__ == "__main__":
main()