-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequencer.py
114 lines (107 loc) · 2.78 KB
/
frequencer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import argparse
from Bio import SeqIO
import json
parser = argparse.ArgumentParser(description='to create codon bias dictionary from reference nucleotide itself')
parser.add_argument('-ref', '--reference', type=str, help='reference file in nucleotide fasta')
parser.add_argument('-out', '--output', type=str, help='output file in dict.py')
args = parser.parse_args()
out = args.output
ref = args.reference
# Create a dictionary to store the codon frequencies
codon_prop = {}
codon_table = {
"TTT": "F",
"TTC": "F",
"TTA": "L",
"TTG": "L",
"CTT": "L",
"CTC": "L",
"CTA": "L",
"CTG": "L",
"ATT": "I",
"ATC": "I",
"ATA": "I",
"ATG": "M",
"GTT": "V",
"GTC": "V",
"GTA": "V",
"GTG": "V",
"TCT": "S",
"TCC": "S",
"TCA": "S",
"TCG": "S",
"CCT": "P",
"CCC": "P",
"CCA": "P",
"CCG": "P",
"ACT": "T",
"ACC": "T",
"ACA": "T",
"ACG": "T",
"GCT": "A",
"GCC": "A",
"GCA": "A",
"GCG": "A",
"TAT": "Y",
"TAC": "Y",
"TAA": "*",
"TAG": "*",
"CAT": "H",
"CAC": "H",
"CAA": "Q",
"CAG": "Q",
"AAT": "N",
"AAC": "N",
"AAA": "K",
"AAG": "K",
"GAT": "D",
"GAC": "D",
"GAA": "E",
"GAG": "E",
"TGT": "C",
"TGC": "C",
"TGA": "*",
"TGG": "W",
"CGT": "R",
"CGC": "R",
"CGA": "R",
"CGG": "R",
"AGT": "S",
"AGC": "S",
"AGA": "R",
"AGG": "R",
"GGT": "G",
"GGC": "G",
"GGA": "G",
"GGG": "G"
}
codon_freq = {aa: {} for aa in set(codon_table.values())}
# Loop over the sequence, counting the occurrence of each codon
reference = []
with open(ref, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
nucleotide = record.seq
nucleotide = nucleotide.replace("\n", "")
nucleotide = str(nucleotide)
reference.append(nucleotide)
for seq in reference:
for i in range(0, len(seq), 3):
codon = seq[i:i + 3]
if codon in codon_prop:
codon_prop[codon] += 1
else:
codon_prop[codon] = 1
# Calculate the total number of codons
total_codons = sum(codon_prop.values())
# Normalize the counts to obtain the frequencies
for codon in codon_prop:
codon_prop[codon] /= total_codons
codon_freq[codon_table[codon]][codon] = float("{:.3f}".format(codon_prop[codon]))
for aa in codon_freq:
total_aa_freq = sum(codon_freq[aa].values())
for codon in codon_freq[aa]:
codon_freq[aa][codon] = float("{:.3f}".format(codon_freq[aa][codon] / total_aa_freq))
# Print the resulting dictionary and saving it in a file
with open(out, 'w') as output_file:
output_file.write(json.dumps(codon_freq))
print(codon_freq)