-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_fasta_from_ids.py
64 lines (53 loc) · 1.91 KB
/
get_fasta_from_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
Takes a file with DESeq results as input and writes a multi-FASTA file
containing the corresponding sequences.
Positional arguments: (1) the DESeq results file, (2) the output FASTA file,
(3) the multi-FASTA file with transcripts.
"""
import sys
def get_first_column(input_file):
    """
    Reads a file of whitespace-separated values and returns its first column.

    Fields are split on any run of whitespace (tabs or spaces). Blank and
    whitespace-only lines carry no first column and are skipped.

    :param input_file: path of the text file to read
    :return: list with the first field of every non-blank line, in file order
    """
    out_list = []
    # Use a separate name for the handle so the path parameter is not rebound.
    with open(input_file, 'r') as handle:
        for line in handle:
            fields = line.split()
            # An empty field list means a blank line; indexing it would
            # raise IndexError, so such lines are ignored.
            if fields:
                out_list.append(fields[0])
    return out_list
def fasta_filter(fastafile, outfile, list=0):
    """
    Copies the records of a FASTA file whose IDs are in ``list`` to ``outfile``.

    A record's ID is the text between the leading '>' and the first
    whitespace character of its header line. A header whose '>' is
    immediately followed by whitespace (or by nothing) has an empty ID.
    For every selected record the header is written as '>' + ID only (any
    description is dropped), followed by its sequence lines stripped of
    surrounding whitespace. Every written line ends with '\\r\\n'.

    :param fastafile: .fasta file to be parsed
    :param outfile: path of the file to write; it is created (or truncated)
        even when no record is selected
    :param list: collection of IDs to find in the given .fasta file. With the
        default (or any empty value) no record is selected
    :return: Returns nothing. Writes the selected records to ``outfile``
    """
    # The parameter name shadows the builtin but is part of the keyword
    # interface (callers pass list=...), so it is kept as is.
    # Build the lookup set once: membership is then O(1) per header.
    wanted = set(list) if list else set()
    keep = False  # True while inside a record whose ID was requested
    with open(fastafile, 'r') as fasta, open(outfile, 'w') as out:
        # Plain iteration copes with an empty input file, and the context
        # managers close both handles even if an error occurs.
        for line in fasta:
            if line.startswith('>'):
                header = line[1:]
                # Splitting on any whitespace also removes the trailing
                # newline, so headers without a description still match.
                current_id = header.split()[0] if header[:1].strip() else ''
                keep = current_id in wanted
                if keep:
                    out.write('>' + current_id + '\r\n')
            elif keep:
                out.write(line.strip() + '\r\n')
def ids2multifasta(input_file, output_file, transcripts):
    """
    Writes to ``output_file`` the records of ``transcripts`` whose IDs appear
    in the first column of ``input_file``.

    :param input_file: file whose first column holds the IDs to select
    :param output_file: path of the FASTA file to write
    :param transcripts: multi-FASTA file to take the sequences from
    :return: Returns nothing
    """
    fasta_filter(transcripts, output_file, get_first_column(input_file))
if __name__ == '__main__':
    # Three positional arguments are required; fail with a usage message
    # rather than a bare IndexError traceback when any is missing.
    if len(sys.argv) < 4:
        sys.exit('usage: ' + sys.argv[0] +
                 ' <deseq_results> <output_fasta> <transcripts_fasta>')
    infile = sys.argv[1]
    outfile = sys.argv[2]
    transcripts = sys.argv[3]
    # Echo the resolved arguments so the run can be checked at a glance.
    print(infile, outfile, transcripts)
    ids2multifasta(infile, outfile, transcripts)