-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextractITS.py
116 lines (95 loc) · 4.09 KB
/
extractITS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
import argparse
import subprocess
import sys
import pandas as pd
def get_params(argv):
parser = argparse.ArgumentParser(description='Extract ITS1 from fungal genome rapidly')
parser.add_argument('-i', '--i', help="input genome file", required=True)
parser.add_argument('-o', '--o', help="output directory", required=True)
parser.add_argument('-which', '--which', help="Which ITS sequence to extract (ITS1|ITS2) default=ITS1", default='ITS1')
parser.add_argument('-cpu', '--cpu', help="number of threads/cores to use", required=False, default='48')
parser.add_argument('-name', '--name', help="name", required=False, default='genome')
a = parser.parse_args()
return a
def terminal(cmd):
p = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
p.wait()
if __name__ == '__main__':
a = get_params(sys.argv[1:])
if a.o[-1]!='/':
a.o=a.o+'/'
if a.name=='genome':
name=a.i.split('/')[-1].split('.')[0]
else:
name=a.name
if a.which not in ['ITS1','ITS2']:
print("ERROR, argument --which not recognized. Please choose between 'ITS1' and 'ITS2'")
## Running barrnap
terminal('barrnap --kingdom euk --threads '+a.cpu+' '+a.i+' > '+a.o+'barrnap.tmp')
## Parsing and getting rDNA gene cluster coordinates
gff=pd.read_csv(a.o+'/barrnap.tmp', header=None,comment='#',sep='\t')
gff=gff[gff[8].str.contains('18S') | gff[8].str.contains('28S')].sort_values(by=[0,6,3,4]).reset_index(drop=True)
regions=[]
print(gff)
for i in gff.index:
try:
if 'partial' in gff.loc[i,8]:
pass
elif gff.loc[i,6]=='+':
if '18S_rRNA' in gff.loc[i,8] and '28S_rRNA' in gff.loc[i+1,8] and gff.loc[i,0]==gff.loc[i+1,0] and gff.loc[i,6]==gff.loc[i+1,6]:
regions.append((gff.loc[i,0],gff.loc[i,6],gff.loc[i,3]-1,gff.loc[i+1,4]-1))
elif gff.loc[i,6]=='-':
if '28S_rRNA' in gff.loc[i,8] and '18S_rRNA' in gff.loc[i+1,8] and gff.loc[i,0]==gff.loc[i+1,0] and gff.loc[i,6]==gff.loc[i+1,6]:
regions.append((gff.loc[i,0],gff.loc[i,6],gff.loc[i,3]-1,gff.loc[i+1,4]-1))
else:
print('Error, impossible to read '+gff.loc[i,6])
except:
pass
#terminal('rm '+a.o+'barrnap.tmp')
print(regions)
## Opening genome fasta, extracting regions
with open(a.i,'r') as handle:
fasta=SeqIO.to_dict(SeqIO.parse(handle,'fasta'))
extractedRegions=[]
for r in regions:
print(r)
SEQ=fasta[r[0]][r[2]:r[3]]
SEQ.id=r[0]+'_'+r[1]+'_'+str(r[2])+'_'+str(r[3])
SEQ.name=SEQ.id
SEQ.description=SEQ.id
extractedRegions.append(SEQ)
print(SEQ)
with open(a.o+'regions.fasta', "w") as output_handle:
SeqIO.write(extractedRegions, output_handle, 'fasta')
if len(extractedRegions)>0:
## Run ITSx on regions.fasta
terminal('ITSx -t F -i %s -o %s --cpu %s --save_regions %s --not_found F --graphical F --summary F --positions F --fasta F' % (a.o+'regions.fasta',a.o+name,a.cpu,a.which))
## Remove Duplicates from ITSx output
with open(a.o+name+'.'+a.which+'.fasta','r') as handle:
its1s=list(SeqIO.parse(handle,'fasta'))
seqs=[str(s.seq) for s in its1s]
diff=list(set(seqs))
unique=[]
for n in range(len(diff)):
if len(diff)==1:
Name=name
else:
Name=name+'_#'+str(n+1)
unique.append(SeqRecord(Seq(diff[n], generic_dna),Name,Name,str(seqs.count(diff[n]))+' copies of this sequence found in '+a.i.split('/')[-1]))
with open(a.o+name+'.'+a.which+'_filtered.fasta', "w") as output_handle:
SeqIO.write(unique, output_handle, 'fasta')
## End message
if len(diff)==1:
print('\nDONE. A single '+a.which+' sequence was found in '+str(seqs.count(diff[n]))+' copies, in file '+a.i.split('/')[-1]+'.')
elif len(diff)>1:
print('\nDONE. '+str(len(diff))+' different '+a.which+' sequences were found in file '+a.i.split('/')[-1]+'. See output files for more info.')
elif len(diff)==0:
print('\nDONE. No '+a.which+' sequence found in file '+a.i.split('/')[-1])
else:
print('\nDONE. No '+a.which+' sequence found in file '+a.i.split('/')[-1]+'.')
terminal('touch '+a.o+name+'.'+a.which+'_filtered.fasta')