-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkt_combine.py
148 lines (123 loc) · 5.97 KB
/
kt_combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
#
# Copyright (c) 2014-2016 Christian Schudoma, The Sainsbury Laboratory
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import argparse
import os
import sys

import ktoolu_io as KTIO
def readClassification(fn):
    """Collect the identifiers of all classified reads in a results file.

    A line is considered when its first whitespace-separated field starts
    with 'C'; the second field of such a line is taken as the read
    identifier. (Presumably this is Kraken output, where column 1 is the
    C/U flag and column 2 the sequence ID -- verify against the producer.)

    Args:
        fn: path of the classification results file; opened with
            KTIO.openFile.

    Returns:
        A set of read identifier strings.
    """
    classified = set()
    with KTIO.openFile(fn) as fi:
        for line in fi:
            fields = line.split()
            # Store the bare identifier string: it is hashable (a list slice
            # such as fields[:2] is not and makes set() raise TypeError) and
            # it is what callers compare against when testing membership.
            if len(fields) >= 2 and fields[0].startswith('C'):
                classified.add(fields[1])
    return classified
def computeSets(resultsA, resultsB):
    """Partition the classified reads of two classification runs.

    Args:
        resultsA: path of the results file of classification A.
        resultsB: path of the results file of classification B.

    Returns:
        A tuple (onlyA, onlyB, both) of three pairwise disjoint sets of read
        identifiers: reads classified only in A, only in B, and in both.
    """
    setA = readClassification(resultsA)
    setB = readClassification(resultsB)
    setAB = setA.intersection(setB)
    # Remove the shared reads from BOTH sides using the intersection itself.
    # Subtracting the already-reduced setA from setB would be a no-op and
    # leave the shared reads inside setB.
    setA.difference_update(setAB)
    setB.difference_update(setAB)
    return setA, setB, setAB
def assignSequences(sets, fileInfo):
    """Distribute the input reads over the A / B / AB / unclassified outputs.

    Every R1 record (together with its R2 mate when paired-end input is
    given) is routed by its identifier: IDs found in sets[0] go to the A
    outputs, in sets[1] to the B outputs, in sets[2] to the AB outputs and
    all others to the unclassified outputs. For each record a line
    "<category>\\t<id>" is written to stdout, where category is one of
    'A', 'B', '+' (both) or 'U'.

    Args:
        sets: three ID collections in the order (A only, B only, A and B).
        fileInfo: object providing input_format ('fq' or 'fa'), inR1, inR2
            (None for single-end data), gz_output, bz2_output and the output
            paths outAR1/outBR1/outABR1/outUR1 (plus the *R2 counterparts
            when inR2 is set).

    Raises:
        AssertionError: if input_format is not 'fq'/'fa' or if the
            identifiers of an R1/R2 pair differ.
    """
    assert fileInfo.input_format in ('fq', 'fa')
    # Records are interpolated into outfmt with '%', so the readers are
    # presumably yielding 3-tuples (fastq) / 2-tuples (fasta) -- TODO confirm.
    if fileInfo.input_format == 'fq':
        getID, getSeqs, outfmt = KTIO.getFastqIdentifier, KTIO.readFastq, '%s\n%s\n+\n%s\n'
    else:
        getID, getSeqs, outfmt = KTIO.getFastaIdentifier, KTIO.readFasta, '%s\n%s\n'

    if fileInfo.gz_output:
        ffmt = 'gz'
    elif fileInfo.bz2_output:
        ffmt = 'bz2'
    else:
        ffmt = None

    # Every handle that was opened successfully is tracked here so that the
    # finally-clause can close all of them, even after a partial failure.
    handles = []

    def openOutputs(paths):
        opened = []
        for path in paths:
            fo = KTIO.openFile(path, fmt=ffmt, mode='wb')
            handles.append(fo)
            opened.append(fo)
        return opened

    try:
        R1gen, R2gen = getSeqs(fileInfo.inR1), None
        R1A, R1B, R1AB, R1U = openOutputs(
            [fileInfo.outAR1, fileInfo.outBR1, fileInfo.outABR1, fileInfo.outUR1])
        R2A, R2B, R2AB, R2U = None, None, None, None
        if fileInfo.inR2 is not None:
            R2gen = getSeqs(fileInfo.inR2)
            R2A, R2B, R2AB, R2U = openOutputs(
                [fileInfo.outAR2, fileInfo.outBR2, fileInfo.outABR2, fileInfo.outUR2])

        # Plain iteration / next() works on Python 2 and 3 and only ends on
        # exhaustion; genuine read errors now propagate instead of being
        # mistaken for end-of-input.
        for R1rec in R1gen:
            fxid1, fxid2 = getID(R1rec[0]), None
            R2rec = None
            if R2gen is not None:
                try:
                    R2rec = next(R2gen)
                except StopIteration:
                    # R2 holds fewer records than R1: stop at the shorter file.
                    break
                fxid2 = getID(R2rec[0])
            assert fxid1 == fxid2 or fxid2 is None

            # set order is A, B, AB
            if fxid1 in sets[0]:
                dest, destid = (R1A, R2A), 'A'
            elif fxid1 in sets[1]:
                dest, destid = (R1B, R2B), 'B'
            elif fxid1 in sets[2]:
                dest, destid = (R1AB, R2AB), '+'
            else:
                dest, destid = (R1U, R2U), 'U'

            sys.stdout.write('\t'.join([destid, fxid1]) + '\n')
            dest[0].write(outfmt % R1rec)
            if dest[1] is not None:
                dest[1].write(outfmt % R2rec)
    finally:
        # Explicit loop: map() is lazy on Python 3 and would close nothing.
        for fo in handles:
            fo.close()
def main():
    """Command-line entry point: parse and validate options, then split reads.

    Reads two classification result files, partitions the classified read
    IDs with computeSets and writes the input reads to the A / B / AB /
    unclassified output files via assignSequences. Invalid or missing
    options terminate the program through argparse's error handling
    (usage message on stderr, exit status 2).
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--input-format', help='Input sequences stored in Fasta (fa) or Fastq (fq) file(s).', default='fq', choices=('fq', 'fa'))
    parser.add_argument('--inR1', help='The R1-file (single-end reads or forward paired-end reads).')
    parser.add_argument('--inR2', help='The R2-file (reverse paired-end reads)')
    parser.add_argument('--kraken-resultsA', type=str, help='A file containing the results of classification A for the input sequences.')
    parser.add_argument('--kraken-resultsB', type=str, help='A file containing the results of classification B for the input sequences.')
    parser.add_argument('--outAR1', type=str, help='')
    parser.add_argument('--outAR2', type=str, help='')
    parser.add_argument('--outBR1', type=str, help='')
    parser.add_argument('--outBR2', type=str, help='')
    parser.add_argument('--outABR1', type=str, help='')
    parser.add_argument('--outABR2', type=str, help='')
    # dest= keeps the public flag names but stores the values under the
    # attribute names (outUR1/outUR2) that the rest of the script reads.
    parser.add_argument('--outUnclassifiedR1', dest='outUR1', type=str, help='')
    parser.add_argument('--outUnclassifiedR2', dest='outUR2', type=str, help='')
    parser.add_argument('--gz-output', action='store_true')
    parser.add_argument('--bz2-output', action='store_true')
    args = parser.parse_args()

    # Explicit checks instead of assert: assertions vanish under `python -O`
    # and give the user no hint about which option is wrong.
    required_inputs = (
        ('--kraken-resultsA', args.kraken_resultsA),
        ('--kraken-resultsB', args.kraken_resultsB),
        ('--inR1', args.inR1),
    )
    for flag, path in required_inputs:
        if path is None or not os.path.exists(path):
            parser.error('%s must name an existing file.' % flag)
    if args.inR2 is not None and not os.path.exists(args.inR2):
        parser.error('--inR2 must name an existing file.')

    # NOTE(review): verifyFileFormat is not defined in this file; it is
    # assumed to be provided by ktoolu_io -- confirm.
    for flag, path in (('--inR1', args.inR1), ('--inR2', args.inR2)):
        if path is not None and not KTIO.verifyFileFormat(path, args.input_format):
            parser.error('%s does not match --input-format %s.' % (flag, args.input_format))

    if None in (args.outAR1, args.outBR1, args.outABR1, args.outUR1):
        parser.error('--outAR1, --outBR1, --outABR1 and --outUnclassifiedR1 are required.')
    if args.inR2 is not None and None in (args.outAR2, args.outBR2, args.outABR2, args.outUR2):
        parser.error('--outAR2, --outBR2, --outABR2 and --outUnclassifiedR2 are required with --inR2.')

    # At most one compression format may be selected.
    if args.gz_output and args.bz2_output:
        parser.error('--gz-output and --bz2-output are mutually exclusive.')

    sets = computeSets(args.kraken_resultsA, args.kraken_resultsB)
    assignSequences(sets, args)
# Module metadata.
__author__ = "Christian Schudoma"
__copyright__ = "Copyright 2014-2016, Christian Schudoma, The Sainsbury Laboratory"
__credits__ = ["Pirasteh Pahlavan", "Agathe Jouet", "Yogesh Gupta"]
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Christian Schudoma"
__email__ = "cschu1981@gmail.com"
__status__ = "Development"

# Run the command-line entry point only when executed as a script. Kept last
# so that the module metadata above is already bound when main() runs.
if __name__ == '__main__':
    main()