forked from barricklab/barricklab
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconsensus_reads_with_stacks.py
executable file
·190 lines (166 loc) · 13 KB
/
consensus_reads_with_stacks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 30 11:15:09 2014
Stacks duplex seq
steps:
1. read in R1 R2 fastq files together
2. remove reads with incorrect MI
3. output fasta file with following format: #unique number to come from header line, and be the coordinates of the read
>F-readposition-R1(MI trimmed off)
R1MIR2MI
>R-unique#-R2(MI trimmed off)
R2MIR1MI
4. run stacks with following considerations:
1. allow 1 MI to start a stack // -m 1
2. allow 1 mismatch between stacks (corresponds to 3-4% error rate depending on if you count constant region 4bp) // -M 1
3. first consideration eliminates the possible existence of secondary reads, therefore disable their mapping // -N 0
4. happlotypes do not exist within MI, therefore disable // -H
5. this assumed to be run on stampede, use maximal processors. currently set to 16 potential sight of softcode/dynamic choice // -p 16
6. Keep highly represented sequences // --keep_high_cov
7. Each sample required to have unique ID, as only running single sample at time, force all to standard 1 // -i 1 #TODO consider larger script to deal with multiples
5. read in .tags.tsv file
1. primary sequence names represent F/R reads
2. store in dictionary of dict ={F:[read name, read name, etc], R: [read name, read name]}
3. dictionary resets list of sequences on consensus line,
4. on each consensus, if length of either F or R list is greater than 1, generate consensus sequence.
6. write output statistics to global log file
1. figure out how to access singe file from multiple processes. need check to make sure is quickly appending and then closing the file and maybe some kind of wait/repeat if something returns error?
for gdname in 01_Data/*.gd;do name=$(echo $gdname|sed 's/.gd//'|sed 's/01_Data\///');echo "consensus_reads_with_stacks.py -f1 02_Downloads/$name*_R1_* -f2 02_Downloads/$name*_R2_* -p $name";done > commands
version:
2->3
changes to consensus reads, not sscs/dcs. This means that all reads with same MI (regardless of R1/R2) will be combined to generate consensus
need determination of when this should kick in (ie at the begining before stacks, or
"consensus_reads_with_stacks"
no longer making distinction of SSCS/DCS
using ustacks V1.48
removed lambda functions from consensus base call function (replaced with simpler .itervalues and better use of key argument in max)
reports updated
gone SSCS/DCS/singletons
added discrepancy counter (ie how many times was there disagreement among raw reads at a single base)
non-consensus (ie how often could a consensus base not be identified
@author: ded
"""
import re
import argparse
import sys
import os
import subprocess
parser = argparse.ArgumentParser(description="Generate consensus sequences based on molecular indexes")
parser.add_argument("-c", "--constant_region", default="CAGT", help="expected constant region sequence")
parser.add_argument("-l", "--length", default=int(12), help="length of random MI region")
parser.add_argument("-f1", "--fastq1", help="fastq read1 file to check", required=True)
parser.add_argument("-f2", "--fastq2", help="fastq read2 file to check", required=True)
parser.add_argument("-cf", "--cutoff_frequency", default=float(0.5), type=float, help="minimum cutoff frequency required to call consensus base")
parser.add_argument("-p", "--prefix", help="prefix for output files", required=True)
parser.add_argument("-m", "--minimum_reads", default=2, type=int, help="minimum number of raw reads needed generate a consensus sequence")
parser.add_argument("--testing", action='store_true', default=False, help="break fastq read in after certain number of reads to increase speed")
# parser.add_argument("-v", "--verbose", action='store_true', help="increase output (not set to do anything)") # TODO implement log function and verbose printing options within
# show help if no arguments passed
if len(sys.argv) < 2:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
# Initialize key dictionary and search string, Verify proper command line arguments
main_dict = {"reads": [], "raw reads": 0, "dual MI reads": 0, "consensus sequence reads": 0, "consensus sequences": 0, "discrepancies removed": 0, "unknown consensus base": 0, "consensus": str()} # Reads, counters, and consensus name
molecular_index_search_string = "^[ACTGN]{%d}%s" % (int(args.length), args.constant_region)
assert re.match("^[A-Za-z0-9_]+$", args.prefix), "\nNon-alphanumeric characters found:\n%s\nPlease restrict to letters, numbers, or underscore characters in prefix name" % [x for x in args.prefix if not re.match("[A-Za-z0-9_]", x)]
assert 0.5 <= args.cutoff_frequency <= 1.0, "Requiring a sub 50% frequency for consensus can lead to equal representation of 2 different bases. Requiring a frequency above 100% would mean no consenus could be generated. Please select a cutoff_frequency between 0.5 and 1"
def consensus_seq_gen(raw_sequences, consesnsus_freq=args.cutoff_frequency):
"""function to determine consensus sequence with given variable values. take in list of sequences, and consensus sequence above threshold."""
assert len(list(set([len(x) for x in raw_sequences]))) == 1, "Unequal length sequences passed to consensus sequence generator.\n%s\n%s" % ([len(x) for x in raw_sequences], raw_sequences)
consensus_read = []
for i in xrange(len(raw_sequences[0])): # i is the nucleotide index within the read
consensus_dict = {'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0} # set dictionary to empty for each base within the MI group
for j in xrange(len(raw_sequences)): # j is the read index within raw_sequences
consensus_dict[raw_sequences[j][i]] += 1.0
# if consensus_dict[max(consensus_dict.iterkeys(), key=(lambda key: consensus_dict[key]))] / float(len(raw_sequences)) > consesnsus_freq:
if max(consensus_dict.itervalues()) / float(len(raw_sequences)) > consesnsus_freq: # is (frequency of highest represented base) > (frequency required)?
# consensus_read.append(max(consensus_dict.iterkeys(), key=(lambda key: consensus_dict[key])))
consensus_read.append(max(consensus_dict, key=consensus_dict.get)) # append highest represented base to consensus read.
main_dict["discrepancies removed"] += len(raw_sequences) - max(consensus_dict.itervalues()) # add number of reads with disagreeing base calls
else:
consensus_read.append("N") # Most frequently observed base doesn't exceed consensus threshold
main_dict["unknown consensus base"] += 1 # "True" base couldn't be determined therefore, count how often this occurs
consensus_read = "".join(consensus_read)
return consensus_read
def consensus_assignment():
"""generate consesnsus sequences and print them based on criteria"""
if len(main_dict["reads"]) >= args.minimum_reads:
consensus = consensus_seq_gen(main_dict["reads"])
main_dict["consensus sequences"] += 1
main_dict["consensus sequence reads"] += len(main_dict["reads"])
print>>consensus_out, "\n".join(map(str, ["@_" + args.prefix.replace("_", "and") + "_consensus_" + main_dict["consensus"] + "_" + str(len(main_dict["reads"])) + "_" + str(main_dict["consensus sequences"]), consensus, "+", "I" * len(consensus)])) # @_prefix_"consensus"_consensus.seq_#reads_#consensus.seq
def tag_parser():
"""function designed to parse .tags.tsv output from ustacks and print consensus sequences to *.concensus.fastq"""
with open(args.prefix + ".tags.tsv", "r") as tags_in:
for line in tags_in:
line = line.rstrip().split("\t")
if re.match("model", line[6]) or re.match("^#", line[0]):
continue # not doing anything with model lines, and header line at top starts with "#" in ustacks V 1.48
elif re.match("consensus", line[6]):
consensus_assignment() # consesnsus sequence is first thing encountered, first time will correctly do nothing, but must be called after completion of the loop
main_dict["reads"] = [] # reset sequences to empty lists
main_dict["consensus"] = line[9] # name will be used in header line of consensus.fastq file
elif re.match("primary", line[6]):
main_dict["reads"].append(line[8].split("_")[-1]) # non-MI sequence is after _ in fasta name line
elif re.match("secondary", line[6]):
assert False, "Secondary read alignment found, ustacks, not behaving as intended.\n%s" % line
else:
assert False, "Unknown sequence type identified, don't know how to handle this.\n%s\n%s" % (line[6], line)
consensus_assignment() # must run on last consensus sequence
def paired_end_fastq_read_in(fastq1, fastq2):
"""Function designed to read in paired end fastq files simultaneously, and pass each read pair to function to check for molecular index"""
line_count = 0
with open(fastq1, "r") as Fastq1, open(fastq2, "r") as Fastq2: #, open(args.prefix + ".log.txt", "w", 0) as log:
for line1 in Fastq1:
line1 = line1.rstrip()
line2 = Fastq2.next().rstrip()
line_count += 1
if line_count % 4 == 1: # line = header
header1 = line1
header2 = line2
assert (header1.split(" ")[0] == header2.split(" ")[0]) and (len(header1.split(" ")) == len(header1.split(" "))) and len(header1.split(" ")) == 2, "Header lines of R1 and R2 do not agree with expectations: single space in header line, and header before space is identical for both reads.\nHeaders:\n%s\n%s" % (header1, header2) # this may be inaccurate for different Illumina outputs
elif line_count % 4 == 2: # line = read
read1 = line1
read2 = line2
assert re.match('^[ACTGN]*$', read1) and re.match('^[ACTGN]*$', read2), "Non-sequence characters found on one of th following sequence lines:\n%s\n%s" % (line1, line2)
elif line_count % 4 == 3: # line = optional sequence descriptor. assume line to use 'standard' "+" notation rather than actual descriptor.
assert line1 == line2 == "+", "line1 or line2 does not display expected '+' sign on line3 here:\nline1: %s\nline2: %s" % (line1, line2)
elif line_count % 4 == 0: # line = quality
# not doing anything with quality scores
prep_for_stacks(header1, read1, header2, read2)
if line_count % 200000 == 0:
print>>log, line_count / 4, "reads processed"
if args.testing:
break # for testing subset of reads rather than full read list
main_dict["raw reads"] = line_count / 4
def prep_for_stacks(header_1, read_1, header_2, read_2):
"""Steps: 1 Take in paired end reads, 2 Identify correct MI placement, 3 Output to new fasta file"""
if re.match(molecular_index_search_string, read_1) and re.match(molecular_index_search_string, read_2): # only work with read pairs that have correct MI on both reads
# TODO slices are hardcoded. should be based on argparse variables
f_tag = ">F-" + "-".join(header_1.split(":")[3:7]) + "_" + read_1[17:]
f_read = read_1[:12] + read_2[:12]
r_tag = ">R-" + "-".join(header_2.split(":")[3:7]) + "_" + read_2[17:]
r_read = read_2[:12] + read_1[:12]
print>>fasta_out, "\n".join(map(str, [f_tag, f_read, r_tag, r_read]))
main_dict["dual MI reads"] += 1
# Read Fastq files in, output fasta files
assert not os.path.isfile(args.prefix + ".fasta"), "fasta file already exists. Please remove or rename existing file: %s.fasta" % args.prefix
with open(args.prefix + ".fasta", "w") as fasta_out, open(args.prefix + ".log.txt", "w", 0) as log:
paired_end_fastq_read_in(args.fastq1, args.fastq2)
# Group reads by molecular index
stacks_command = ["ustacks", "-t", "fasta", "-f", str(args.prefix) + ".fasta", "-m", "1", "-M", "1", "-N", "0", "-H", "-p", "16", "--keep_high_cov", "-i", "1"]
external_stacks_call = subprocess.Popen(stacks_command, stdout=open(args.prefix + ".log.txt", "a"), stderr=open(args.prefix + ".log.txt", "a"))
external_stacks_call.wait()
# Read in molecular inedex, output consensus fastq file
assert not os.path.isfile(args.prefix + ".consensus.fastq"), "consensus file already exists. please remove or rename existing file: %s.consensus.fastq" % args.prefix
with open(args.prefix + ".consensus.fastq", "w") as consensus_out:
tag_parser()
# Log final stats
with open(args.prefix + ".log.txt", "a") as log:
print>>log, "\t".join(map(str, ["Sample", "raw reads", "dual MI reads", "consensus sequence reads", "consensus sequences", "discrepancies removed", "unknown consensus base"]))
to_print = [args.prefix]
for entry in ["raw reads", "dual MI reads", "consensus sequence reads", "consensus sequences", "discrepancies removed", "unknown consensus base"]:
to_print.append(main_dict[entry])
print>>log, "\t".join(map(str, to_print))