#!/usr/bin/env python
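"""
metaSNV filtering step.

Filter I keeps, per species, the samples with sufficient horizontal (-b) and
vertical (-d) genome coverage, and keeps only species with at least -m such
samples. Filter II then keeps positions covered by at least -c reads in at
least a proportion -p of those samples, and writes one allele frequency table
per species (<taxID>.filtered.freq).
"""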
from __future__ import division
import os
import sys
import argparse
import glob
import shutil
from multiprocessing import Pool
from functools import partial
basedir = os.path.dirname(os.path.abspath(__file__))
# ======================================================================================================================
# Parse Commandline Arguments
def get_arguments():
    """
    Get commandline arguments and return namespace
    """
    # Initialize Parser
    parser = argparse.ArgumentParser(prog='metaSNV_filtering.py', description='metaSNV filtering step',
                                     epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Not Shown:
    parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
    parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)

    # REQUIRED arguments:
    parser.add_argument('projdir', help='project directory', metavar='Proj')

    # OPTIONAL arguments:
    parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
                        help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
    parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
                        help="Coverage depth: minimal average vertical genome coverage per sample per species")
    parser.add_argument('-m', metavar='INT', type=int, default=2,
                        help="Minimum number of samples per species")
    parser.add_argument('-c', metavar='FLOAT', type=float, default=5.0,
                        help="FILTERING STEP II: minimum coverage per position per sample per species")
    parser.add_argument('-p', metavar='FLOAT', type=float, default=0.50,
                        help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position")
    parser.add_argument('--ind', action='store_true', help="Compute individual SNVs")
    parser.add_argument('--n_threads', metavar='INT', default=1, type=int,
                        help="Number of jobs to run simultaneously.")

    return parser.parse_args()
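# Example invocation (hypothetical project directory, all thresholds at their defaults):
#   python metaSNV_Filtering.py myproject -b 40 -d 5 -m 2 -c 5 -p 0.5 --n_threads 4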
# ======================================================================================================================
# Basic functions
def file_check():
    """Check that the required input files exist; exit with an error otherwise"""
    args.projdir = args.projdir.rstrip('/')
    args.coverage_file = args.projdir + '/' + args.projdir.split('/')[-1] + '.all_cov.tab'
    args.percentage_file = args.projdir + '/' + args.projdir.split('/')[-1] + '.all_perc.tab'
    args.all_samples = args.projdir + '/' + 'all_samples'

    print("Checking for necessary input files...")
    if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
        print("found: '{}'\nfound: '{}'".format(args.coverage_file, args.percentage_file))
    else:
        sys.exit(
            "\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))

    if os.path.isfile(args.all_samples):
        print("found: '{}'\n".format(args.all_samples))
    else:
        sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
def print_arguments():
    """Print the thresholds the script will use"""
    print("Options:")
    if args.b:
        print("threshold: percentage covered (breadth) {}".format(args.b))
    if args.d:
        print("threshold: average coverage (depth) {}".format(args.d))
    if args.m:
        print("threshold: min. number of samples_of_interest per taxid_of_interest {}".format(args.m))
    if args.c:
        print("threshold: min. position coverage per sample within samples_of_interest {}".format(args.c))
    if args.p:
        print("threshold: min. proportion of covered samples in samples_of_interest {}".format(args.p))
    if args.ind:
        print("Compute individual SNVs: {}".format(args.ind))
    if args.n_threads:
        print("Number of parallel processes: {}".format(args.n_threads))
    print("")
# ======================================================================================================================
# FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
# Example: "Which taxon has at least m SoI?"
# Coverage Conditions:
# Sample of Interest:
# 1. breadth > b (default 40 %)
# 2. depth > d (default 5 X)
# Min. number Samples:
# 3. min samples/TaxID > m (default 2)
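# File layout assumed by relevant_taxa() below (values illustrative): the first
# row lists the sample names, the second row is skipped, and every following
# row is
#   TaxID <tab> value_sample_1 <tab> value_sample_2 ...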
def relevant_taxa(args):
    """Go through the coverage files and determine taxa and samples of interest"""
    samples_of_interest = {}
    with open(args.coverage_file, 'r') as COV, open(args.percentage_file, 'r') as PER:
        header_cov = COV.readline().split()
        header_per = PER.readline().split()
        COV.readline()  # skip second row in COV_FILE
        PER.readline()  # skip second row in PER_FILE

        # Check that the headers match:
        if header_cov != header_per:
            sys.exit("ERROR: Coverage and percentage file headers do not match!")

        # Read taxon by taxon (line by line) and check the coverage conditions (thresholds).
        # Line format: TaxID \t cov_value_S1 \t cov_value_S2 [...] \n (no white spaces)
        for cov, perc in zip(COV, PER):
            cstring = cov.split()
            pstring = perc.split()
            cov_taxID = cstring.pop(0)
            perc_taxID = pstring.pop(0)
            coverage = list(map(float, cstring))    # list of strings to list of floats (TaxID stays a string)
            percentage = list(map(float, pstring))  # list of strings to list of floats
            sample_count = 0
            sample_names = []

            if cov_taxID != perc_taxID:  # check that the taxIDs match!
                sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")

            for c, p in zip(coverage, percentage):  # two float arrays (taxID already popped)
                sample_count += 1
                if c >= args.d and p >= args.b:
                    sample_names.append(header_cov[sample_count - 1])  # header indexing starts at 0
                if sample_count == len(header_cov) and len(sample_names) >= args.m:
                    samples_of_interest[cov_taxID] = sample_names

        # the 'with' statement closes COV and PER automatically
        return {'SoI': samples_of_interest, 'h': header_cov}
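# relevant_taxa() returns a dict of the form (values illustrative):
#   {'SoI': {'refGenome1': ['sample1.bam', 'sample2.bam']}, 'h': [list of all sample names]}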
# ======================================================================================================================
# FILTER II: ACQUIRE SNPs WITH SUFFICIENT OCCURRENCE WITHIN SAMPLES_OF_INTEREST
# SNP Conditions (Default):
# 1. Position covered by at least c (5) reads
# 2. Position present in at least proportion p (50 %) of the accepted samples_of_interest
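# SNP line layout as parsed by filter_two() below: whitespace-separated columns,
# where columns [0:4] form the line id (CHROM:REFGENE:POS:REFBASE), column [4]
# is a '|'-separated string of per-sample site coverages, and column [5] is a
# comma-separated list of alternative alleles, each of the form
#   <x>|<alt_base>|<id>|<cov_sample_1>|<cov_sample_2>|...
# (only fields [1], [2] and [3:] of each allele entry are used here).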
def filter_two(species, args, snp_files, outdir, samples_of_interest):
    """Position-wise filtering"""
    # Read <all_samples> to get the snp_file header:
    with open(args.all_samples, 'r') as all_samples:
        snp_header = all_samples.read().splitlines()
    snp_header = [i.split('/')[-1] for i in snp_header]  # sample name: trim off /path/to/sample.name.bam

    for best_split_x in snp_files:
        with open(best_split_x, 'r') as f:
            for snp_line in f:  # position-wise loop
                snp_taxID = snp_line.split()[0].split('.')[0]  # genome (taxon) ID: part before the first '.'

                # Species filter:
                if snp_taxID != species:  # genome is not of interest
                    continue  # taxon is not relevant, NEXT!

                # Sample filter: only consider samples with enough coverage.
                # !!! Sample order matters: indices are based on the ORDER in the COV/PERC files !!!
                sample_list = samples_of_interest[snp_taxID]
                sample_indices = [snp_header.index(name) for name in sample_list]

                # Position filter:
                # keep positions with sufficient coverage (c) and proportion (p) in the samples of interest (SoIs).
                whole_line = snp_line.split()
                site_coverage = list(map(int, whole_line[4].split('|')))  # site coverages as a list of ints
                nr_good = 0
                for index in sample_indices:
                    # count samples with enough coverage depth at this position
                    if site_coverage[index] >= args.c and site_coverage[index] != 0:
                        nr_good += 1

                # Filter: position incidence with sufficient coverage:
                if float(nr_good) / len(sample_indices) < args.p:
                    continue  # mainly uninformative position, SNP incidence < p, SNP dropped!

                # Calculate SNP allele frequencies.
                # If the output file does not exist yet, create it and write the header:
                if not os.path.isfile(outdir + '/' + '%s.filtered.freq' % snp_taxID):
                    outfile = open(outdir + '/' + '%s.filtered.freq' % snp_taxID, 'w')
                    print("Generating: {}".format(outdir + '/' + '%s.filtered.freq' % snp_taxID))
                    outfile.write('\t' + "\t".join(sample_list) + '\n')

                # Loop through the alternative alleles in column [5] (comma separated):
                line_id = ":".join(whole_line[:4])  # line id composed of CHROM:REFGENE:POS:REFBASE
                for snp in whole_line[5].split(','):
                    xS = snp.split('|')
                    snp_coverage = list(map(float, xS[3:]))  # per-sample coverage of this allele (unfiltered!)

                    # Sanity check:
                    if len(site_coverage) != len(snp_coverage):
                        print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
                        sys.exit("ERROR: Site coverage and SNP coverage strings have unequal length!")

                    # Compute the allele frequency table:
                    alt_base = xS[1]  # alternative base
                    snp_frq = []  # frequencies for SNPs (pos > cX in at least p% of the SoIs)
                    for index in sample_indices:
                        # prevent division by zero!
                        if site_coverage[index] >= args.c and site_coverage[index] != 0:
                            snp_frq.append(snp_coverage[index] / site_coverage[index])
                        else:
                            snp_frq.append(-1)

                    # Write output allele frequencies (default):
                    outfile.write(line_id + '>' + alt_base + ':' + xS[2] + '\t' +
                                  "\t".join(str(x) for x in snp_frq) + '\n')

    if 'outfile' in locals():
        print("closing: {}".format(species))
        outfile.close()
# ======================================================================================================================
# Script
if __name__ == "__main__":
    args = get_arguments()
    print_arguments()
    file_check()

    # ==========================================
    # Filtering I - Determine Taxa of Interest:
    # ==========================================
    relevant = relevant_taxa(args)  # call once, so the coverage files are only read a single time
    samples_of_interest = relevant['SoI']
    header_cov = relevant['h']
    print(samples_of_interest.keys())

    # =========================================
    # Filtering II - Position-wise filtering
    # =========================================
    pars_toprint = '-m{}-d{}-b{}-c{}-p{}'.format(int(args.m), int(args.d), int(args.b), int(args.c), float(args.p))
    # don't include the parameters in the folder name for now - it makes it hard to work with subpopr
    pars_toprint = ""
    filt_folder = args.projdir + '/filtered' + pars_toprint + '/'

    # (Re)create the output folders:
    if os.path.exists(filt_folder):
        shutil.rmtree(filt_folder)
    os.makedirs(filt_folder)
    os.makedirs(filt_folder + '/pop/')

    p = Pool(processes=args.n_threads)
    partial_Div = partial(filter_two,
                          args=args,
                          snp_files=glob.glob(args.projdir + '/snpCaller/called*'),
                          outdir=filt_folder + '/pop',
                          samples_of_interest=samples_of_interest)
    p.map(partial_Div, samples_of_interest.keys())
    p.close()
    p.join()

    if args.ind:
        if not os.path.exists(filt_folder + '/ind/'):
            os.makedirs(filt_folder + '/ind/')
        p = Pool(processes=args.n_threads)
        partial_Div = partial(filter_two,
                              args=args,
                              snp_files=glob.glob(args.projdir + '/snpCaller/indiv*'),
                              outdir=filt_folder + '/ind',
                              samples_of_interest=samples_of_interest)
        p.map(partial_Div, samples_of_interest.keys())
        p.close()
        p.join()
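# Output: one tab-separated allele frequency table per species under
# <projdir>/filtered/pop/ (and <projdir>/filtered/ind/ when --ind is set),
# named <taxID>.filtered.freq; -1 marks samples without sufficient coverage
# at a given position.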