forked from wassermanlab/BiasAway
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBiasAway.py
285 lines (239 loc) · 13.3 KB
/
BiasAway.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#!/usr/bin/python
"""
BiasAway module generating adapted backgrounds for motif overrepresentation
analyses.

BiasAway offers several different ways of generating backgrounds, falling
into two categories:
- Creation of new random sequences:
    - mono-nucleotide shuffling using the foreground sequences
    - mono-nucleotide shuffling within a sliding window using foreground
      sequences
    - di-nucleotide shuffling using the foreground sequences
    - di-nucleotide shuffling within a sliding window using foreground
      sequences
- Extraction of sequences from a set of possible background sequences:
    - respecting the GC distribution of the foreground (using GC bins)
    - respecting the GC distribution as in the previous item and also
      respecting the GC composition within a sliding window for GC bin
"""
import argparse
import mononuc_shuffling_generator as mononuc_shuff
import mononuc_window_shuffling_generator as mononuc_win_shuff
import dinuc_shuffling_generator as dinuc_shuff
import dinuc_window_shuffling_generator as dinuc_win_shuff
import GC_compo_matching as GC_compo
import GC_window_compo_matching as GC_window_compo
from utils import get_seqs
import sys
import os
import errno
def mononuc_shuffling_generator(argu):
    """Run the mono-nucleotide shuffling generator on the foreground file.

    Args:
        argu: Parsed command-line namespace; ``fg_file`` and ``nfold`` are
            read from it.
    """
    # get_seqs yields three values; only the first one is used here.
    foreground_seqs, _, _ = get_seqs(argu.fg_file)
    # Both returned values are discarded.
    # NOTE(review): presumably the generator emits the sequences itself --
    # confirm against mononuc_shuffling_generator.generate_sequences.
    _, _ = mononuc_shuff.generate_sequences(foreground_seqs, argu.nfold)
def dinuc_shuffling_generator(argu):
    """Run the di-nucleotide shuffling generator on the foreground file.

    Args:
        argu: Parsed command-line namespace; ``fg_file`` and ``nfold`` are
            read from it.
    """
    # get_seqs yields three values; only the first one is used here.
    foreground_seqs, _, _ = get_seqs(argu.fg_file)
    # Both returned values are discarded.
    # NOTE(review): presumably the generator emits the sequences itself --
    # confirm against dinuc_shuffling_generator.generate_sequences.
    _, _ = dinuc_shuff.generate_sequences(foreground_seqs, argu.nfold)
def mononuc_shuffling_window_generator(argu):
    """Run mono-nucleotide shuffling within a sliding window.

    Args:
        argu: Parsed command-line namespace; ``fg_file``, ``winlen``,
            ``step`` and ``nfold`` are read from it.
    """
    # get_seqs yields three values; only the first one is used here.
    foreground_seqs, _, _ = get_seqs(argu.fg_file)
    # Both returned values are discarded.
    # NOTE(review): presumably the generator emits the sequences itself --
    # confirm against mononuc_window_shuffling_generator.generate_sequences.
    _, _ = mononuc_win_shuff.generate_sequences(
        foreground_seqs, argu.winlen, argu.step, argu.nfold
    )
def dinuc_shuffling_window_generator(argu):
    """Run di-nucleotide shuffling within a sliding window.

    Args:
        argu: Parsed command-line namespace; ``fg_file``, ``winlen``,
            ``step`` and ``nfold`` are read from it.
    """
    # get_seqs yields three values; only the first one is used here.
    foreground_seqs, _, _ = get_seqs(argu.fg_file)
    # Both returned values are discarded.
    # NOTE(review): presumably the generator emits the sequences itself --
    # confirm against dinuc_window_shuffling_generator.generate_sequences.
    _, _ = dinuc_win_shuff.generate_sequences(
        foreground_seqs, argu.winlen, argu.step, argu.nfold
    )
def test_empty_bg_dir(bg_dir):
    """Make sure the background directory is empty, creating it if missing.

    Used when a background file is supplied: an already populated background
    directory is then treated as a conflict and the program exits.

    Args:
        bg_dir: Path of the background directory.

    Raises:
        SystemExit: If ``bg_dir`` is an existing, non-empty directory.
        OSError: If the directory cannot be created, except when the failure
            is EEXIST and the path is a directory by the time it is checked.
    """
    if os.path.isdir(bg_dir):
        if os.listdir(bg_dir):
            sys.exit("EXITING since both a non-empty background directory and a "
                     "background file are given")
        # Existing and empty: nothing to do.
        return
    try:
        os.makedirs(bg_dir)
    except OSError as exc:
        # Only tolerate the directory having appeared in the meantime.
        appeared_meanwhile = exc.errno == errno.EEXIST and os.path.isdir(bg_dir)
        if not appeared_meanwhile:
            raise
def test_non_empty_bg_dir(bg_dir):
    """Exit unless the background directory exists and contains entries.

    Args:
        bg_dir: Path of the background directory.

    Raises:
        SystemExit: If ``bg_dir`` is not a directory or has no entries.
    """
    # The listing is only attempted once the path is known to be a directory.
    if not os.path.isdir(bg_dir) or not os.listdir(bg_dir):
        sys.exit("EXITING since the background directory does not exist or is empty")
def gc_compo_generator(argu):
    """Dispatch the GC content-based chooser according to ``argu.len_opt``.

    Args:
        argu: Parsed command-line namespace; a truthy ``len_opt`` selects the
            length-matching variant, otherwise the plain one is used.
    """
    chooser = gc_compo_len_generator if argu.len_opt else gc_compo_generator_no_len
    chooser(argu)
def gc_compo_generator_no_len(argu):
    """Choose background sequences matching the foreground GC bins.

    With a background file, the background directory must be empty (it is
    created when missing) and the background GC bins are computed from the
    file. Without one, the background directory must already be non-empty
    and ``None`` is passed as the background bins.

    Args:
        argu: Parsed command-line namespace; ``fg_file``, ``bg_file``,
            ``bg_dir`` and ``nfold`` are read from it.
    """
    # Only the middle value of each three-value result is used.
    _, foreground_bins, _ = GC_compo.fg_GC_bins(argu.fg_file)
    if argu.bg_file:
        test_empty_bg_dir(argu.bg_dir)
        _, background_bins, _ = GC_compo.bg_GC_bins(argu.bg_file, argu.bg_dir)
    else:
        test_non_empty_bg_dir(argu.bg_dir)
        background_bins = None
    # Both returned values are discarded.
    _, _ = GC_compo.generate_sequences(
        foreground_bins, background_bins, argu.bg_dir, argu.nfold
    )
def gc_compo_len_generator(argu):
    """Choose background sequences using the length-aware GC bins.

    With a background file, the background directory must be empty (it is
    created when missing) and the background bins are computed from the
    file. Without one, the background directory must already be non-empty
    and ``None`` is passed as the background bins.

    Args:
        argu: Parsed command-line namespace; ``fg_file``, ``bg_file``,
            ``bg_dir`` and ``nfold`` are read from it.
    """
    # Only the middle value of each three-value result is used.
    _, foreground_bins, _ = GC_compo.fg_len_GC_bins(argu.fg_file)
    if argu.bg_file:
        test_empty_bg_dir(argu.bg_dir)
        _, background_bins, _ = GC_compo.bg_len_GC_bins(argu.bg_file, argu.bg_dir)
    else:
        test_non_empty_bg_dir(argu.bg_dir)
        background_bins = None
    # Both returned values are discarded.
    _, _ = GC_compo.generate_len_sequences(
        foreground_bins, background_bins, argu.bg_dir, argu.nfold
    )
def gc_compo_window_generator(argu):
    """Dispatch the sliding-window GC chooser according to ``argu.len_opt``.

    Args:
        argu: Parsed command-line namespace; a truthy ``len_opt`` selects the
            length-matching variant, otherwise the plain one is used.
    """
    chooser = (gc_compo_len_window_generator if argu.len_opt
               else gc_compo_window_generator_no_len)
    chooser(argu)
def gc_compo_len_window_generator(argu):
    """Choose backgrounds with length-aware GC bins and a sliding window.

    With a background file, the background directory must be empty (it is
    created when missing) and the background bins are computed from the
    file. Without one, the background directory must already be non-empty
    and ``None`` is passed as the background bins.

    Args:
        argu: Parsed command-line namespace; ``fg_file``, ``bg_file``,
            ``bg_dir``, ``winlen``, ``step``, ``deviation`` and ``nfold``
            are read from it.
    """
    # Only the middle value of each three-value result is used.
    _, foreground_bins, _ = GC_window_compo.fg_len_GC_bins(
        argu.fg_file, argu.winlen, argu.step
    )
    if argu.bg_file:
        test_empty_bg_dir(argu.bg_dir)
        _, background_bins, _ = GC_window_compo.bg_len_GC_bins(
            argu.bg_file, argu.bg_dir
        )
    else:
        test_non_empty_bg_dir(argu.bg_dir)
        background_bins = None
    # Both returned values are discarded.
    _, _ = GC_window_compo.generate_len_sequences(
        foreground_bins, background_bins, argu.bg_dir,
        argu.deviation, argu.winlen, argu.step, argu.nfold
    )
def gc_compo_window_generator_no_len(argu):
    """Choose backgrounds matching GC bins and sliding-window GC content.

    With a background file, the background directory must be empty (it is
    created when missing) and the background bins are computed from the
    file. Without one, the background directory must already be non-empty
    and ``None`` is passed as the background bins.

    Args:
        argu: Parsed command-line namespace; ``fg_file``, ``bg_file``,
            ``bg_dir``, ``winlen``, ``step``, ``deviation`` and ``nfold``
            are read from it.
    """
    # Only the middle value of each three-value result is used.
    _, foreground_bins, _ = GC_window_compo.fg_GC_bins(
        argu.fg_file, argu.winlen, argu.step
    )
    if argu.bg_file:
        test_empty_bg_dir(argu.bg_dir)
        _, background_bins, _ = GC_window_compo.bg_GC_bins(argu.bg_file, argu.bg_dir)
    else:
        test_non_empty_bg_dir(argu.bg_dir)
        background_bins = None
    # Both returned values are discarded.
    _, _ = GC_window_compo.generate_sequences(
        foreground_bins, background_bins, argu.bg_dir,
        argu.deviation, argu.winlen, argu.step, argu.nfold
    )
def mononuc_shuffling_arg_parsing(subparsers):
    """Register the "m" subcommand (mono-nucleotide shuffling generator).

    Args:
        subparsers: Sub-parser collection, as returned by
            ``ArgumentParser.add_subparsers()``, to add the subcommand to.
    """
    nfold_help = ("How many background sequences per each foreground "
                  "sequence will be generated (default: 1)")
    mono_parser = subparsers.add_parser(
        "m", help="mono-nucleotide shuffling generator"
    )
    mono_parser.add_argument(
        "-f", "--foreground", dest="fg_file", type=str, required=True,
        action="store", help="Foreground file in FASTA format [REQUIRED]"
    )
    mono_parser.add_argument(
        "-n", "--nfold", dest="nfold", type=int, required=False,
        action="store", default=1, help=nfold_help
    )
    # The main block calls ``func`` with the parsed namespace.
    mono_parser.set_defaults(func=mononuc_shuffling_generator)
def mononuc_window_shuffling_arg_parsing(subparsers):
    """Register the "f" subcommand (mono-nucleotide window shuffling).

    Args:
        subparsers: Sub-parser collection, as returned by
            ``ArgumentParser.add_subparsers()``, to add the subcommand to.
    """
    parser_w = subparsers.add_parser("f", help="mono-nucleotide shuffling within a "
                                               "sliding window generator")
    parser_w.add_argument("-w", "--winlen", required=False, type=int, dest="winlen",
                          action="store", default=100,
                          help="Window length (default: 100 bp)")
    parser_w.add_argument("-s", "--step", required=False, type=int, dest="step",
                          action="store", default=1,
                          help="Sliding step (default: 1 bp)")
    parser_w.add_argument("-f", "--foreground", required=True, type=str,
                          dest="fg_file", action="store",
                          help="Foreground file in FASTA format [REQUIRED]")
    # nfold is a number of sequences, so its default carries no "bp" unit.
    parser_w.add_argument("-n", "--nfold", required=False, type=int,
                          dest="nfold", action="store", default=1,
                          help="How many background sequences per each foreground "
                               "sequence will be generated (default: 1)")
    # The main block calls ``func`` with the parsed namespace.
    parser_w.set_defaults(func=mononuc_shuffling_window_generator)
def dinuc_shuffling_arg_parsing(subparsers):
    """Register the "d" subcommand (di-nucleotide shuffling generator).

    Args:
        subparsers: Sub-parser collection, as returned by
            ``ArgumentParser.add_subparsers()``, to add the subcommand to.
    """
    nfold_help = ("How many background sequences per each foreground "
                  "sequence will be generated (default: 1)")
    dinuc_parser = subparsers.add_parser(
        "d", help="di-nucleotide shuffling generator"
    )
    dinuc_parser.add_argument(
        "-f", "--foreground", dest="fg_file", type=str, required=True,
        action="store", help="Foreground file in FASTA format [REQUIRED]"
    )
    dinuc_parser.add_argument(
        "-n", "--nfold", dest="nfold", type=int, required=False,
        action="store", default=1, help=nfold_help
    )
    # The main block calls ``func`` with the parsed namespace.
    dinuc_parser.set_defaults(func=dinuc_shuffling_generator)
def dinuc_window_shuffling_arg_parsing(subparsers):
    """Register the "w" subcommand (di-nucleotide window shuffling).

    Args:
        subparsers: Sub-parser collection, as returned by
            ``ArgumentParser.add_subparsers()``, to add the subcommand to.
    """
    parser_w = subparsers.add_parser("w", help="di-nucleotide shuffling within a sliding "
                                               "window generator")
    parser_w.add_argument("-w", "--winlen", required=False, type=int, dest="winlen",
                          action="store", default=100,
                          help="Window length (default: 100 bp)")
    parser_w.add_argument("-s", "--step", required=False, type=int, dest="step",
                          action="store", default=1,
                          help="Sliding step (default: 1 bp)")
    parser_w.add_argument("-f", "--foreground", required=True, type=str,
                          dest="fg_file", action="store",
                          help="Foreground file in FASTA format [REQUIRED]")
    # nfold is a number of sequences, so its default carries no "bp" unit.
    parser_w.add_argument("-n", "--nfold", required=False, type=int,
                          dest="nfold", action="store", default=1,
                          help="How many background sequences per each foreground "
                               "sequence will be generated (default: 1)")
    # The main block calls ``func`` with the parsed namespace.
    parser_w.set_defaults(func=dinuc_shuffling_window_generator)
def gc_compo_arg_parsing(subparsers):
    """Register the "g" subcommand (GC content-based background chooser).

    Args:
        subparsers: Sub-parser collection, as returned by
            ``ArgumentParser.add_subparsers()``, to add the subcommand to.
    """
    parser_g = subparsers.add_parser("g", help="GC content-based background chooser")
    parser_g.add_argument("-r", "--bgdirectory", required=True, type=str,
                          dest="bg_dir", action="store",
                          help="Background directory [REQUIRED]")
    parser_g.add_argument("-b", "--background", required=False, type=str,
                          dest="bg_file", action="store",
                          help="Background file in FASTA format")
    parser_g.add_argument("-f", "--foreground", required=True, type=str,
                          dest="fg_file", action="store",
                          help="Foreground file in FASTA format [REQUIRED]")
    # nfold is a number of sequences, so its default carries no "bp" unit.
    parser_g.add_argument("-n", "--nfold", required=False, type=int,
                          dest="nfold", action="store", default=1,
                          help="How many background sequences per each foreground "
                               "sequence will be chosen (default: 1)")
    # len_opt stays an int flag (0 or 1); the handler only tests its truthiness.
    parser_g.add_argument("-l", "--length", required=False, dest="len_opt",
                          action="store_const", const=1, default=0,
                          help="Try to match the length as closely as possible "
                               "(not set by default)")
    # The main block calls ``func`` with the parsed namespace.
    parser_g.set_defaults(func=gc_compo_generator)
def gc_compo_window_arg_parsing(subparsers):
    """Register the "c" subcommand (GC bins plus sliding-window GC chooser).

    Args:
        subparsers: Sub-parser collection, as returned by
            ``ArgumentParser.add_subparsers()``, to add the subcommand to.
    """
    # The two help fragments are concatenated, so the first must end in a space.
    parser_c = subparsers.add_parser("c", help="GC distribution and GC content within "
                                               "a sliding window background chooser")
    parser_c.add_argument("-r", "--bgdirectory", required=True, type=str,
                          dest="bg_dir", action="store",
                          help="Background directory [REQUIRED]")
    parser_c.add_argument("-b", "--background", required=False, type=str,
                          dest="bg_file", action="store",
                          help="Background file in FASTA format")
    parser_c.add_argument("-w", "--winlen", required=False, type=int, dest="winlen",
                          action="store", default=100,
                          help="Window length (default: 100 bp)")
    parser_c.add_argument("-s", "--step", required=False, type=int, dest="step",
                          action="store", default=1,
                          help="Sliding step (default: 1 bp)")
    parser_c.add_argument("-d", "--deviation", required=False, type=float,
                          dest="deviation", action="store", default=2.6,
                          help="Deviation from the mean (default: 2.6 for a threshold of "
                               "mean + 2.6 * stdev)")
    parser_c.add_argument("-f", "--foreground", required=True, type=str,
                          dest="fg_file", action="store",
                          help="Foreground file in FASTA format [REQUIRED]")
    # nfold is a number of sequences, so its default carries no "bp" unit.
    parser_c.add_argument("-n", "--nfold", required=False, type=int,
                          dest="nfold", action="store", default=1,
                          help="How many background sequences per each foreground "
                               "sequence will be chosen (default: 1)")
    # len_opt stays an int flag (0 or 1); the handler only tests its truthiness.
    parser_c.add_argument("-l", "--length", required=False, dest="len_opt",
                          action="store_const", const=1, default=0,
                          help="Try to match the length as closely as possible "
                               "(not set by default)")
    # The main block calls ``func`` with the parsed namespace.
    parser_c.set_defaults(func=gc_compo_window_generator)
def arg_parsing():
    """Build the command-line parser and parse ``sys.argv``.

    Registers the six subcommands ("m", "f", "d", "w", "g", "c"). Each one
    stores its handler in the ``func`` attribute of the returned namespace.

    A subcommand is mandatory: when none is given, argparse reports a usage
    error and exits, rather than returning a namespace without ``func``.

    Returns:
        The parsed ``argparse.Namespace``. It also carries ``subcommand``,
        the name of the selected subcommand.
    """
    descr = """Background generator with the possibility of using very different ways of
generating backgrounds lying into two categories:
- Creation of new random sequences (generators):
- mono-nucleotide shuffling using the foreground sequences
- mono-nucleotide shuffling within a sliding window using foreground sequences
- di-nucleotide shuffling using the foreground sequences
- di-nucleotide shuffling within a sliding window using foreground sequences
- Extraction of sequences from a set of possible background sequences (choosers):
- respecting the GC distribution of the foreground (using GC bins)
- respecting the GC distribution as in the previous item and also respecting
the GC composition within a sliding window for GC bin
"""
    parser = argparse.ArgumentParser(description=descr,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # ``dest`` gives argparse a name to use in the "required" error message.
    subparsers = parser.add_subparsers(help="Choice of the generator/chooser",
                                       title="Subcommands",
                                       description="Valid subcommands",
                                       dest="subcommand")
    # Set as an attribute rather than a keyword argument, since the
    # ``required`` keyword of add_subparsers only exists from Python 3.7.
    subparsers.required = True
    mononuc_shuffling_arg_parsing(subparsers)
    mononuc_window_shuffling_arg_parsing(subparsers)
    dinuc_shuffling_arg_parsing(subparsers)
    dinuc_window_shuffling_arg_parsing(subparsers)
    gc_compo_arg_parsing(subparsers)
    gc_compo_window_arg_parsing(subparsers)
    argu = parser.parse_args()
    return argu
###############################################################################
# MAIN
###############################################################################
if __name__ == "__main__":
    # Parse the command line, then hand the namespace to the handler that the
    # selected subcommand registered as ``func``.
    arguments = arg_parsing()
    subcommand_handler = arguments.func
    subcommand_handler(arguments)