-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabsolute_asymptote.cpp
130 lines (119 loc) · 5.01 KB
/
absolute_asymptote.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/* absolute_asymptote.cpp
Abhi Nellore
September 2, 2015
Consider an exon-exon junction filter where a junction is kept if and only
if it is found in >= some number K of RNA-seq samples analyzed.
This program takes S random samples of N RNA-seq samples of input junction
counts from a total of Q RNA-seq samples at various values of K and finds
the number of junctions J that make it past the filter, writing a table
to stdout whose rows have the following format.
N <tab> K <tab> J
The expected input from stdin is the junction output of Rail for all of SRA.
Note that some constants dependent on this input data are baked into the
code below.
Requires Boost.
*/
#include<iostream>
#include<vector>
#include<bitset>
#include<string>
#include<random>
#include "boost/multi_array.hpp"
const unsigned int INTRON_COUNT = 42882032; // Bake intron count in so bitsets can be preallocated
const unsigned int SAMPLE_COUNT = 21506; // Total number of samples
const unsigned int RANDOM_COUNT = 5; // Number of random samples S to take at each value of N
const unsigned int SAMPLE_INTERVAL = 100; // Minimum value of N defined above as well as interval between successive values pf N studied
const unsigned int SAMPLE_MAX = 20000; // Maximum value of N defined above as well as interval between successive values of N studied
const unsigned int COUNT_INTERVAL = 5; // Minimum value of K defined above as well as interval between successive values of K studied
const unsigned int COUNT_MAX = 100; // Maximum value of K studied
const unsigned int SEED = 5; // Or whatever; we used 5 for reproducibility
/* Sampling from range without replacement implementation inspired
by http://stackoverflow.com/questions/28287138/c-randomly-sample-k-numbers-from-range-0n-1-n-k-without-replacement */
std::bitset<SAMPLE_COUNT> randomSet(int r, std::default_random_engine& gen)
{
std::bitset<SAMPLE_COUNT> randomSample;
for (int k = SAMPLE_COUNT - r; k < SAMPLE_COUNT; ++k) {
int v = std::uniform_int_distribution<>(1, k)(gen);
// there are two cases.
// v is not in candidates ==> add it
// v is in candidates ==> well, k is definitely not, because
// this is the first iteration in the loop that we could've
// picked something that big.
if (!randomSample[v]) {
randomSample.set(v);
} else {
randomSample.set(k);
}
}
return randomSample;
}
std::bitset<SAMPLE_COUNT> intronFromLine(const std::string &str) {
std::bitset<SAMPLE_COUNT> intronInSampleQ;
if (!str.length()) return intronInSampleQ;
int tabCount = 0, i = 0;
while (tabCount < 6) {
if (str[i] == '\t') tabCount++;
i++;
}
int startIndex = i, numLength = 0;
while (str[i] != '\t') {
if (str[i] == ',') {
intronInSampleQ.set(std::stoi(str.substr(startIndex, numLength)));
startIndex = i + 1;
numLength = 0;
} else {
numLength += 1;
}
i++;
}
intronInSampleQ.set(std::stoi(str.substr(startIndex, numLength)));
return intronInSampleQ;
}
int main() {
int rowCount = SAMPLE_MAX / SAMPLE_INTERVAL;
int columnCount = COUNT_MAX / COUNT_INTERVAL + 1;
std::cerr << "Reservoir sampling to obtain random bitsets..." << std::endl;
std::default_random_engine gen(SEED);
std::vector<std::bitset<SAMPLE_COUNT> > randomSamples(rowCount * RANDOM_COUNT);
std::vector<int > randomSampleCounts(rowCount * RANDOM_COUNT);
int currentRandomSampleCount;
int k = 0;
for (int i = 0; i < rowCount; i++) {
currentRandomSampleCount = (i + 1) * SAMPLE_INTERVAL;
for (int j = 0; j < RANDOM_COUNT; j++) {
randomSamples[k] = randomSet(currentRandomSampleCount, gen);
randomSampleCounts[k] = randomSamples[k].count();
k += 1;
}
}
std::cerr << "Reading junctions and applying filters..." << std::endl;
typedef boost::multi_array<int, 2> array_type;
typedef array_type::index index;
array_type junctionCounts(boost::extents[randomSamples.size()][columnCount]);
std::fill( junctionCounts.origin(), junctionCounts.origin() + junctionCounts.size(), 0 );
int numSamplesWithJunction;
std::string str;
int count;
while (getline(std::cin, str)) {
std::bitset<SAMPLE_COUNT> intronInSampleQ = intronFromLine(str);
for (int k = 0; k < columnCount; k++) {
count = k * COUNT_INTERVAL;
for (int i = 0; i < randomSamples.size(); i++) {
numSamplesWithJunction = (randomSamples[i] & intronInSampleQ).count();
if (numSamplesWithJunction >= count && numSamplesWithJunction > 0) {
junctionCounts[i][k]++;
}
}
}
}
std::cerr << "Dumping output..." << std::endl;
for (int k = 0; k < columnCount; k++) {
count = k * COUNT_INTERVAL;
for (int i = 0; i < randomSampleCounts.size(); i++) {
std::cout << randomSampleCounts[i] << '\t' << count << '\t'
<< junctionCounts[i][k] << std::endl;
}
}
std::cerr << "Done." << std::endl;
return 0;
}