-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsparse-gen.cpp
146 lines (121 loc) · 4.83 KB
/
sparse-gen.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
#include <glog/logging.h>
#include "common/Random.h"
#include "common/Zipf.h"
#include "core/Context.h"
#include "core/Macros.h"
#include "core/SparseVector.h"
// Command-line flags specific to the sparse generator.
//
// sparsity:        copied into the Context by main() and multiplied by
//                  n_points * n_dimension to obtain the total element budget.
// zipf:            skew factor handed to Zipf::globalZipf().init() in main().
// format:          output layout; the help text lists every value that
//                  check_format() accepts.
// row_uncertainty: validated by check_uncertainty() to lie in [0, 1]; bounds
//                  the per-row scaling factor used by function_code_gen.
DEFINE_double(sparsity, 0.01, "sparsity of data");
DEFINE_double(zipf, 0, "skew factor on data columns");
DEFINE_string(format, "dok", "output format: (dok, line, dense)");
DEFINE_double(row_uncertainty, 0.0,
              "uncertainty on number of non-zero elements");
// Aborts the program (via glog CHECK) unless `format` is exactly one of
// "dok", "line" or "dense".
void check_format(const std::string &format) {
CHECK(format == "dok" || format == "line" || format == "dense")
<< "output format is not valid.";
}
// Aborts the program (via glog CHECK) unless `row_uncertainty` lies in the
// closed interval [0, 1].
void check_uncertainty(double row_uncertainty) {
CHECK(row_uncertainty >= 0 && row_uncertainty <= 1)
<< "valid uncertainty range [0, 1]";
}
// Rebuilds `vec` as a sparse row holding `size` (offset, value) entries.
//
// Offsets are drawn from Zipf::globalZipf() until `size` distinct ones have
// been collected, then emitted in ascending order; each entry's value is the
// next draw from `random`.
//
// vec:    row to overwrite (cleared first).
// random: value source; random.next() must return exactly DataType.
// size:   number of distinct offsets to place in the row.
//
// NOTE(review): the collection loop only ends once `size` distinct offsets
// have been seen, so the caller presumably must not ask for more offsets than
// the global Zipf generator can produce - confirm against Zipf.
template <class DataType, class RNG>
void sparse_data_gen(SparseVector<DataType> &vec, RNG &random,
                     std::size_t size) {
  vec.clear();

  // Phase 1: collect distinct column offsets.
  std::unordered_set<int> chosen;
  while (chosen.size() < size) {
    const int candidate = Zipf::globalZipf().next();
    chosen.insert(candidate);
  }

  // Phase 2: order the offsets so entries are appended left to right.
  std::vector<int> sorted_offsets(chosen.begin(), chosen.end());
  std::sort(sorted_offsets.begin(), sorted_offsets.end());

  // Phase 3: pair every offset with a freshly drawn value.
  for (const int offset : sorted_offsets) {
    auto value = random.next();
    static_assert(std::is_same<decltype(value), DataType>::value,
                  "the return type of random.next() different from data type "
                  "of dense vector");
    vec.push_back(std::make_tuple(offset, value));
  }
}
// function_code_gen(data_type): shared body of every *_data_gen function.
//
// Must be expanded in a scope that provides, by name:
//   context - the run Context,
//   r       - the value generator passed on to sparse_data_gen,
//   output  - the stream handed to SparseVector::print.
//
// For each of the context.n_points rows it
//   1. takes an even share of the elements that are still unassigned,
//   2. scales that share by a factor drawn from
//      [1 - row_uncertainty, 1 + row_uncertainty],
//   3. caps the scaled size at context.n_dimension,
//   4. fills the row with sparse_data_gen and prints it.
// Only the unscaled share is subtracted from the remaining budget.
//
// Comments inside the macro body use /* */ on purpose: a // comment followed
// by a line continuation would swallow the next line of the macro.
#undef function_code_gen
#define function_code_gen(data_type)                                         \
  UniformRealRandom r0(1 - context.row_uncertainty,                          \
                       1 + context.row_uncertainty);                         \
  uint64_t n_elements = context.n_elements;                                  \
  for (auto i = 0u; i < context.n_points; i++) {                             \
    SparseVector<data_type> v(i);                                            \
    int size = n_elements / (context.n_points - i);                          \
    int uncertain_size = size * r0.next();                                   \
    /* sparse_data_gen loops until it has collected this many DISTINCT */    \
    /* offsets; asking for more than the number of columns would never */    \
    /* terminate, so the request is capped at n_dimension.             */    \
    const std::size_t row_size =                                             \
        std::min(static_cast<std::size_t>(uncertain_size),                   \
                 static_cast<std::size_t>(context.n_dimension));             \
    sparse_data_gen(v, r, row_size);                                         \
    n_elements -= size;                                                      \
    v.print(output, context);                                                \
  }
// Generates sparse rows with bool values. Values come from a BinaryRandom
// built from context.bernoulli and context.seed; the per-row loop and the
// writes to `output` come from the function_code_gen macro, which picks up
// the locals `r`, `context` and `output` by name.
void binary_data_gen(const Context &context, std::ofstream &output) {
BinaryRandom r(context.bernoulli, context.seed);
function_code_gen(bool);
}
// Generates sparse rows with int values. Values come from a UniformIntRandom
// built from context.range_low, context.range_high and context.seed; the
// per-row loop and the writes to `output` come from the function_code_gen
// macro, which picks up the locals `r`, `context` and `output` by name.
void int_data_gen(const Context &context, std::ofstream &output) {
UniformIntRandom r(context.range_low, context.range_high, context.seed);
function_code_gen(int);
}
// Generates sparse rows with double values drawn from a UniformRealRandom
// built from context.range_low, context.range_high and context.seed; the
// per-row loop and the writes to `output` come from the function_code_gen
// macro, which picks up the locals `r`, `context` and `output` by name.
void real_uniform_data_gen(const Context &context, std::ofstream &output) {
UniformRealRandom r(context.range_low, context.range_high, context.seed);
function_code_gen(double);
}
// Generates sparse rows with double values drawn from a NormalRandom built
// from context.mean, context.stddev and context.seed; the per-row loop and
// the writes to `output` come from the function_code_gen macro, which picks
// up the locals `r`, `context` and `output` by name.
void real_normal_data_gen(const Context &context, std::ofstream &output) {
NormalRandom r(context.mean, context.stddev, context.seed);
function_code_gen(double);
}
// Generates sparse rows with double values drawn from a GammaRandom built
// from context.alpha, context.beta and context.seed; the per-row loop and
// the writes to `output` come from the function_code_gen macro, which picks
// up the locals `r`, `context` and `output` by name.
void real_gamma_data_gen(const Context &context, std::ofstream &output) {
GammaRandom r(context.alpha, context.beta, context.seed);
function_code_gen(double);
}
// Generates sparse rows with double values drawn from a WeibullRandom built
// from context.a, context.b and context.seed; the per-row loop and the
// writes to `output` come from the function_code_gen macro, which picks up
// the locals `r`, `context` and `output` by name.
void real_weibull_data_gen(const Context &context, std::ofstream &output) {
WeibullRandom r(context.a, context.b, context.seed);
function_code_gen(double);
}
// Entry point of the sparse data generator.
//
// Parses the command line, fills a Context (SETUP_CONTEXT plus the flags
// defined in this file), initialises the global Zipf generator and dispatches
// to the *_data_gen function selected by context.type and, for "real" data,
// context.distribution. Rows are written to the file named by context.file.
//
// Any invalid flag value, unknown type/distribution, or failure to open or
// write the output file aborts the program through a glog CHECK.
// Returns 0 on success.
int main(int argc, char *argv[]) {
  google::InitGoogleLogging(argv[0]);
  google::InstallFailureSignalHandler();
  google::ParseCommandLineFlags(&argc, &argv, true);

  Context context;
  SETUP_CONTEXT(context);

  // A sparsity outside [0, 1] would request more non-zero cells than the
  // matrix has, and row generation waits for that many distinct offsets.
  CHECK(FLAGS_sparsity >= 0 && FLAGS_sparsity <= 1)
      << "valid sparsity range [0, 1]";
  context.sparsity = FLAGS_sparsity;
  context.zipf = FLAGS_zipf;
  // 1LL forces the product to be evaluated in (at least) 64-bit arithmetic.
  context.n_elements =
      1LL * context.n_points * context.n_dimension * context.sparsity;

  check_uncertainty(FLAGS_row_uncertainty);
  context.row_uncertainty = FLAGS_row_uncertainty;

  check_format(FLAGS_format);
  context.format = FLAGS_format;

  const std::string file = context.file;
  std::ofstream output(file);
  // Fail loudly instead of generating data into a stream that discards it.
  CHECK(output.is_open()) << "cannot open output file: " << file;

  LOG(INFO) << "init zipf ...";
  Zipf::globalZipf().init(context.n_dimension, context.zipf);
  LOG(INFO) << "init zipf done.";

  if (context.type == "binary") {
    binary_data_gen(context, output);
  } else if (context.type == "int") {
    int_data_gen(context, output);
  } else if (context.type == "real") {
    if (context.distribution == "uniform") {
      real_uniform_data_gen(context, output);
    } else if (context.distribution == "normal") {
      real_normal_data_gen(context, output);
    } else if (context.distribution == "gamma") {
      real_gamma_data_gen(context, output);
    } else if (context.distribution == "weibull") {
      real_weibull_data_gen(context, output);
    } else {
      CHECK(false) << "wrong distribution type.";
    }
  } else {
    CHECK(false) << "wrong data type.";
  }

  // close() flushes; a failed write or flush leaves failbit/badbit set.
  output.close();
  CHECK(!output.fail()) << "failed writing output file: " << file;
  return 0;
}