-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset.h
272 lines (228 loc) · 7.36 KB
/
dataset.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#ifndef __DATASET_H
#define __DATASET_H
#include <vector>
#include <assert.h>
using namespace std;
// elmentary byte datatype
typedef char etype;
// array of bytes datatype
typedef vector<etype> dtype;
// array iterator type
typedef dtype::iterator itype;
// #define count_bits(c) (c&1)+((c>>1)&1)+((c>>2)&1)+((c>>3)&1)+((c>>4)&1)+((c>>5)&1)+((c>>6)&1)+((c>>7)&1)
// Look up table to count bits in a byte in constant time
const size_t count_bits[]={0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2,
3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3,
4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2,
3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3,
4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3,
4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4,
4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
// An Itemset contains multiple bytes which is a bitset presentation
class Itemset{
friend class Dataset;
// Starting address
itype address;
// Number of bytes
size_t length;
// Data - if allocated
dtype data;
public:
Itemset(itype t_address, size_t t_length): address(t_address), length(t_length) {}
Itemset(itype begin, itype end):length(end-begin),data(begin,end){
address=data.begin();
}
Itemset(size_t t_length):length(t_length), data(t_length, 0){
address=data.begin();
}
Itemset(dtype t_data):data(t_data){
address=data.begin();
length=data.size();
}
// indexing
inline char& operator [] (size_t i){
assert (i<length);
return address[i];
}
// begin iterator
inline itype begin() const{
return address;
}
// end iterator
inline itype end() const{
return address+length;
}
// bitwise and -> set intersection
Itemset operator &(Itemset I) const{
Itemset ret(length);
for (unsigned i=0;i<length;i++){
ret[i]=address[i] & I.address[i];
}
return ret;
}
// bitwise or -> set union
inline Itemset operator |(Itemset I) const{
Itemset ret(length);
for (unsigned i=0;i<length;i++){
ret[i]=address[i] | I.address[i];
}
return ret;
}
// bitwise not
inline Itemset operator ~() const{
Itemset ret(length);
for (itype a1=address,a2=ret.address;a1!=address+length;a1++,a2++){
*a2=~*a1;
}
return ret;
}
// For binary search
inline bool operator >(Itemset I) const{
for (long i=length-1;i>=0;i++){
if((unsigned char)address[i] > (unsigned char)I.address[i]) return true;
if((unsigned char)address[i] < (unsigned char)I.address[i]) return false;
}
return false;
}
// for binary search
inline bool operator <(Itemset I) const{
for (long i=length-1;i>=0;i++){
if((unsigned char)address[i] < (unsigned char)I.address[i]) return true;
if((unsigned char)address[i] > (unsigned char)I.address[i]) return false;
}
return false;
}
// for binary search
inline bool operator == (Itemset I) const{
for (unsigned i=0;i<length;i++){
if(address[i] != I.address[i]) return false;
}
return true;
}
// determine if subset -> set subset operation
inline bool is_subset_of(Itemset I) const{
int not_subset=0;
for (itype a1=address,a2=I.address;a1!=address+length;a1++,a2++){
not_subset = not_subset | (*a1 &(~*a2));
}
return !not_subset;
}
// Get matching number of bits in the beginning -> used for sibling determination
inline size_t match_start(Itemset I){
size_t bits=0;
etype a1,a2,b1,b2;
for(unsigned i=0;i<length;i++){
a1=address[i];
a2=I.address[i];
if (a1==a2){
bits+=count_bits[(unsigned char)a1];
} else {
while (a1)
{
b1=a1&1;
b2=a2&1;
if (b1!=b2) break;
bits+=b1;
a1>>=1;
a2>>=1;
}
break;
}
}
return bits;
}
// Set contains operation if a bit is set
inline bool contains(size_t i) const{
return (address[i>>3]>>(i&7))&1;
}
// Add an item -> set a bit
inline void add_item(size_t i){
unsigned b= i>>3;
address[b]=address[b]|(1<<(i&7));
}
// Get byte length
inline size_t get_len(){
return length;
}
// Count number of set bits
inline size_t count_items(){
size_t num_items=0;
etype c;
for (itype b=address;b!=address+length;b++){
c=*b;
num_items+=count_bits[(unsigned char)c];
}
return num_items;
}
// Get items as set of integers
vector<size_t> items() const{
vector<size_t> v;
for (size_t i=0;i<length*8;i++){
if (this->contains(i)) v.push_back(i);
}
return v;
}
};
// Stores itemsets in a contiguos manner in the memory
class Dataset{
// data -> vector of bytes
dtype data;
// byte length of each transaction
size_t trans_len;
public:
Dataset(size_t transaction_length):trans_len(transaction_length){}
Dataset(dtype data_buffer, size_t transaction_length):
data(data_buffer), trans_len(transaction_length) {}
Dataset(itype begin, itype end, size_t transaction_length):
data(begin, end), trans_len(transaction_length) {}
Dataset(size_t transaction_length, size_t dataset_length):
trans_len(transaction_length){
data.reserve(trans_len*dataset_length);
}
// indexing
inline Itemset operator [] (size_t i){
assert (i*trans_len<data.size());
return Itemset(data.begin()+(i*trans_len), trans_len);
}
// Reserve memory for later push_backs
inline void reserve(size_t items){
data.reserve(items*trans_len);
}
// Get transaction lengths in bytes
inline size_t get_trans_len() const{
return trans_len;
}
// get number of transactions
inline size_t get_length() const{
return data.size()/trans_len;
}
// get memory address of the data
inline etype* get_data(){
return data.data();
}
// get size of the data
inline size_t get_size() const{
return data.size();
}
// Push back an Itemset
inline void push_back(const Itemset I){
data.insert(data.end(), I.begin(),I.end());
}
// Pop back and Itemset
inline Itemset pop_back(){
Itemset back(data.end()-trans_len,data.end());
data.resize(data.size()-trans_len);
return back;
}
// Swap data
inline void swap(Dataset& D){
data.swap(D.data);
}
// Swap data
inline void swap_data(dtype& d){
data.swap(d);
}
};
#endif