-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmysMultisetEquality.c
233 lines (205 loc) · 6.83 KB
/
mysMultisetEquality.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#include <stdio.h>
#include <stdlib.h>
#include "SFMT.h"
#define NEW_LINE '\n' //<- New line character
#define P 2147483647UL //!< 2^31 - 1 = 2147483647
sfmt_t sfmt; //!< SIMD-oriented Fast Mersenne Twister
/**
* Basicly does x % P, but faster than the O3 compiler.
* Note that x must be smaller than 2*P
*/
unsigned long field(const unsigned long x);
/**
* Initilizes the Mersenne Twister with some truely random numbers,
* generated by RANDOM.ORG along with a user defined random number.
* Exists the program if the paramter contains a non-base 10 symbol.
*
* @param[in] seed A character string only containing values 0-9
*/
void init_mt(char *seed);
/**
* Works as follows:
* 1. Generates 2*80 + 2 random numbers
* 2. Constructs 'a' such that it contains 2*80 random numbers in Fp
* 3. Sets w[0] and w[1] to be random numbers in Fp
*
* @param[out] a List of 2*80 random numbers
* @param[out] w The two w-values used in f(w)
*/
void setAW(unsigned long *a, unsigned long *w);
/**
* Evaluates the univariate polynomial (w-h(x_1))(w-h(x_2))***(w-h(x_n)),
* for both w[0] and w[1] and sets val[0] and val[1] accordingly
*
* @param[in] content The content of a file (each line ends with '\n')
* @param[in] numChar Number of characters in content
* @param[in] w Variables for f(z), i.e. w[0] and w[1]
* @param[in] a A list of random numbers in Fp of size 2*80
* @param[out] val A list of size two, containing the value of each polynomial
*/
void evalPoly( const unsigned char *content,
const long numChar,
const unsigned long *w,
const unsigned long *a,
unsigned long *val);
/**
* Prints 'Yes' if X and Y represent the same multiset of lines.
* 'No' otherwise. Under the assumption that a line consists of
* at most 80 characters, and each character is represented by
* at most 16 bits, so each element in the multiset is represented
* by at most b = 80*16 = 1280 bits and a multiset contains at most
* n <= 2^24 lines this algorithm outputs correctly with probability
* 1-n/2147483647
*
* @param[in] argc The number of command line arguments
* @param[in] argv A pointer to an array of string where:
* There must be exactly 3 arguments where:
* -- The first argument is data-file-1 (X)
* -- The second argument is data-file-2 (Y)
* -- A user seed in base 10
* @return Exit code 0 on success, 1 otherwise
*/
int main(int argc, char *argv[]) {
FILE *fp1, *fp2; //!< file pointers
long fSize; //!< file size (in bytes)
long numChar; //!< number of chars in file
unsigned char *buffer; //!< buffer with the whole file
unsigned long a[2*80]; //!< mapping from 2^16 to Fp
unsigned long w[2]; //!< used to evaulate polynomial
unsigned long f1[2], f2[2]; //!< fingerprint of file 1 and 2
/* Initialize Mersenne Twister */
init_mt(argv[3]);
/* Initialize a and w */
setAW(a, w);
/* Open first file */
if((fp1 = fopen(argv[1], "r")) == NULL) {
fprintf(stderr, "Cannot open data-file-1\n");
exit(1);
}
/* Get the file size */
fseek(fp1, 0, SEEK_END);
fSize = ftell(fp1);
numChar = fSize/sizeof(unsigned char);
rewind(fp1);
/* Read the whole file into buffer */
buffer = (unsigned char *) malloc(sizeof(unsigned char)*(fSize));
if(fread(buffer, sizeof(unsigned char), fSize, fp1) != fSize) {
fprintf(stderr, "Could not read data-file-1\n");
exit(1);
}
/* Close data files (We are done with them!) */
fclose(fp1);
/* Fingerprint of data-file-1 */
evalPoly(buffer, numChar, w, a, f1);
/* Open second file */
if((fp2 = fopen(argv[2], "r")) == NULL) {
fprintf(stderr, "Cannot open data-file-2\n");
exit(1);
}
/* Read the whole file into (previus) buffer */
if(fread(buffer, sizeof(unsigned char), fSize, fp2) != fSize) {
fprintf(stderr, "Could not read data-file-2\n");
exit(1);
}
/* Close the file */
fclose(fp2);
/* Fingerprint of data-file-2 */
evalPoly(buffer, numChar, w, a, f2);
/*
* We have the following cases:
* - f1[0] - f2[0] == 0 and f1[1] - f2[1] != 0: 1st was false positive
* - f1[0] - f2[0] != 0 and f1[1] - f2[1] == 0: 2nd was false positive
* - f1[0] - f2[0] != 0 and f1[1] - f2[1] != 0: They are distinct
* - f1[0] - f2[0] == 0 and f1[1] - f2[1] == 0: They are probably equal
*/
if(((f1[0] - f2[0]) == 0) && ((f1[1] - f2[1]) == 0)) {
printf("Yes"); //<- With prob. 1-2^-12
} else {
printf("No");
}
/* Cleanup */
free(buffer);
return 0;
}
inline unsigned long field(const unsigned long x) {
unsigned long tmp = (x >> 31) + (x & P);
return (tmp < P) ? tmp : tmp - P;
}
inline void init_mt(char *seed) {
char *pEnd; //!< Used to check if user seed is parsable
/* Seed from
* http://www.random.org/integers/?num=9&min=1&max=1000000000&col=9&base=16&format=plain&rnd=date.2014-05-01
*/
// Seed the MT using 9 random integers and 1 from user input
uint32_t init[10] = { 0x012d661f, 0x399ba1e1, 0x01d033e9,
0x24ab21da, 0x3a1b142b, 0x21109a50,
0x20d005a4, 0x35215de9, 0x091c98f1};
// Parse user input
init[9] = strtol(seed, &pEnd, 10);
// Check if all was parsed
if(pEnd[0] != '\0') {
fprintf(stderr, "Could not convert user seed to an integer ('%c' is not in base 10)\n", pEnd[0]);
exit(1);
}
// Init SFMT
sfmt_init_by_array(&sfmt, init, 10);
}
void setAW(unsigned long *a, unsigned long *w) {
int min = sfmt_get_min_array_size32(&sfmt); //<- Min. size for rand
int nRandom = 162; //<- 2*80 for the 'a' array and 2 for w
if(min > 162) {
nRandom = min;
}
uint32_t *rand = malloc(sizeof(uint32_t)*nRandom);
/* Fill the array with 32-bit random uints */
sfmt_fill_array32(&sfmt, rand, nRandom);
/* Map the random integers to our field */
for(int i = 0; i < 2*80; i++) {
a[i] = field(rand[i]);
}
/* And set the w-values */
w[0] = field(rand[2*80]);
w[1] = field(rand[2*80 + 1]);
}
inline void evalPoly( const unsigned char *content,
const long numChar,
const unsigned long *w,
const unsigned long *a,
unsigned long *val) {
unsigned long h[2] = {0, 0}; //!< Accumulated value of h
unsigned long s; //!< Concatanation of c1 and c2 (block s)
int i = 0, c = 0; //!< Index variable
unsigned long val0 = 1,val1 = 1;//<- Tmp. value of f1(w) and f2(w)
unsigned long tmp; //<- Tmp. value of a*h and w-h
/* Example string 0x6D7973 (mys in ASCII) */
while (c < numChar) {
/* Calclulate h(X), i.e. a line */
for(/* */; content[c] != '\n'; c++) {
/* setAW = 0x6D00\n */
s = content[c] << 8;
if(content[c + 1] != '\n') {
/* s = 0x6D79 */
c++;
s |= content[c];
}
tmp = a[i ] * s;
h[0] = field(field(tmp) + h[0]);
tmp = a[i + 1] * s;
h[1] = field(field(tmp) + h[1]);
i += 2;
}
/* Calculate f(X) (partial) */
tmp = abs(w[0]-h[0]) * val0;
val0 = field(tmp);
tmp = abs(w[1]-h[1]) * val1;
val1 = field(tmp);
/* Reset counters */
h[0] = 0;
h[1] = 0;
i = 0;
/* Skip the new line just seen */
c++;
}
val[0] = val0; //<- We are done, update the values
val[1] = val1;
}