IMPORT $ AS ML_Core;
IMPORT ML_Core.Types AS Types;
IMPORT ML_Core.Utils AS Utils;
IMPORT Std.System.ThorLib;
/**
* Calculate various statistical aggregations of the fields in
* a NumericField dataset.
* @param d The dataset to be aggregated.
**/
EXPORT FieldAggregates(DATASET(Types.NumericField) d) := MODULE
// Simple statistics
SingleField := RECORD
d.wi;
d.number;
Types.t_fieldreal minval :=MIN(GROUP,d.Value);
Types.t_fieldreal maxval :=MAX(GROUP,d.Value);
Types.t_fieldreal sumval :=SUM(GROUP,d.Value);
Types.t_fieldreal countval:=COUNT(GROUP);
Types.t_fieldreal mean :=AVE(GROUP,d.Value);
Types.t_fieldreal var :=VARIANCE(GROUP,d.Value);
Types.t_fieldreal sd :=SQRT(VARIANCE(GROUP,d.Value));
END;
/**
* Calculate basic statistics about each field.
* <p>Calculates: min, max, sum, count, mean, variance, and
* standard deviation for each field.
* <p>There are no parameters.
* <p>Example:
* <pre>myAggs := FieldAggregates(myDS).simple;</pre>
**/
EXPORT Simple:=TABLE(d,SingleField, wi, Number,FEW);
// Simple Ranked
RankableField := RECORD
d;
UNSIGNED Pos := 0;
END;
Sorted_D := SORT(D,wi,Number,Value);
T := TABLE(Sorted_D,RankableField);
P := Utils.SequenceInField(T, Number, Pos, wi);
/**
* Calculate the rank (order) of each cell for each field.
* <p>The returned data adds a 'Pos' field to each cell, indicating
*    its rank within its field number.
* <p> There are no parameters.
* <p> Example:
* <pre>myRankedDS := FieldAggregates(myDS).SimpleRanked;</pre>
**/
EXPORT SimpleRanked := P;
// Medians
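// For each field, pick the middle position when the count is odd, or the
// two middle positions when it is even; the median is then that single
// value, or the average of the two values (SUM(GROUP,value)/2 below).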
dMedianPos:=TABLE(SimpleRanked,
{wi; number;
SET OF UNSIGNED pos:=IF(MAX(GROUP,pos)%2=0,
[(MAX(GROUP,pos))/2,(MAX(GROUP,pos))/2+1],
[(MAX(GROUP,pos)/2)+1]);},
wi, number, FEW);
dMedianValues:=JOIN(SimpleRanked,dMedianPos,
LEFT.wi=RIGHT.wi AND LEFT.number=RIGHT.number
AND LEFT.pos IN RIGHT.pos,
TRANSFORM({RECORDOF(SimpleRanked) AND NOT [id,pos];},
SELF:=LEFT;),
LOOKUP);
/**
* Calculate the median value of each field.
* <p> There are no parameters.
* @return DATASET({wi, number, median}), one record per work-item and field number.
* <p>Example:
* <pre>myFieldMedians := FieldAggregates(myDS).Medians;</pre>
**/
EXPORT Medians:=TABLE(dMedianValues,
{wi, number;
TYPEOF(dMedianValues.value) median:=IF(COUNT(GROUP)=1,
MIN(GROUP,value),
SUM(GROUP,value)/2);},
wi, number,FEW);
// Min median
nextRec:= RECORD(RECORDOF(Medians))
TYPEOF(SimpleRanked.value) nextval;
END;
// MinMedNext is used by KDTree: when median = minval, nextval is used
// instead of the median in order to avoid endless right-node propagation.
// NOTE: when no RIGHT record is present, MAX(RIGHT.value, LEFT.median) works
// only for positive LEFT.median values; otherwise it returns 0
// (0 is greater than a negative number).
dNextVals:= JOIN(Medians, SimpleRanked,
LEFT.wi=RIGHT.wi AND LEFT.number = RIGHT.number
AND LEFT.median < RIGHT.value,
TRANSFORM(nextRec, SELF:= LEFT,
SELF.nextval:= MAX(RIGHT.value, LEFT.median)),
KEEP(1), LEFT OUTER);
EXPORT MinMedNext:= JOIN(dNextVals, Simple,
LEFT.wi=RIGHT.wi
AND LEFT.number = RIGHT.number, LOOKUP);
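// Example (a usage sketch; 'myDS' is the hypothetical NumericField dataset
// used in the examples above):
//   minMedNext := FieldAggregates(myDS).MinMedNext;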
// Buckets
{RECORDOF(SimpleRanked);Types.t_Discrete bucket;}
tAssign(SimpleRanked L,Simple R,Types.t_Discrete n):=TRANSFORM
SELF.bucket:=IF(L.value=R.maxval,
n,
(Types.t_Discrete)(n*((L.value-R.minval)/(R.maxval-R.minval)))+1);
SELF:=L;
END;
/**
* Bucketize the datapoints into N buckets for each field.
* <p>Bucketization splits the range of the data into N equal-width
*    buckets. The data will not normally be split evenly
*    among the buckets unless it is uniformly distributed. Contrast this
*    with N-tile, where the data is split nearly evenly.
* @param n The number of buckets to use.
* @return DATASET OF {wi, id, number, value, pos, bucket}, where
* pos is the rank within each field, and bucket is the
* bucket number.
**/
EXPORT Buckets(Types.t_Discrete n)
:=JOIN(SimpleRanked,Simple,
LEFT.wi=RIGHT.wi AND LEFT.number=RIGHT.number,
tAssign(LEFT,RIGHT,n),LOOKUP);
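// Example (a usage sketch; 'myDS' as in the examples above):
//   myBuckets := FieldAggregates(myDS).Buckets(10);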
/**
* Return the ranges associated with each of the N buckets
* as computed by 'Buckets' above.
* @param n The number of buckets.
* @return DATASET OF {wi, number, bucket, Min, Max, and cnt}, one record
*         for each bucket for each field.
**/
EXPORT BucketRanges(Types.t_Discrete n)
:=TABLE(Buckets(n),
{wi, number;bucket;
Types.t_fieldreal Min:=MIN(GROUP,value);
Types.t_fieldreal Max:=MAX(GROUP,value);
UNSIGNED cnt:=COUNT(GROUP);},
wi, number, bucket);
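// Example (a usage sketch; 'myDS' as in the examples above):
//   myBucketRanges := FieldAggregates(myDS).BucketRanges(10);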
// Modes
MR := RECORD
SimpleRanked.wi;
SimpleRanked.Number;
SimpleRanked.Value;
Types.t_FieldReal Pos := AVE(GROUP,SimpleRanked.Pos);
UNSIGNED valcount:=COUNT(GROUP);
END;
SHARED T := TABLE(SimpleRanked, MR, wi, Number, Value);
dModeVals:=TABLE(T,{wi,number;UNSIGNED modeval:=MAX(GROUP,valcount);},
wi, number,FEW);
/**
* Calculate the mode (i.e., the most common value) for each field.
*
* <p>There are no parameters.
* @return DATASET OF {wi, number, mode, cnt}, one record per modal value
*         per field (more than one if several values tie for the mode).
*         'cnt' is the number of times the mode value occurred.
**/
EXPORT Modes:=JOIN(T,dModeVals,
LEFT.wi=RIGHT.wi AND LEFT.number=RIGHT.number
AND LEFT.valcount=RIGHT.modeval,
TRANSFORM({TYPEOF(T.wi) wi; TYPEOF(T.number) number;
TYPEOF(T.value) mode; UNSIGNED cnt},
SELF.cnt := LEFT.valcount;
SELF.mode:=LEFT.value;
SELF:=LEFT;),
LOOKUP);
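// Example (a usage sketch; 'myDS' as in the examples above):
//   myModes := FieldAggregates(myDS).Modes;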
// Cardinality
/**
* Return the cardinality of each field, that is, the number of distinct
* values occurring in each field.
* <p>There are no parameters.
* @return DATASET OF {wi, number, cardinality}, one per field.
**/
EXPORT Cardinality:=TABLE(T,{wi, number;
UNSIGNED cardinality:=COUNT(GROUP);},wi, number);
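// Example (a usage sketch; 'myDS' as in the examples above):
//   myCardinality := FieldAggregates(myDS).Cardinality;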
// Ranked
SHARED AveRanked := RECORD
d;
Types.t_FieldReal Pos;
END;
AveRanked Into(D le,T ri) := TRANSFORM
SELF.Pos := ri.pos;
SELF := le;
END;
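/**
* Rank each datapoint within its field, as in SimpleRanked, except that
* tied values share the average of their rank positions.
* <p>There are no parameters.
* @return DATASET OF {wi, id, number, value, pos}, where pos is the
*         (possibly fractional) rank within each field.
* <p>Example:
* <pre>myRanked := FieldAggregates(myDS).RankedInput;</pre>
**/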
EXPORT RankedInput := JOIN(D,T,
LEFT.wi=RIGHT.wi AND LEFT.Number=RIGHT.Number
AND LEFT.Value = RIGHT.Value,
Into(LEFT,RIGHT));
// N-Tile
{RECORDOF(RankedInput);Types.t_Discrete ntile;}
tNTile(RankedInput L,Simple R,Types.t_Discrete n):=TRANSFORM
SELF.ntile:=IF(L.pos=R.countval,n,
(Types.t_Discrete)(n*((L.pos-1)/R.countval))+1);
SELF:=L;
END;
/**
* Calculate the N-tile of each datapoint within its field.
* For example, if N is 100, we calculate percentiles.
* @param n The number of groups into which to balance the data.
* @return DATASET OF {wi, id, number, value, pos, ntile}, where
* pos is the rank within each field.
**/
EXPORT NTiles(Types.t_Discrete n):=JOIN(RankedInput, Simple,
LEFT.wi=RIGHT.wi
AND LEFT.number=RIGHT.number,
tNTile(LEFT,RIGHT,n),LOOKUP);
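// Example (a usage sketch; 'myDS' as in the examples above):
//   myPercentiles := FieldAggregates(myDS).NTiles(100);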
/**
* Return the ranges associated with each of the N-tile groups
* as computed by 'NTiles' above.
* @param n The number of N-tile groups.
* @return DATASET OF {wi, number, ntile, Min, Max, and cnt}, one record
*         for each N-tile group for each field.
**/
EXPORT NTileRanges(Types.t_Discrete n):=TABLE(NTiles(n),
{wi;number;ntile;
Types.t_fieldreal Min:=MIN(GROUP,value);
Types.t_fieldreal Max:=MAX(GROUP,value);
UNSIGNED cnt:=COUNT(GROUP);},
wi, number, ntile);
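// Example (a usage sketch; 'myDS' as in the examples above):
//   myPercentileRanges := FieldAggregates(myDS).NTileRanges(100);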
// N-Histogram
{RECORDOF(d);Types.t_Discrete hbin;}
tHistBin(RECORDOF(d) L,Simple R,Types.t_Discrete n):=TRANSFORM
SELF.hbin := IF(L.value = R.maxval, n,
(Types.t_Discrete) n*(L.value - R.minval)/(R.maxval - R.minval)+1);
SELF:=L;
END;
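/**
* Assign each datapoint to one of N equal-width histogram bins for its
* field, based on its position within the field's [minval, maxval] range.
* Similar to Buckets, but computed directly from the raw data, so no rank
* position is included in the result.
* @param n The number of bins to use.
* @return DATASET OF {wi, id, number, value, hbin}, where hbin is the
*         bin number.
**/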
EXPORT HistBins(Types.t_Discrete n):=JOIN(d, Simple,
LEFT.wi=RIGHT.wi
AND LEFT.number=RIGHT.number,
tHistBin(LEFT,RIGHT,n),LOOKUP);
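/**
* Return the ranges associated with each of the N histogram bins
* as computed by 'HistBins' above.
* @param n The number of bins.
* @return DATASET OF {wi, number, hbin, Min, Max, and cnt}, one record
*         for each bin for each field.
**/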
EXPORT HistBinRanges(Types.t_Discrete n):=TABLE(HistBins(n),
{wi;number;hbin;
Types.t_fieldreal Min:=MIN(GROUP,value);
Types.t_fieldreal Max:=MAX(GROUP,value);
UNSIGNED cnt:=COUNT(GROUP);},
wi, number, hbin);
// Pearson product-moment correlation matrix
SHARED dChildRec := RECORD
d.id;
d.value;
Types.t_FieldReal Pos;
END;
SHARED dDenormRec := RECORD
d.wi;
d.number;
DATASET(dChildRec) d := DATASET([], dChildRec);
END;
dParent := PROJECT(Simple, dDenormRec);
dDenormRec tDenorm(dParent L, AveRanked R) := TRANSFORM
SELF.d := L.d + PROJECT(DATASET(R),dChildRec);
SELF := L;
END;
SHARED dDenorm := DENORMALIZE(dParent, RankedInput,
LEFT.wi = RIGHT.wi
AND LEFT.number = RIGHT.number,
tDenorm(LEFT,RIGHT));
SHARED combineRec := RECORD(dChildRec)
Types.t_fieldreal value2;
END;
SHARED corrRec := RECORD
d.wi;
Types.t_Discrete number1;
Types.t_Discrete number2;
Types.t_fieldreal Correl;
END;
corrRec tPearson(dDenormRec L, dDenormRec R) := TRANSFORM
SELF.number1 := L.number;
SELF.number2 := R.number;
LRd := JOIN(L.d,R.d,LEFT.id = RIGHT.id,
TRANSFORM(combineRec,
SELF.value2 := RIGHT.value,
SELF := LEFT));
SELF.Correl := CORRELATION(LRd, value, value2);
SELF := L;
END;
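/**
* Calculate the Pearson product-moment correlation between each pair of
* fields within each work-item.
* <p>There are no parameters.
* @return DATASET OF {wi, number1, number2, Correl}, one record for each
*         pair of field numbers with number1 <= number2.
* <p>Example:
* <pre>myPearson := FieldAggregates(myDS).PearsonCorr;</pre>
**/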
EXPORT PearsonCorr := JOIN(dDenorm, dDenorm,
LEFT.wi=RIGHT.wi AND LEFT.number <= RIGHT.number,
tPearson(LEFT,RIGHT));
// Spearman's rho correlation matrix
corrRec tSpearman(dDenormRec L, dDenormRec R) := TRANSFORM
SELF.number1 := L.number;
SELF.number2 := R.number;
LRd := JOIN(L.d,R.d,LEFT.id = RIGHT.id,
TRANSFORM(combineRec,
SELF.value := LEFT.pos,
SELF.value2 := RIGHT.pos,
SELF := LEFT));
SELF.Correl := CORRELATION(LRd, value, value2);
SELF := L;
END;
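/**
* Calculate Spearman's rho between each pair of fields within each
* work-item, computed as the Pearson correlation of the rank positions.
* <p>There are no parameters.
* @return DATASET OF {wi, number1, number2, Correl}, one record for each
*         pair of field numbers with number1 <= number2.
**/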
EXPORT SpearmanCorr := JOIN(dDenorm, dDenorm,
LEFT.wi=RIGHT.wi AND LEFT.number <= RIGHT.number,
tSpearman(LEFT,RIGHT));
// Kendall's tau-b correlation matrix
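// tau_b = (n_c - n_d) / SQRT((n - n_t_L) * (n - n_t_R)), where n is the
// total number of pairs of observations, n_c and n_d are the concordant
// and discordant pair counts, and n_t_L / n_t_R are the numbers of tied
// pairs in each variable.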
KendallCompRec := RECORD
UNSIGNED2 Concordant;
UNSIGNED2 Discordant;
END;
KendallTiesRec := RECORD
n_t := COUNT(GROUP) * (COUNT(GROUP) - 1) / 2;
END;
KendallCompRec KendallComp(combineRec L, combineRec R) := TRANSFORM
SELF.Concordant := IF((L.value < R.value AND L.value2 < R.value2) OR
(L.value > R.value AND L.value2 > R.value2),
1, 0);
SELF.Discordant := IF((L.value < R.value AND L.value2 > R.value2) OR
(L.value > R.value AND L.value2 < R.value2),
1, 0);
END;
REAL8 Kendall(DATASET(combineRec) LRd) := FUNCTION
components := JOIN(LRd, LRd, LEFT.id <> RIGHT.id AND LEFT.id < RIGHT.id, KendallComp(LEFT,RIGHT), ALL);
n_t_L := SUM(TABLE(LRd, KendallTiesRec, value ), n_t);
n_t_R := SUM(TABLE(LRd, KendallTiesRec, value2), n_t);
n_c := SUM(components, Concordant);
n_d := SUM(components, Discordant);
n := COUNT(components);
tau := (n_c - n_d) / SQRT((n - n_t_L) * (n - n_t_R));
RETURN tau;
END;
corrRec tKendall(dDenormRec L, dDenormRec R) := TRANSFORM
SELF.number1 := L.number;
SELF.number2 := R.number;
LRd := JOIN(L.d,R.d,LEFT.id = RIGHT.id,
TRANSFORM(combineRec,
SELF.value := LEFT.pos,
SELF.value2 := RIGHT.pos,
SELF := LEFT));
SELF.Correl := Kendall(LRd);
SELF := L;
END;
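/**
* Calculate Kendall's tau-b between each pair of fields within each
* work-item, using the rank positions and correcting for ties.
* <p>There are no parameters.
* @return DATASET OF {wi, number1, number2, Correl}, one record for each
*         pair of field numbers with number1 <= number2.
**/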
EXPORT KendallCorr := JOIN(dDenorm, dDenorm,
LEFT.wi=RIGHT.wi AND LEFT.number <= RIGHT.number,
tKendall(LEFT,RIGHT));
// Generalized Spearman rho correlation matrix (each independent vs dependent)
// See the R package 'Hmisc' by Frank Harrell (and the function 'biVar').
corrRec tGenSpearman2(dDenormRec L, dDenormRec R) := TRANSFORM
SELF.number1 := L.number;
SELF.number2 := R.number;
LRd := JOIN(L.d,R.d,LEFT.id = RIGHT.id,
TRANSFORM(combineRec,
SELF.value := LEFT.pos,
SELF.value2 := RIGHT.pos,
SELF := LEFT));
unique := COUNT(TABLE(LRd, {value2}, value2));
XYSpearman := CORRELATION(LRd, value, value2);
XSqYSpearman := CORRELATION(LRd, value, POWER(value2,2));
XXSqSpearman := CORRELATION(LRd, value2, POWER(value2,2));
SELF.Correl := IF(unique < 3,
POWER(XYSpearman, 2),
(POWER(XYSpearman,2) - 2 * XYSpearman * XSqYSpearman * XXSqSpearman + POWER(XSqYSpearman,2))
/ (1 - POWER(XXSqSpearman,2))
);
SELF := L;
END;
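/**
* Calculate a generalized Spearman statistic of each field against the
* dependent field: the squared multiple correlation of the dependent
* field's ranks on each other field's ranks and squared ranks
* (cf. 'biVar'/'spearman2' in the R 'Hmisc' package).
* @param dep The field number of the dependent field. Default is 1.
* @return DATASET OF {wi, number1, number2, Correl}, one record per
*         work-item for each field other than 'dep', with number1 = dep.
* <p>Example:
* <pre>mySp2 := FieldAggregates(myDS).GenSpearman2Corr(1);</pre>
**/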
EXPORT GenSpearman2Corr(Types.t_FieldNumber dep = 1) := JOIN(dDenorm, dDenorm,
LEFT.wi=RIGHT.wi AND LEFT.number = dep AND RIGHT.number <> dep,
tGenSpearman2(LEFT,RIGHT));
// End Field Aggregates
END;