-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimpurity_measures.c
345 lines (303 loc) · 12 KB
/
impurity_measures.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
/****************************************************************/
/* Copyright 1993, 1994 */
/* Johns Hopkins University */
/* Department of Computer Science */
/****************************************************************/
/* Contact : murthy@cs.jhu.edu */
/****************************************************************/
/* File Name : impurity_measures.c */
/* Author : Sreerama K. Murthy */
/* Last modified : July 1994 */
/* Contains modules : maxminority */
/* summinority */
/* variance */
/* info_gain */
/* gini_index */
/* twoing */
/* Uses modules in : compute_impurity.c */
/* oc1.h */
/* Is used by modules in : compute_impurity.c */
/* Remarks : If the user wants to "plug-in" a new */
/* impurity measure, that function should */
/* look like the functions in this file. */
/* It is also probably convenient to */
/* concatenate such new measures to */
/* this file, so that the user needn't */
/* bother with the global declarations and */
/* changing the makefile. */
/* Note that none of the measure use any */
/* input parameters. The only information */
/* used is the number of dimensions, the */
/* number of classes (categories) and the */
/* counts of points of each class on the */
/* left and right of the hyperplane. */
/* All measures return a nonnegative float */
/* impurity value, where the lower the */
/* impurity, the better the hyperplane. */
/* All measures are called from the module */
/* Compute_Impurity. */
/****************************************************************/
#include "oc1.h"
extern int no_of_dimensions;
extern int *left_count,*right_count;
extern int no_of_categories;
int largest_element();
/************************************************************************/
/* Module name : maxminority */
/* Functionality : Suggested by Heath et al in their IJCAI-93 paper*/
/* Has the theoretical advantage that the depth of */
/* of the resulting tree has to be logarithmic in */
/* the number of instances. Can be computed very */
/* efficiently - no log computations. */
/* Returns : larger of the minority values on the left and right. */
/* (minority on a side = sum of the counts of all classes */
/* except the class with the highest count.) */
/* Calls modules : largest_element (compute_impurity.c) */
/************************************************************************/
float maxminority()
{
int i,j,lminor=0,rminor=0;
i = largest_element(left_count,no_of_categories);
if (i <= no_of_categories)
for (j=1;j<=no_of_categories ;j++)
if (i != j) lminor += left_count[j];
i = largest_element(right_count,no_of_categories);
if (i <= no_of_categories)
for (j=1;j<=no_of_categories ;j++)
if (i != j) rminor += right_count[j];
if (lminor > rminor)
return((float)lminor);
else return((float)rminor);
}
/************************************************************************/
/* Module name : summinority */
/* Functionality : Suggested by Heath et al in their IJCAI-93 paper*/
/* Is intuitively most appealing, and can be */
/* computed efficiently - no log computations. */
/* Does well on several domains, in spite of its */
/* simplicity. */
/* Returns : sum of the minority values on the left and right. */
/* (minority on a side = sum of the counts of all classes */
/* except the class with the highest count.) */
/* Calls modules : largest_element (compute_impurity.c) */
/************************************************************************/
float summinority()
{
int i,j,lminor=0,rminor=0;
i = largest_element(left_count,no_of_categories);
if (i <= no_of_categories)
for (j=1;j<=no_of_categories ;j++)
if (i != j) lminor += left_count[j];
i = largest_element(right_count,no_of_categories);
if (i <= no_of_categories)
for (j=1;j<=no_of_categories ;j++)
if (i != j) rminor += right_count[j];
return((float)(lminor+rminor));
}
/************************************************************************/
/* Module name : variance */
/* Functionality : Suggested by Heath et al in their IJCAI-93 paper*/
/* as the Sum_of_Impurities measure. */
/* Returns : sum of the category number variances on the */
/* left and right, making adjustments for bias. */
/* Variance on a side = sigma ((xi - avg)*(xi - avg)) */
/* i=1..k */
/* where x1,..xk are the classes (categories) of the */
/* points on that side of the hyperplane, and */
/* avg = (x1+x2+..+xk)/k. */
/************************************************************************/
float variance()
{
float lavg=0,ravg=0,lerror = 0,rerror = 0;
int i,lsum1=0,rsum1=0,lsum2=0,rsum2=0;
int *temp1=NULL,*temp2=NULL;
static int var_compare();
if (no_of_categories > 2)
/* Renumber categories in descending order of their proportion of
occurance. This removes the possibility for a biased impurity
estimate. */
{
temp1 = ivector(1,no_of_categories);
temp2 = ivector(1,no_of_categories);
for (i=1;i<=no_of_categories;i++)
{
temp1[i] = left_count[i];
temp2[i] = right_count[i];
}
qsort((char *)(left_count+1),no_of_categories,sizeof(int),var_compare);
qsort((char *)(right_count+1),no_of_categories,sizeof(int),var_compare);
}
for (i=1;i<=no_of_categories;i++)
{
lsum1 += left_count[i];
lsum2 += i * left_count[i];
rsum1 += right_count[i];
rsum2 += i * right_count[i];
}
if (lsum1 != 0) lavg = (float)lsum2/lsum1;
if (rsum1 != 0) ravg = (float)rsum2/rsum1;
for (i=1;i<=no_of_categories;i++)
{
lerror += left_count[i] * (i - lavg) * (i - lavg);
rerror += right_count[i] * (i - ravg) * (i - ravg);
}
if (no_of_categories > 2)
/* Restore original left_count and right_count arrays.
Remember, they are read_only. */
{
for (i=1;i<=no_of_categories;i++)
{
left_count[i] = temp1[i];
right_count[i] = temp2[i];
}
free_ivector(temp1,1,no_of_categories);
free_ivector(temp2,1,no_of_categories);
}
return (lerror+rerror);
}
/************************************************************************/
/* Module name : Var_Compare */
/* Functionality : Used by the qsort function call in the module */
/* Variance. */
/* See the man page for qsort for more details. */
/************************************************************************/
static int var_compare(p1,p2)
int *p1,*p2;
{
int p;
p = *p1-*p2;
return(p);
}
/************************************************************************/
/* Module name : info_gain */
/* Functionality : Computes the (Quinlan's) information gain of */
/* the current split. As OC1 tries to minimize the */
/* "impurity" instead of maximizing the information*/
/* "gain", this module returns the reciprocal of */
/* the computed gain. */
/* Remarks : Much less efficient to compute than the minority measures. */
/* But often works much better. */
/************************************************************************/
float info_gain()
{
float presplit_info=0,postsplit_info=0,left_info=0,right_info=0;
float ratio,infogain;
int i,total_count=0,total_left_count=0,total_right_count=0;
double log2();
for (i = 1;i<=no_of_categories;i++)
{
total_left_count += left_count[i];
total_right_count += right_count[i];
}
total_count = total_left_count + total_right_count;
if (total_count)
for (i = 1;i<=no_of_categories;i++)
{
ratio = (float)(left_count[i]+right_count[i])/total_count;
if (ratio) presplit_info += -1.0 * ratio * (float)log2((double)ratio);
}
if (total_left_count)
{
for (i = 1;i<=no_of_categories;i++)
{
ratio = (float)left_count[i]/total_left_count;
if (ratio) left_info += -1.0 * ratio * (float)log2((double)ratio);
}
postsplit_info += total_left_count * left_info / total_count;
}
if (total_right_count)
{
for (i = 1;i<=no_of_categories;i++)
{
ratio = (float)right_count[i]/total_right_count;
if (ratio) right_info += -1.0 * ratio * (float)log2((double)ratio);
}
postsplit_info += total_right_count * right_info / total_count;
}
infogain = presplit_info - postsplit_info;
if (infogain == 0) /*No information gained due to this split.
i.e., Either the region is homogenous or impurity
is as large as it can be. */
{
for (i=1;i<=no_of_categories;i++)
if (left_count[i] + right_count[i] == total_count) return(0);
return(HUGE);
}
else return(1.0/infogain);
}
/************************************************************************/
/* Module name : gini_index */
/* Functionality : Computes gini_index of a hyperplane. This stati-*/
/* stical measure was described in the context of */
/* decision trees by Leo Breiman et al in the CART */
/* book, 1984. */
/* Remarks : Efficient to compute - No log computations. */
/* Performs quite well. */
/************************************************************************/
float gini_index()
{
int total_left_count=0,total_right_count=0;
float temp,gini_left=0,gini_right=0,gini_value;
int i,j;
for (i=1;i<=no_of_categories;i++)
{
total_left_count += left_count[i];
total_right_count += right_count[i];
}
if (total_left_count)
{
for (i=1;i<=no_of_categories;i++)
{
temp = (1.0 * left_count[i]) / total_left_count;
gini_left += temp * temp;
}
gini_left = 1.0 - gini_left;
}
if (total_right_count)
{
for (i=1;i<=no_of_categories;i++)
{
temp = (1.0 * right_count[i]) / total_right_count;
gini_right += temp * temp;
}
gini_right = 1.0 - gini_right;
}
gini_value = (total_left_count * gini_left + total_right_count * gini_right)/
(total_left_count+total_right_count);
return(gini_value);
}
/************************************************************************/
/* Module name : twoing */
/* Functionality : Computes, by twoing rule, the goodness of a */
/* hyperplane. Returns the reciprocal of this value*/
/* The twoing measure is described in detail */
/* by Leo Breiman et al in their CART book (1984). */
/************************************************************************/
float twoing()
{
float total_left_count=0,total_right_count=0,total_count;
float goodness=0,temp,twoing_val;
int i;
for (i=1;i<=no_of_categories;i++)
{
total_left_count += left_count[i];
total_right_count += right_count[i];
}
total_count = total_left_count + total_right_count;
if (!total_count) return(0);
for (i=1;i<=no_of_categories;i++)
{
temp = 0;
if (total_left_count) temp = left_count[i]/total_left_count;
if (total_right_count) temp -= right_count[i]/total_right_count;
if (temp < 0) goodness += -1.0 * temp;
else goodness += temp;
}
total_left_count /= total_count;
total_right_count /= total_count;
twoing_val = total_left_count * total_right_count * goodness * goodness / 4;
if (twoing_val == 0) return(HUGE);
else return(1.0/twoing_val);
}
/************************************************************************/
/************************************************************************/