-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhier8_neat.m
executable file
·321 lines (291 loc) · 10.7 KB
/
hier8_neat.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
function [tree, splits, is_leaf, clusters, timings, Ws, priorities, W, H] = hier8_neat(X, k, params)
%%HIER8_NEAT - Hierarchical clustering based on rank-2 NMF
% [tree, splits, is_leaf, clusters, timings, Ws, priorities] = hier8_neat(X, k, params)
%
% Input parameters --
% X: m*n data matrix (m features x n data points)
% k: The max number of leaf nodes to be generated
% params (optional)
% params.trial_allowance (default=3): Number of trials allowed for removing
% outliers and splitting a node again.
% See parameter T in Algorithm 3 in the reference paper.
% params.unbalanced (default=0.1): A threshold to determine if one of the two clusters is an outlier set.
% A smaller value means more tolerance for unbalance between two clusters.
% See parameter beta in Algorithm 3 in the reference paper.
% params.vec_norm (default=2): Indicates which norm to use for the normalization of W or H,
% e.g. vec_norm=2 means Euclidean norm; vec_norm=0 means no normalization.
% params.normW (default=true): true if normalizing columns of W; false if normalizing rows of H.
% params.anls_alg (default=@anls_entry_rank2_precompute): The function handle to NNLS algorithm
% whose signature is the same as @anls_entry_rank2_precompute
% params.tol (default=1e-4): Tolerance parameter for stopping criterion in each run of NMF.
% params.maxiter (default=10000): Maximum number of iteration times in each run of NMF.
%
% Output parameters --
% From the output parameters, you can reconstruct the tree and "replay" the k-1 steps that generated it.
%
% For a binary tree with k leaf nodes, the total number of nodes (including leaf and non-leaf nodes)
% is 2*(k-1) plus the root node, because k-1 splits are performed and each split generates two new nodes.
%
% We only keep track of the 2*(k-1) non-root node in the output.
%
% tree: A 2-by-(k-1) matrix that encodes the tree structure. The two entries in the i-th column are the numberings
% of the two children of the node with numbering i.
% The root node has numbering 0, with its two children always having numbering 1 and numbering 2.
% Thus the root node is NOT included in the 'tree' variable.
% splits: An array of length k-1. It keeps track of the numberings of the nodes being split
% from the 1st split to the (k-1)-th split. (The first entry is always 0.)
% is_leaf: An array of length 2*(k-1). A "1" at index i means that the node with numbering i is a leaf node
% in the final tree generated, and "0" indicates non-leaf nodes in the final tree.
% clusters: A cell array of length 2*(k-1). The i-th element contains the subset of items
% at the node with numbering i.
% timings: An array of length k-1.
% Its i-th element is the wall-time for performing i splits (with i+1 leaf nodes).
% Ws: A cell array of length 2*(k-1).
% Its i-th element is the topic vector of the cluster at the node with numbering i.
% priorities: An array of length 2*(k-1).
% Its i-th element is the modified NDCG scores at the node with numbering i (see the reference paper).
%
% Tips:
% If you want to have the flat partitioning induced by the leaf nodes in the final tree,
% use this script:
%
% partitioning = zeros(1, n); % n is the total number of data points
% leaf_level = clusters(is_leaf == 1);
% for i = 1 : length(leaf_level)
% partitioning(leaf_level{i}) = i;
% end
%
% (Now the entries in 'partitioning' having value 0 indicate outliers that do not belong to any cluster.)
%
%
% References:
% Da Kuang, Haesun Park
% Fast rank-2 nonnegative matrix factorization for hierarchical document clustering, KDD 2013
%
% Feb 2013
if ~exist('params', 'var')
trial_allowance = 3;
unbalanced = 0.1;
vec_norm = 2.0;
normW = true;
anls_alg = @anls_entry_rank2_precompute;
tol = 1e-4;
maxiter = 10000;
else
if isfield(params, 'trial_allowance')
trial_allowance = params.trial_allowance;
else
trial_allowance = 3;
end
if isfield(params, 'unbalanced')
unbalanced = params.unbalanced;
else
unbalanced = 0.1;
end
if isfield(params, 'vec_norm')
vec_norm = params.vec_norm;
else
vec_norm = 2.0;
end
if isfield(params, 'normW')
normW = params.normW;
else
normW = true;
end
if isfield(params, 'anls_alg')
anls_alg = params.anls_alg;
else
anls_alg = @anls_entry_rank2_precompute;
end
if isfield(params, 'tol')
tol = params.tol;
else
tol = 1e-4;
end
if isfield(params, 'maxiter')
maxiter = params.maxiter;
else
maxiter = 10000;
end
end
params = [];
params.vec_norm = vec_norm;
params.normW = normW;
params.anls_alg = anls_alg;
params.tol = tol;
params.maxiter = maxiter;
t0 = tic;
[m, n] = size(X);
timings = zeros(1, k-1);
clusters = cell(1, 2*(k-1));
Ws = cell(1, 2*(k-1));
W_buffer = cell(1, 2*(k-1));
H_buffer = cell(1, 2*(k-1));
priorities = zeros(1, 2*(k-1));
is_leaf = -1 * ones(1, 2*(k-1));
tree = zeros(2, 2*(k-1));
splits = -1 * ones(1, k-1);
term_subset = find(sum(X, 2) ~= 0);
W = rand(length(term_subset), 2);
H = rand(2, n);
if length(term_subset) == m
[W, H] = nmfsh_comb_rank2(X, W, H, params);
else
[W_tmp, H] = nmfsh_comb_rank2(X(term_subset, :), W, H, params);
W = zeros(m, 2);
W(term_subset, :) = W_tmp;
clear W_tmp;
end
result_used = 0;
for i = 1 : k-1
timings(i) = toc(t0);
if i == 1
split_node = 0;
new_nodes = [1 2];
min_priority = 1e308;
split_subset = 1:n;
else
leaves = find(is_leaf == 1);
temp_priority = priorities(leaves);
min_priority = min(temp_priority(temp_priority > 0));
[max_priority, split_node] = max(temp_priority);
if max_priority < 0
fprintf('Cannot generate all %d leaf clusters\n', k);
return;
end
split_node = leaves(split_node);
is_leaf(split_node) = 0;
W = W_buffer{split_node};
H = H_buffer{split_node};
split_subset = clusters{split_node};
new_nodes = [result_used+1 result_used+2];
tree(1, split_node) = new_nodes(1);
tree(2, split_node) = new_nodes(2);
end
result_used = result_used + 2;
[max_val, cluster_subset] = max(H);
clusters{new_nodes(1)} = split_subset(find(cluster_subset == 1));
clusters{new_nodes(2)} = split_subset(find(cluster_subset == 2));
Ws{new_nodes(1)} = W(:, 1);
Ws{new_nodes(2)} = W(:, 2);
splits(i) = split_node;
is_leaf(new_nodes) = 1;
subset = clusters{new_nodes(1)};
[subset, W_buffer_one, H_buffer_one, priority_one] = trial_split(trial_allowance, unbalanced, min_priority, X, subset, W(:, 1), params);
clusters{new_nodes(1)} = subset;
W_buffer{new_nodes(1)} = W_buffer_one;
H_buffer{new_nodes(1)} = H_buffer_one;
priorities(new_nodes(1)) = priority_one;
subset = clusters{new_nodes(2)};
[subset, W_buffer_one, H_buffer_one, priority_one] = trial_split(trial_allowance, unbalanced, min_priority, X, subset, W(:, 2), params);
clusters{new_nodes(2)} = subset;
W_buffer{new_nodes(2)} = W_buffer_one;
H_buffer{new_nodes(2)} = H_buffer_one;
priorities(new_nodes(2)) = priority_one;
end
W = cell2mat(Ws(find(is_leaf)));
[H,temp,suc_H,numChol_H,numEq_H] = nnlsm_blockpivot(W'*W,W'*X,1,rand(k,size(X,2)));
%--------------------------------------
function [subset, W_buffer_one, H_buffer_one, priority_one] = trial_split(trial_allowance, unbalanced, min_priority, X, subset, W_parent, params)
[m, n] = size(X);
trial = 0;
subset_backup = subset;
while trial < trial_allowance
[cluster_subset, W_buffer_one, H_buffer_one, priority_one] = actual_split(X, subset, W_parent, params);
if priority_one < 0
break;
end
unique_cluster_subset = unique(cluster_subset);
if length(unique_cluster_subset) ~= 2
error('Invalid number of unique sub-clusters!');
end
length_cluster1 = length(find(cluster_subset == unique_cluster_subset(1)));
length_cluster2 = length(find(cluster_subset == unique_cluster_subset(2)));
if min(length_cluster1, length_cluster2) < unbalanced * length(cluster_subset)
[min_val, idx_small] = min([length_cluster1, length_cluster2]);
subset_small = find(cluster_subset == unique_cluster_subset(idx_small));
subset_small = subset(subset_small);
[cluster_subset_small, W_buffer_one_small, H_buffer_one_small, priority_one_small] = actual_split(X, subset_small, W_buffer_one(:, idx_small), params);
if priority_one_small < min_priority
trial = trial + 1;
if trial < trial_allowance
disp(['Drop ', num2str(length(subset_small)), ' documents ...']);
subset = setdiff(subset, subset_small);
end
else
break;
end
else
break;
end
end
if trial == trial_allowance
disp(['Recycle ', num2str(length(subset_backup) - length(subset)), ' documents ...']);
subset = subset_backup;
W_buffer_one = zeros(m, 2);
H_buffer_one = zeros(2, length(subset));
priority_one = -2;
end
%--------------------------------------
function [cluster_subset, W_buffer_one, H_buffer_one, priority_one] = actual_split(X, subset, W_parent, params)
[m, n] = size(X);
if length(subset) <= 3
cluster_subset = ones(1, length(subset));
W_buffer_one = zeros(m, 2);
H_buffer_one = zeros(2, length(subset));
priority_one = -1;
else
term_subset = find(sum(X(:, subset), 2) ~= 0);
X_subset = X(term_subset, subset);
W = rand(length(term_subset), 2);
H = rand(2, length(subset));
[W, H] = nmfsh_comb_rank2(X_subset, W, H, params);
[max_val, cluster_subset] = max(H);
W_buffer_one = zeros(m, 2);
W_buffer_one(term_subset, :) = W;
H_buffer_one = H;
if length(unique(cluster_subset)) > 1
priority_one = compute_priority(W_parent, W_buffer_one);
else
priority_one = -1;
end
end
%--------------------------------------
function priority = compute_priority(W_parent, W_child)
n = length(W_parent);
[sorted_parent, idx_parent] = sort(W_parent, 'descend');
[sorted_child1, idx_child1] = sort(W_child(:, 1), 'descend');
[sorted_child2, idx_child2] = sort(W_child(:, 2), 'descend');
n_part = length(find(W_parent ~= 0));
if n_part <= 1
priority = -3;
else
weight = log(n:-1:1)';
first_zero = find(sorted_parent == 0, 1);
if length(first_zero) > 0
weight(first_zero:end) = 1;
end
weight_part = zeros(n, 1);
weight_part(1:n_part) = log(n_part:-1:1)';
[sorted, idx1] = sort(idx_child1);
[sorted, idx2] = sort(idx_child2);
max_pos = max(idx1, idx2);
discount = log(n-max_pos(idx_parent)+1);
discount(discount == 0) = log(2);
weight = weight ./ discount;
weight_part = weight_part ./ discount;
priority = NDCG_part(idx_parent, idx_child1, weight, weight_part) * NDCG_part(idx_parent, idx_child2, weight, weight_part);
end
%--------------------------------------
function score = NDCG_part(ground, test, weight, weight_part)
[sorted, seq_idx] = sort(ground);
weight_part = weight_part(seq_idx);
n = length(test);
uncum_score = weight_part(test);
uncum_score(2:n) = uncum_score(2:n) ./ log2(2:n)';
cum_score = cumsum(uncum_score);
ideal_score = sort(weight, 'descend');
ideal_score(2:n) = ideal_score(2:n) ./ log2(2:n)';
cum_ideal_score = cumsum(ideal_score);
score = cum_score ./ cum_ideal_score;
score = score(end);