-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhier8_neat_reduced.m
executable file
·215 lines (203 loc) · 7.73 KB
/
hier8_neat_reduced.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
function [tree, splits, is_leaf, clusters, timings, Ws, priorities, W, H] = hier8_neat_reduced(X, k, cl_idx, idx, params)
%%HIER8_NEAT - Hierarchical clustering based on rank-2 NMF
% [tree, splits, is_leaf, clusters, timings, Ws, priorities] = hier8_neat(X, k, params)
% Input parameters --
% X: m*n data matrix (m features x n data points)
% k: The max number of leaf nodes to be generated
% params (optional)
% params.trial_allowance (default=3): Number of trials allowed for removing
% outliers and splitting a node again.
% See parameter T in Algorithm 3 in the reference paper.
% params.unbalanced (default=0.1): A threshold to determine if one of the two clusters is an outlier set.
% A smaller value means more tolerance for unbalance between two clusters.
% See parameter beta in Algorithm 3 in the reference paper.
% params.vec_norm (default=2): Indicates which norm to use for the normalization of W or H,
% e.g. vec_norm=2 means Euclidean norm; vec_norm=0 means no normalization.
% params.normW (default=true): true if normalizing columns of W; false if normalizing rows of H.
% params.anls_alg (default=@anls_entry_rank2_precompute): The function handle to NNLS algorithm
% whose signature is the same as @anls_entry_rank2_precompute
% params.tol (default=1e-4): Tolerance parameter for stopping criterion in each run of NMF.
% params.maxiter (default=10000): Maximum number of iteration times in each run of NMF.
%
% Output parameters --
% From the output parameters, you can reconstruct the tree and "replay" the k-1 steps that generated it.
%
% For a binary tree with k leaf nodes, the total number of nodes (including leaf and non-leaf nodes)
% is 2*(k-1) plus the root node, because k-1 splits are performed and each split generates two new nodes.
%
% We only keep track of the 2*(k-1) non-root node in the output.
%
% tree: A 2-by-(k-1) matrix that encodes the tree structure. The two entries in the i-th column are the numberings
% of the two children of the node with numbering i.
% The root node has numbering 0, with its two children always having numbering 1 and numbering 2.
% Thus the root node is NOT included in the 'tree' variable.
% splits: An array of length k-1. It keeps track of the numberings of the nodes being split
% from the 1st split to the (k-1)-th split. (The first entry is always 0.)
% is_leaf: An array of length 2*(k-1). A "1" at index i means that the node with numbering i is a leaf node
% in the final tree generated, and "0" indicates non-leaf nodes in the final tree.
% clusters: A cell array of length 2*(k-1). The i-th element contains the subset of items
% at the node with numbering i.
% timings: An array of length k-1.
% Its i-th element is the wall-time for performing i splits (with i+1 leaf nodes).
% Ws: A cell array of length 2*(k-1).
% Its i-th element is the topic vector of the cluster at the node with numbering i.
% priorities: An array of length 2*(k-1).
% Its i-th element is the modified NDCG scores at the node with numbering i (see the reference paper).
%
% Tips:
% If you want to have the flat partitioning induced by the leaf nodes in the final tree,
% use this script:
%
% partitioning = zeros(1, n); % n is the total number of data points
% leaf_level = clusters(is_leaf == 1);
% for i = 1 : length(leaf_level)
% partitioning(leaf_level{i}) = i;
% end
%
% (Now the entries in 'partitioning' having value 0 indicate outliers that do not belong to any cluster.)
%
%
% References:
% Da Kuang, Haesun Park
% Fast rank-2 nonnegative matrix factorization for hierarchical document clustering, KDD 2013
%
% Feb 2013
if ~exist('params', 'var')
trial_allowance = 3;
unbalanced = 0.1;
vec_norm = 2.0;
normW = true;
anls_alg = @anls_entry_rank2_precompute;
tol = 1e-4;
maxiter = 10000;
else
if isfield(params, 'trial_allowance')
trial_allowance = params.trial_allowance;
else
trial_allowance = 3;
end
if isfield(params, 'unbalanced')
unbalanced = params.unbalanced;
else
unbalanced = 0.1;
end
if isfield(params, 'vec_norm')
vec_norm = params.vec_norm;
else
vec_norm = 2.0;
end
if isfield(params, 'normW')
normW = params.normW;
else
normW = true;
end
if isfield(params, 'anls_alg')
anls_alg = params.anls_alg;
else
anls_alg = @anls_entry_rank2_precompute;
end
if isfield(params, 'tol')
tol = params.tol;
else
tol = 1e-4;
end
if isfield(params, 'maxiter')
maxiter = params.maxiter;
else
maxiter = 10000;
end
end
params = [];
params.vec_norm = vec_norm;
params.normW = normW;
params.anls_alg = anls_alg;
params.tol = tol;
params.maxiter = maxiter;
% t0 = tic;
[m, n] = size(X);
cluster_idx=unique(cl_idx(idx));
sub_k=length(cluster_idx);
ctrary = zeros(size(X,1),sub_k);
timings = zeros(1, k-sub_k);
clusters = cell(1, 2*(k-sub_k));
Ws = cell(1, 2*(k-sub_k));
W_buffer = cell(1, 2*(k-sub_k));
H_buffer = cell(1, 2*(k-sub_k));
priorities = zeros(1, 2*(k-sub_k));
is_leaf = -1 * ones(1, 2*(k-sub_k));
tree = zeros(2, 2*(k-sub_k));
splits = -1 * ones(1, k-sub_k);
min_priority = 1e308;
% if sub_k == 1
% term_subset = find(sum(X, 2) ~= 0);
% W = rand(length(term_subset), 2);
% H = rand(2, n);
% if length(term_subset) == m
% [W, H] = nmfsh_comb_rank2(X, W, H, params);
% else
% [W_tmp, H] = nmfsh_comb_rank2(X(term_subset, :), W, H, params);
% W = zeros(m, 2);
% W(term_subset, :) = W_tmp;
% clear W_tmp;
% end
% end
%initial clusters, node
for i=1:sub_k
subidx = find(cl_idx(idx)==cluster_idx(i));
ctrary(:,i) = mean(X(:, subidx),2);
Ws{i} = ctrary(:,i);
is_leaf(i) = 1;
[~, W_buffer_one, H_buffer_one, priority_one]...
= trial_split(trial_allowance, unbalanced, min_priority, X, subidx, Ws{i}, params);
clusters{i} = subidx;
W_buffer{i} = W_buffer_one;
H_buffer{i} = H_buffer_one;
priorities(i) = priority_one;
end
result_used = sub_k;
for i = 1:k-sub_k
% timings(i) = toc(t0);
tic
leaves = find(is_leaf == 1);
temp_priority = priorities(leaves);
min_priority = min(temp_priority(temp_priority > 0));
[max_priority, split_node] = max(temp_priority);
if max_priority < 0
fprintf('Cannot generate all %d leaf clusters\n', k);
return;
end
split_node = leaves(split_node);
is_leaf(split_node) = 0;
W = W_buffer{split_node};
H = H_buffer{split_node};
split_subset = clusters{split_node};
new_nodes = [result_used+1 result_used+2];
tree(1, split_node) = new_nodes(1);
tree(2, split_node) = new_nodes(2);
result_used = result_used + 2;
[max_val, cluster_subset] = max(H);
clusters{new_nodes(1)} = split_subset(find(cluster_subset == 1));
clusters{new_nodes(2)} = split_subset(find(cluster_subset == 2));
Ws{new_nodes(1)} = W(:, 1);
Ws{new_nodes(2)} = W(:, 2);
splits(i) = split_node;
is_leaf(new_nodes) = 1;
subset = clusters{new_nodes(1)};
[subset, W_buffer_one, H_buffer_one, priority_one] = trial_split(trial_allowance, unbalanced, min_priority, X, subset, W(:, 1), params);
clusters{new_nodes(1)} = subset;
W_buffer{new_nodes(1)} = W_buffer_one;
H_buffer{new_nodes(1)} = H_buffer_one;
priorities(new_nodes(1)) = priority_one;
subset = clusters{new_nodes(2)};
[subset, W_buffer_one, H_buffer_one, priority_one] = trial_split(trial_allowance, unbalanced, min_priority, X, subset, W(:, 2), params);
clusters{new_nodes(2)} = subset;
W_buffer{new_nodes(2)} = W_buffer_one;
H_buffer{new_nodes(2)} = H_buffer_one;
priorities(new_nodes(2)) = priority_one;
toc
disp('nnlsm_blockpivot');
tic
W = cell2mat(Ws(find(is_leaf)));
[H,temp,suc_H,numChol_H,numEq_H] = nnlsm_blockpivot(W'*W,W'*X,1,rand(i+sub_k,size(X,2)));
toc
end