sparseAutoencoderCost.m
function [cost,grad] = sparseAutoencoderCost(theta, visibleSize, hiddenSize, ...
lambda, sparsityParam, beta, data)
% visibleSize: the number of input units
% hiddenSize: the number of hidden units
% lambda: weight decay parameter
% sparsityParam: The desired average activation for the hidden units
% (greek letter rho, which looks like a lower-case "p").
% beta: weight of sparsity penalty term
% data: training data stored column-wise (note: a different convention than H1), so data(:,i) is the i-th training example.
%
% The input theta is a vector (because minFunc expects the parameters to be a vector).
% We first convert theta to the (W1, W2, b1, b2) matrix/vector format, so that this
% follows the notation convention of the neural network / sparse autoencoder notes.
W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);
W2 = reshape(theta(hiddenSize*visibleSize+1:2*hiddenSize*visibleSize), visibleSize, hiddenSize);
b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
b2 = theta(2*hiddenSize*visibleSize+hiddenSize+1:end);
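% Resulting shapes: W1 is hiddenSize x visibleSize, W2 is visibleSize x hiddenSize,
% b1 is hiddenSize x 1, and b2 is visibleSize x 1.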
% shorthand constants
sp = sparsityParam;   % target average activation rho
m1 = 1/size(data,2);  % 1/m, where m is the number of training examples
% forward pass
a1 = data;                    % layer 1 activations are the inputs themselves
s2 = W1*a1;
s2 = bsxfun(@plus,b1,s2);     % z2 = W1*x + b1
a2 = sigmoid(s2);             % hidden layer activations
s3 = bsxfun(@plus,b2,W2*a2);  % z3 = W2*a2 + b2
a3 = sigmoid(s3);             % output layer activations (the reconstruction)
% reconstruction cost: average squared error over the training set
cost = (1/2)*m1*sum(sum((data-a3).^2));
% weight decay
regcost = lambda/2 * (sum(sum(W1.^2))+sum(sum(W2.^2)));
cost = cost + regcost;
% sparsity penalty
esp = mean(a2,2);  % estimated average activation rho_hat of each hidden unit
kl = sp*log(sp./esp) + (1-sp)*log((1-sp)./(1-esp));  % KL(rho || rho_hat), element-wise
cost = cost + beta*sum(kl);
% backward pass
delta3 = -(data-a3).*a3.*(1-a3);  % output-layer error; sigma'(z) = sigma(z).*(1-sigma(z)) = a3.*(1-a3)
sp_grad = beta*(-(sp./esp) + (1-sp)./(1-esp));  % gradient of the sparsity penalty w.r.t. rho_hat
delta2 = bsxfun(@plus,W2'*delta3,sp_grad).*a2.*(1-a2);  % hidden-layer error
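% The sparsity term is added into delta2 because rho_hat is computed from the
% hidden activations, so the KL penalty back-propagates through layer 2.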
% gradients: average over examples, plus weight decay on the weights
W1grad = m1*delta2*a1' + lambda.*W1;
W2grad = m1*delta3*a2' + lambda.*W2;
b1grad = sum(delta2,2)*m1;
b2grad = sum(delta3,2)*m1;
% Now we convert the gradients back to a vector format (suitable for minFunc).
grad = [W1grad(:) ; W2grad(:) ; b1grad(:) ; b2grad(:)];
end
function sigm = sigmoid(x)
% logistic sigmoid, applied element-wise
sigm = 1 ./ (1 + exp(-x));
end
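% ---------------------------------------------------------------------
% Minimal usage sketch (would live in a separate driver script, not in
% this file). The layer sizes, hyperparameter values, and random data
% below are illustrative assumptions, not values taken from this repo.
%
%   visibleSize = 8; hiddenSize = 5;
%   lambda = 1e-4; sparsityParam = 0.01; beta = 3;
%   data = rand(visibleSize, 100);                      % toy training set
%   r  = sqrt(6) / sqrt(hiddenSize + visibleSize + 1);  % init range
%   W1 = rand(hiddenSize, visibleSize)*2*r - r;
%   W2 = rand(visibleSize, hiddenSize)*2*r - r;
%   b1 = zeros(hiddenSize, 1); b2 = zeros(visibleSize, 1);
%   theta = [W1(:); W2(:); b1(:); b2(:)];
%   [cost, grad] = sparseAutoencoderCost(theta, visibleSize, hiddenSize, ...
%                                        lambda, sparsityParam, beta, data);
%
% Quick finite-difference check of a single gradient component, which
% should agree closely with grad(idx):
%
%   epsilon = 1e-4; idx = 1; e = zeros(size(theta)); e(idx) = epsilon;
%   cp = sparseAutoencoderCost(theta+e, visibleSize, hiddenSize, lambda, sparsityParam, beta, data);
%   cm = sparseAutoencoderCost(theta-e, visibleSize, hiddenSize, lambda, sparsityParam, beta, data);
%   numgrad_idx = (cp - cm) / (2*epsilon);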