\begin{document}
\title{Talk 2: Distributed Optimization and Statistical Learning via ADMM (II)}
\date{2021-5-13}
\author{WEN Hao}
\maketitle
{\bfseries Main Resource: Chapter 8 of \cite{boyd2011distributed}}
\section{Distributed Model Fitting Overview}
Consider a general convex (linear) model fitting problem
\begin{align*}
& \text{minimize} \quad \ell(Ax-b) + r(x)
\end{align*}
where
\begin{align*}
& x \in \mathbb{R}^n: \text{parameter vector} \\
& A \in \operatorname{Mat}_{m\times n}(\mathbb{R}): \text{feature matrix} \\
& b \in \mathbb{R}^m: \text{output (response, etc) vector} \\
& \ell: \mathbb{R}^m \rightarrow \mathbb{R}: \text{convex loss function} \\
& r: \mathbb{R}^n \rightarrow \mathbb{R}: \text{convex regularization function} \\
\end{align*}
Recall that $\ell$ is generally expressed as $\expectation\limits_{z\sim \mathcal{D}} \operatorname{loss}(x;z)$.
\begin{question}
Would the more general form $\ell(Ax,b)$ be a better formulation? Cf. the classification examples below.
\end{question}
For linear models with a bias term, one can always append the bias as the first (or last) element of $x$ and add a column of ones to the feature matrix $A$; the model can then be written uniformly as $Ax$.
$\ell$ is usually additive w.r.t. samples, i.e.
$$\ell(Ax-b) = \sum\limits_{i=1}^m \ell_i(a_i^Tx-b_i)$$
where $\ell_i$ is the loss function for sample $i$. For example, one can assign a different weight to each sample, so that different loss functions $\ell_i$ arise from a common base loss. For concrete examples, ref. \href{https://scikit-learn.org/stable/auto_examples/svm/plot_weighted_samples.html}{a scikit-learn example}.
Important examples of $r$:
\begin{align*}
& r(x) = \lambda \lVert x \rVert_2^2: \text{ridge penalty} \\
& r(x) = \lambda \lVert x \rVert_1: \text{lasso penalty} \\
& r(x) = \lambda_2 \lVert x \rVert_2^2 + \lambda_1 \lVert x \rVert_1: \text{elastic net} \\
& \text{etc.}
\end{align*}
\section{Examples of Model Fitting}
\subsection{(Linear) Regression}
Consider a linear model
$$b = a^Tx$$
One models each sample (measurement) as
$$b_i = a_i^Tx + \varepsilon_i$$
with $\varepsilon_i$ being measurement errors (noise), assumed independent with log-concave densities $p_i$ (or, in the simpler IID case, with a common density $p$). The likelihood function of the parameters $x$ given the observations $\{(a_i,b_i)\}_{i=1}^m$ is
$$\operatorname{LH}(x) = \prod\limits_{i=1}^m p_i(\varepsilon_i) = \prod\limits_{i=1}^m p_i(b_i - a_i^Tx)$$
If $r = 0$ (no regularization), then the model fitting problem can be interpreted as maximum likelihood estimation (MLE) of $x$ under noise model $p_i$. For example, if we assume that $\varepsilon_i \sim N(0, \sigma^2)$ (IID), then the likelihood function of $x$ is
$$\operatorname{LH}(x) = \prod\limits_{i=1}^m \dfrac{1}{\sqrt{2\pi}\sigma} \exp\left(-\frac{(b_i-a_i^Tx)^2}{2\sigma^2}\right)$$
Therefore,
\begin{align*}
\operatorname{MLE}(x) & = \argmax_x \{\operatorname{LH}(x)\} = \argmin_x \{\operatorname{NLL}(x)\} \\
& = \argmin_x \left\{ \dfrac{1}{2\sigma^2} \sum\limits_{i=1}^m (b_i-a_i^Tx)^2 \right\} \\
& = \argmin_x \left\{ \sum\limits_{i=1}^m (b_i-a_i^Tx)^2 \right\}
\end{align*}
a least squares problem.
If $r$ is separable and each $r_j$ is taken to be the negative log prior density of $x_j$, then the model fitting problem can be interpreted as maximum a posteriori (MAP) estimation, i.e.\ $\argmax \left\{ \operatorname{LH} \cdot \operatorname{prior}\right\}$. Again, we model each sample (measurement) as
$b_i = a_i^Tx + \varepsilon_i$ with $\varepsilon_i \sim N(0, \sigma^2)$. Then
\begin{itemize}
\item if the parameters $x$ are endowed with Laplacian prior, then MAP of $x$ is equivalent to lasso,
\item if the parameters $x$ are endowed with normal prior, then MAP of $x$ is equivalent to ridge regression.
\end{itemize}
For example, let the components of $x$ be endowed with the (IID) Laplacian prior
$$p(x_j) = \dfrac{1}{2\tau}\exp\left( -\dfrac{|x_j|}{\tau} \right)$$
Then
\begin{align*}
\operatorname{MAP}(x) & = \argmax_x \{p(x) \cdot \operatorname{LH}(x) \} \\
& = \argmax_x \left\{ \prod\limits_{j=1}^n \dfrac{1}{2\tau}\exp\left( -\dfrac{|x_j|}{\tau} \right) \cdot \prod\limits_{i=1}^m \dfrac{1}{\sqrt{2\pi}\sigma} \exp\left(-\frac{(b_i-a_i^Tx)^2}{2\sigma^2}\right) \right\} \\
& = \argmin_x \left\{ \sum\limits_{i=1}^m (b_i-a_i^Tx)^2 + \lambda \lVert x \rVert_1 \right\}, \quad \lambda = \dfrac{2\sigma^2}{\tau}
\end{align*}
\subsection{Classification}
Consider a binary classification problem (multi-class or multi-label problems can be reduced to collections, sums, or means of such problems). Suppose we have samples $\{(p_i,q_i)\}_{i=1}^m$ with $q_i\in\{-1,1\}$. The goal is to find a weight vector $w$ and bias $v$ s.t.
$\operatorname{sign}(p_i^Tw + v) = q_i$
holds ``for as many samples as possible''. The function
$$f(p_i) = p_i^Tw + v$$
is called a discriminant function (``decision function'' in scikit-learn), telling on which side of the separating hyperplane a sample lies and how far away from it. The (margin-based) loss function is usually given by
$$\ell_i(p_i^Tw + v) = \ell_i(q_i(p_i^Tw + v)) \quad (\text{by abuse of notation})$$
where the quantity $\mu_i := q_i(p_i^Tw + v)$ is called the margin of sample $i$.
As a function of the margin $\mu_i$, $\ell_i$ should be nonnegative and decreasing. Common loss functions are
\begin{align*}
& \text{hinge loss}: \quad (1-\mu_i)_+ \\
& \text{exponential loss}: \quad \exp(-\mu_i) \\
& \text{logistic loss}: \quad \log(1+\exp(-\mu_i))
\end{align*}
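As a quick numerical illustration (a minimal sketch; the margins \texttt{mu} below are made-up values), these three losses can be evaluated with NumPy:
\begin{verbatim}
import numpy as np

mu = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])  # example margins q_i*(p_i^T w + v)

hinge = np.maximum(0.0, 1.0 - mu)        # (1 - mu)_+
exponential = np.exp(-mu)                # exp(-mu)
logistic = np.log1p(np.exp(-mu))         # log(1 + exp(-mu))

print(hinge, exponential, logistic, sep="\n")
\end{verbatim}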
Recall that SVM (SVC) is to solve
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^m (1 - q_i({\tikz[baseline,yshift=2.3pt]{\node[] (svm_kernel) {$\color{red} p_i^Tx$} } } + v))_+ + \lambda \lVert x \rVert_2^2
% \tikz[]{
% \node[above right = 0.3cm and 0.5cm of svm_kernel.north] (svm_kernel_text) {SVM kernel, can be generalized to non-linear $k(p_i, x)$};
% \path[->] (svm_kernel_text.south) edge ([xshift=-3cm]svm_kernel.north);
% }
\end{align*}
where the hinge loss and an $\ell_2$ regularizer are used, and the weight vector is now denoted $x$. The inner product $p_i^Tx$ plays the role of the SVM kernel and can be generalized to a non-linear kernel $k(p_i, x)$ (for more kernel functions, ref. \href{https://scikit-learn.org/stable/modules/svm.html#svm-kernels}{the scikit-learn docs}).
Let $\displaystyle f(\mu) = \dfrac{1}{1+\exp(-\mu)}$; then $f(\mu_i) = f(q_i(p_i^Tw + v))$ can be interpreted as the probability that the model predicts the ground-truth label. In this case, the (binary) cross entropy loss is
$$\operatorname{CE}_i(x) = - (1 \cdot \log(f(\mu_i)) + 0 \cdot \log(1-f(\mu_i))) = \log(1+\exp(-\mu_i))$$
For more loss functions and deeper insight into classification, ref. \href{https://en.wikipedia.org/wiki/Loss_functions_for_classification}{Wikipedia} and the references listed therein.
\section{Splitting across Examples (Horizontal splitting)}
In the model fitting problem
\begin{align*}
& \text{minimize} \quad \ell(Ax-b) + r(x)
\end{align*}
we partition the feature matrix $A$ and labels $b$ by rows, i.e.
$$A = \begin{pmatrix} A_1 \\ \vdots \\ A_N \end{pmatrix}, \quad b = \begin{pmatrix} b_1 \\ \vdots \\ b_N \end{pmatrix},$$
where $A_i \in \operatorname{Mat}_{m_i\times n}(\mathbb{R})$ and $b_i \in \mathbb{R}^{m_i}$ hold the samples of ``client'' $i$. The model fitting problem is then formulated as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N \ell_i(A_ix_i-b_i) + r(z) \\
& \text{subject to} \quad x_i = z
\end{align*}
as a {\bfseries consensus problem (with regularization)}.
The scaled ADMM iterations of the above optimization problem are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \ell_i(A_ix_i-b_i) + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert_2^2 \right\} = \operatorname{prox}_{\tilde{\ell}_i,\rho}(z^k - u_i^k) \\
z^{k+1} & = \argmin_z \left\{ r(z) + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert_2^2 \right\} = \operatorname{prox}_{r,N\rho}(\overline{x}^{k+1} + \overline{u}^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
where $\tilde{\ell}_i(x_i) := \ell_i(A_ix_i-b_i)$. It can be seen that
\begin{align*}
\text{$x$-update} & \leftarrow \text{$N$ parallel $\ell_2$-regularized model fitting problems} \\
\text{$z$-update} & \leftarrow \text{gather and average $x_i^{k+1}+u_i^k$, then a proximal step for $r$} \\
\end{align*}
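These iterations can be sketched directly in terms of proximal oracles. The following is a minimal sketch; the functions \texttt{prox\_loss[i]} and \texttt{prox\_r}, computing $\operatorname{prox}_{\tilde{\ell}_i,\rho}$ and $\operatorname{prox}_{r,N\rho}$ respectively, are hypothetical user-supplied placeholders.
\begin{verbatim}
import numpy as np

def consensus_admm(prox_loss, prox_r, n, rho=1.0, iters=100):
    """Scaled consensus ADMM with N = len(prox_loss) clients.

    prox_loss[i](v, rho) returns argmin_x ell~_i(x) + (rho/2)||x - v||^2,
    prox_r(v, sigma)     returns argmin_z r(z)      + (sigma/2)||z - v||^2.
    Both are hypothetical user-supplied oracles.
    """
    N = len(prox_loss)
    x = np.zeros((N, n))
    u = np.zeros((N, n))
    z = np.zeros(n)
    for _ in range(iters):
        for i in range(N):          # x-update: N independent proximal steps
            x[i] = prox_loss[i](z - u[i], rho)
        # z-update: average, then a proximal step for the regularizer r
        z = prox_r(x.mean(axis=0) + u.mean(axis=0), N * rho)
        u += x - z                  # scaled dual update
    return z
\end{verbatim}
In practice the inner loop over $i$ runs in parallel, one client per processor; it is written serially here only for readability.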
\subsection{Example: Lasso}
Recall that Lasso is the following optimization problem
\begin{align*}
& \text{minimize} \quad \dfrac{1}{2} \lVert Ax - b \rVert_2^2 + \lambda \lVert x \rVert_1
\end{align*}
The corresponding distributed (consensus) version of ADMM algorithm is
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - b_i \rVert_2^2 + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert_2^2 \right\} \\
z^{k+1} & = \argmin_z \left\{ \lambda \lVert z \rVert_1 + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert_2^2 \right\} = \operatorname{S}_{\lambda/N\rho}(\overline{x}^{k+1} + \overline{u}^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
Each $x_i$-update is a ridge regression problem, which is equivalent to the least squares problem
\begin{align*}
& \text{minimize} \left\lVert \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix} x_i - \begin{pmatrix} b_i \\ \sqrt{\rho}(z^k - u_i^k) \end{pmatrix} \right\rVert_2^2
\end{align*}
and thus has an analytic solution (computed numerically by a so-called direct method)
\begin{align*}
x_i^{k+1} & = \left( \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix}^T \cdot \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix} \right)^{-1} \cdot \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix}^T \cdot \begin{pmatrix} b_i \\ \sqrt{\rho}(z^k - u_i^k) \end{pmatrix} \\
& = \tikz[]{\node[] (lasso_cache) {}} {\color{red} (A_i^TA_i + \rho I)^{-1} } (A_i^Tb_i + \rho (z^k - u_i^k) )
\end{align*}
Accelerations of the $x_i$-updates:
\begin{itemize}
\item[(1)] $\color{red} (A_i^TA_i + \rho I)^{-1}$ is independent of $k$, hence it (or rather its factorization) can be precomputed once and reused in every $x_i$-update.
\item[(2)] If, moreover, $m_i < n$ (\# samples < \# features), then by the \href{https://en.wikipedia.org/wiki/Woodbury_matrix_identity}{Woodbury matrix identity} (matrix inversion lemma),
$$(A_i^TA_i + \rho I)^{-1} = \dfrac{1}{\rho}I - \dfrac{1}{\rho} A_i^T {\color{red} (A_iA_i^T + \rho I)^{-1}} A_i$$
The matrix $A_iA_i^T + \rho I$ is smaller ($m_i\times m_i$ instead of $n\times n$), hence cheaper to factor.
\end{itemize}
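A minimal NumPy/SciPy sketch of these iterations (my own illustration, with made-up function names), caching the factorization of $A_i^TA_i+\rho I$ as in point (1) and using the soft-thresholding operator $S_{\lambda/N\rho}$ for the $z$-update:
\begin{verbatim}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def soft_threshold(v, kappa):
    return np.sign(v) * np.maximum(np.abs(v) - kappa, 0.0)

def distributed_lasso(A_list, b_list, lam, rho=1.0, iters=100):
    N, n = len(A_list), A_list[0].shape[1]
    # point (1): factor A_i^T A_i + rho*I once, reuse in every x_i-update
    factors = [cho_factor(A.T @ A + rho * np.eye(n)) for A in A_list]
    Atb = [A.T @ b for A, b in zip(A_list, b_list)]
    x = np.zeros((N, n))
    u = np.zeros((N, n))
    z = np.zeros(n)
    for _ in range(iters):
        for i in range(N):  # x_i-update: ridge subproblem, cached factorization
            x[i] = cho_solve(factors[i], Atb[i] + rho * (z - u[i]))
        # z-update: soft thresholding with threshold lambda/(N*rho)
        z = soft_threshold(x.mean(axis=0) + u.mean(axis=0), lam / (N * rho))
        u += x - z          # scaled dual update
    return z
\end{verbatim}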
\subsection{Example: SVM (SVC)}
Recall again that the SVM (SVC) is the following optimization problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^m (1 - q_i( p_i^Tx + v))_+ + \lambda \lVert x \rVert_2^2
\end{align*}
Ignore the bias term $v$ for convenience; otherwise, replace $x$ by $\begin{pmatrix} x \\ v \end{pmatrix}$ and $p_i^T$ by $(p_i^T, 1)$. Write
$$A = \begin{pmatrix} -q_1p_1^T \\ \vdots \\ -q_mp_m^T \end{pmatrix},$$
then the problem can be rewritten as
\begin{align*}
& \text{minimize} \quad \mathbf{1}^T (\mathbf{1} + Ax)_+ + \lambda \lVert x \rVert_2^2
\end{align*}
and in the horizontally split consensus form as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N \mathbf{1}^T (\mathbf{1} + A_ix_i)_+ + \lambda \lVert z \rVert_2^2 \\
& \text{subject to} \quad x_i = z
\end{align*}
with ADMM iterations
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \mathbf{1}^T (\mathbf{1} + A_ix_i)_+ + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert_2^2 \right\} \\
z^{k+1} & = \argmin_z \left\{ \lambda \lVert z \rVert_2^2 + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert_2^2 \right\} = {\color{green} \dfrac{N\rho}{2\lambda + N\rho}} (\overline{x}^{k+1} + \overline{u}^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
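A sketch of these iterations, assuming CVXPY is used to solve the non-smooth $x_i$-subproblems (any convex solver would do; the function and variable names are mine):
\begin{verbatim}
import numpy as np
import cvxpy as cp

def consensus_svm(A_list, lam, rho=1.0, iters=50):
    """A_list[i] stacks the rows -q_j p_j^T of the samples held by client i."""
    N, n = len(A_list), A_list[0].shape[1]
    x = np.zeros((N, n))
    u = np.zeros((N, n))
    z = np.zeros(n)
    for _ in range(iters):
        for i in range(N):  # x_i-update: hinge loss + quadratic proximal term
            xi = cp.Variable(n)
            obj = cp.sum(cp.pos(1 + A_list[i] @ xi)) \
                  + (rho / 2) * cp.sum_squares(xi - z + u[i])
            cp.Problem(cp.Minimize(obj)).solve()
            x[i] = xi.value
        # z-update: closed-form shrinkage of the average
        z = (N * rho) / (2 * lam + N * rho) * (x.mean(axis=0) + u.mean(axis=0))
        u += x - z
    return z
\end{verbatim}
Only the $x_i$-subproblems need an iterative solver; the $z$-update is the closed-form shrinkage highlighted above.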
\section{Splitting across Features (Vertical splitting)}
Let the feature matrix $A$ and parameter vector $x$ be partitioned vertically as
$$A = (A_1, \cdots, A_N), \quad x = (x_1, \cdots, x_N)$$
with $A_i \in \operatorname{Mat}_{m\times n_i}(\mathbb{R}), x_i \in \mathbb{R}^{n_i}$. Each $A_i$ can be considered a ``partial'' feature matrix, and $A_ix_i$ a ``partial'' prediction. The ``full'' prediction is given by
$$Ax = \sum\limits_{i=1}^N A_ix_i$$
The model fitting problem is hence formulated as
\begin{align*}
& \text{minimize} \quad \ell(\sum\limits_{i=1}^N A_ix_i - b) + \sum\limits_{i=1}^N r_i(x_i)
\end{align*}
or, with the separable part written first,
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N r_i(x_i) + \ell(\sum\limits_{i=1}^N A_ix_i - b)
\end{align*}
which can be further formulated as a sharing problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N r_i(x_i) + \ell(\framebox{$\sum\limits_{i=1}^N z_i$} - b) \\
& \text{subject to} \quad {\color{red} A_i}x_i = z_i
\end{align*}
The scaled ADMM iterations (slightly different from a standard sharing problem) are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ r_i(x_i) + \dfrac{\rho}{2} \lVert A_ix_i - A_ix_i^k - \overline{z}^k + \overline{Ax}^k + u^k \rVert_2^2 \right\} \\
% & = \operatorname{prox}_{r_i,\rho}(A_ix_i^k + \overline{z}^k - \overline{Ax}^k - u^k) \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \ell(N\overline{z}-b) + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{Ax}^{k+1} - u^k \rVert_2^2 \right\} \\
% & = \operatorname{prox}_{\widetilde{\ell},N\rho}(\overline{Ax}^{k+1} + u^k) \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
which can be interpreted as
\begin{align*}
\text{$x$-update} & \leftarrow \text{$N$ parallel $r_i$-regularized least squares problems} \\
\text{$\overline{z}$-update} & \leftarrow \text{$\ell_2$-regularized minimization of the loss $\ell$}
\end{align*}
Here $\overline{Ax} := \dfrac{1}{N} \sum\limits_{i=1}^N A_ix_i$.
\subsection{Example: Lasso}
We fit the Lasso optimization problem
\begin{align*}
& \text{minimize} \quad \dfrac{1}{2} \lVert Ax - b \rVert_2^2 + \lambda \lVert x \rVert_1
\end{align*}
into the form of the vertical splitting sharing problem as
\begin{align*}
& \text{minimize} \quad \dfrac{1}{2} \left\lVert \sum\limits_{i=1}^N z_i - b \right\rVert_2^2 + \lambda \sum\limits_{i=1}^N \lVert x_i \rVert_1 \\
& \text{subject to} \quad A_ix_i = z_i
\end{align*}
with ADMM iterations
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \lambda\lVert x_i \rVert_1 + \dfrac{\rho}{2} \lVert A_ix_i - A_ix_i^k - \overline{z}^k + \overline{Ax}^k + u^k \rVert_2^2 \right\} \\
& = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - \underbrace{(A_ix_i^k - \overline{Ax}^k + \overline{z}^k - u^k)}_{v_i} \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_1 \right\} \\
& \leftarrow \text{ $N$ parallel smaller Lasso problems} \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \dfrac{1}{2} \lVert N\overline{z}-b \rVert_2^2 + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{Ax}^{k+1} - u^k \rVert_2^2 \right\} \\
& = \dfrac{1}{N + \rho} \left( b + \rho\left(\overline{Ax}^{k+1} + u^k\right) \right) \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
For the $x_i$-update, $x_i^{k+1} := \argmin\limits_{x_i} \left\{ \dfrac{1}{2} \lVert v_i - A_ix_i \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_1 \right\}$ has to satisfy the subgradient conditions
\begin{align*}
& A_i^T (v_i-A_ix_i^{k+1}) = \dfrac{\lambda}{\rho} \partial \lVert x_i^{k+1} \rVert_1 = \dfrac{\lambda}{\rho} \begin{pmatrix} s_1 \\ \vdots \\ s_{n_i} \end{pmatrix} \\
\text{where} & \quad s_j \begin{cases} = \operatorname{sign}((x_i^{k+1})_j) & \text{ if } (x_i^{k+1})_j \neq 0 \\ \in [-1, 1] & \text{ if } (x_i^{k+1})_j = 0 \end{cases}
\end{align*}
It is claimed that
$$x^{k+1}_i = 0 \Longleftrightarrow \lVert A_i^T v_i \rVert_{\color{red}\infty} \leqslant \dfrac{\lambda}{\rho}$$
{\color{red}
Indeed, consider
$$\mathcal{L}(x_i) := \dfrac{1}{2} \lVert v_i - A_ix_i \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_1,$$
then
\begin{align*}
0 \text{ is a solution to } \argmin_{x_i} \mathcal{L}(x_i) & \Longleftrightarrow \nabla_s \mathcal{L}(0) \geqslant 0, \ \forall s \quad \text{(directional derivatives at $0$)} \\
& \Longleftrightarrow \langle -A_i^T (v_i - 0), s \rangle + \dfrac{\lambda}{\rho} \lVert s \rVert_1 \geqslant 0, \ \forall s \\
& \Longleftrightarrow \dfrac{\lambda}{\rho} \geqslant \max\limits_{\lVert s \rVert_1=1} \langle A_i^T v_i, s \rangle \\
& \Longleftrightarrow \dfrac{\lambda}{\rho} \geqslant \lVert A_i^T v_i \rVert_{\infty}
\end{align*}
}
For more, ref. \cite{hastie2019statistical} exercise 2.1.
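A minimal sketch of the splitting-across-features Lasso iterations (my own illustration; the $x_i$-subproblems are handed to CVXPY for brevity, and \texttt{A\_list} is a hypothetical list holding the column blocks $A_i$):
\begin{verbatim}
import numpy as np
import cvxpy as cp

def vertical_lasso(A_list, b, lam, rho=1.0, iters=50):
    """A_list holds the column blocks A_i of the vertical splitting."""
    N, m = len(A_list), b.shape[0]
    x = [np.zeros(A.shape[1]) for A in A_list]
    Ax = np.zeros((N, m))               # the "partial" predictions A_i x_i
    zbar = np.zeros(m)
    u = np.zeros(m)
    for _ in range(iters):
        Axbar = Ax.mean(axis=0)
        for i, A in enumerate(A_list):  # x_i-update: a small Lasso in x_i
            v = Ax[i] - Axbar + zbar - u
            xi = cp.Variable(A.shape[1])
            obj = 0.5 * cp.sum_squares(A @ xi - v) + (lam / rho) * cp.norm1(xi)
            cp.Problem(cp.Minimize(obj)).solve()
            x[i] = xi.value
            Ax[i] = A @ x[i]
        Axbar = Ax.mean(axis=0)
        zbar = (b + rho * (Axbar + u)) / (N + rho)  # closed-form z-bar update
        u += Axbar - zbar                           # scaled dual update
    return np.concatenate(x)
\end{verbatim}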
\subsection{Example: Group Lasso}
Group Lasso is the following generalization of the standard Lasso, in which the features are grouped (rearranged if necessary) so that the groups correspond exactly to a vertical splitting:
\begin{align*}
& \text{minimize} \quad \left\{ \dfrac{1}{2}\left\|\sum _{i=1}^{N}A_ix_i - b \right\|_{2}^{2} + \lambda \sum _{i=1}^{N}\|x_{i}\|_2 \right\}
\end{align*}
ADMM iterations are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - \underbrace{(A_ix_i^k - \overline{Ax}^k + \overline{z}^k - u^k)}_{v_i} \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_{\color{red} 2} \right\} \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \dfrac{1}{2} \lVert N\overline{z}-b \rVert_2^2 + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{Ax}^{k+1} - u^k \rVert_2^2 \right\} \\
& = \dfrac{1}{N + \rho} \left( b + \rho\left(\overline{Ax}^{k+1} + u^k\right) \right) \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
For the $x_i$-update, one similarly has
\begin{align*}
& A_i^T (v_i-A_ix_i^{k+1}) = \dfrac{\lambda}{\rho} \partial \lVert x_i^{k+1} \rVert_2 \begin{cases} = \dfrac{\lambda}{\rho} \cdot \dfrac{x_i^{k+1}}{\lVert x_i^{k+1} \rVert_2} & \text{ if } x_i^{k+1} \neq 0 \\ \in \dfrac{\lambda}{\rho} \cdot \mathbb{B}(0,1) & \text{ if } x_i^{k+1} = 0 \end{cases}
\end{align*}
i.e.
$$x_i^{k+1} = (A_i^TA_i + \tilde{\lambda}I)^{-1} A_i^Tv_i \quad \text{ with $\tilde{\lambda}$ satisfying } \tilde{\lambda}\rho \lVert x_i^{k+1} \rVert_2 = \lambda, \text{ if } x_i^{k+1} \neq 0.$$
Again, it is claimed that (note the different norm, compared with the ordinary Lasso, coming from the penalty term)
$$x^{k+1}_i = 0 \Longleftrightarrow \lVert A_i^T v_i \rVert_2 \leqslant \dfrac{\lambda}{\rho}$$
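A small sketch of the resulting $x_i$-update (my own illustration): first apply the zero test above, and otherwise solve the subproblem, here handed to CVXPY instead of solving the $\tilde{\lambda}$ equation directly.
\begin{verbatim}
import numpy as np
import cvxpy as cp

def group_lasso_x_update(A_i, v_i, lam, rho):
    """x_i-update of the splitting-across-features group Lasso."""
    if np.linalg.norm(A_i.T @ v_i, 2) <= lam / rho:
        return np.zeros(A_i.shape[1])  # zero test: ||A_i^T v_i||_2 <= lambda/rho
    x = cp.Variable(A_i.shape[1])
    obj = 0.5 * cp.sum_squares(A_i @ x - v_i) + (lam / rho) * cp.norm(x, 2)
    cp.Problem(cp.Minimize(obj)).solve()
    return x.value
\end{verbatim}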
\subsection{Example: SVM}
The vertical splitting version of SVM is
\begin{align*}
& \text{minimize} \quad \mathbf{1}^T (\mathbf{1} + \sum\limits_{i=1}^N A_ix_i)_+ + \lambda \sum\limits_{i=1}^N \lVert x_i \rVert_2^2
\end{align*}
ADMM iterations are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - \underbrace{(A_ix_i^k - \overline{Ax}^k + \overline{z}^k - u^k)}_{v_i} \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_2^2 \right\} \\
& = \left( A_i^TA_i + \dfrac{2\lambda}{\rho}I \right)^{-1} A_i^T v_i \\
& \leftarrow \text{ $N$ parallel ridge regressions} \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \mathbf{1}^T (\mathbf{1} + N\overline{z})_+ + \dfrac{N\rho}{2} \lVert \overline{z} - \underbrace{(\overline{Ax}^{k+1} + u^k)}_{s} \rVert_2^2 \right\} \\
& = \argmin\limits_{\overline{z}} \left\{ \sum\limits_{j=1}^m \left( (1+N\overline{z}_j)_+ + \dfrac{N\rho}{2}(\overline{z}_j-s_j)^2 \right) \right\} \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
\begin{figure}[H]
\centering
\begin{tikzpicture}
\draw[blue] (-6, 0) -- (-0.5, 0);
\draw[->] (-0.5, 0) -- (4, 0) node[right] {$\overline{z}_j$};
\draw[->] (0, -2) -- (0, 6);
\draw[dashed] (-0.5, -2) -- (-0.5, 6);
\node (one_over_N) at (-0.9,-0.6) {$\dfrac{1}{N}$};
\draw[domain=-0.5:2.5, smooth, variable=\x, blue] plot ({\x}, {1+2*\x});
\draw[dashed, domain=-4:1, smooth, variable=\x, red] plot ({\x}, {(\x+1.5)*(\x+1.5)});
\draw[dashed, domain=-1.5:3.5, smooth, variable=\x, red] plot ({\x}, {(\x-1)*(\x-1)});
\draw[dashed, domain=-2:1, smooth, variable=\x, cyan] plot ({\x}, {-3*\x+0.75});
\node at (3,-2) {slope $N\rho\left(-\dfrac{1}{N}-s_j\right)$};
\node[circle,fill,inner sep=2.5pt] (tangent) at (-0.5,2.25) {};
\end{tikzpicture}
\caption{sketch of $\overline{z}$-update of vertical splitting SVM}
\end{figure}
The $\overline{z}$-update thus splits into componentwise problems, namely minimizing
$$(1+N\overline{z}_j)_+ + \dfrac{N\rho}{2}(\overline{z}_j-s_j)^2$$
over each $\overline{z}_j$, whose solution is easily computed:
$$\overline{z}_j = \begin{cases}
s_j - \dfrac{1}{\rho} & \text{ if } s_j > -\dfrac{1}{N} + \dfrac{1}{\rho} \\
-\dfrac{1}{N} & \text{ if } s_j \in [-\dfrac{1}{N}, -\dfrac{1}{N} + \dfrac{1}{\rho}] \\
s_j & \text{ if } s_j < -\dfrac{1}{N}
\end{cases}
$$
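This componentwise rule is straightforward to vectorize (a minimal sketch of my own):
\begin{verbatim}
import numpy as np

def svm_zbar_update(s, N, rho):
    """Componentwise z-bar update for the splitting-across-features SVM."""
    return np.where(s > -1.0 / N + 1.0 / rho, s - 1.0 / rho,  # right branch
           np.where(s < -1.0 / N, s,                          # left branch
                    -1.0 / N))                                # flat middle piece
\end{verbatim}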
% \subsection{Generalized Additive Models}
% Consider generalized additive (additive w.r.t. features) model $F$, i.e.
% $$F(a) = \sum\limits_{j=1}^n F_j(a_j)$$
% where $a$ is a feature vector, $F_j: \mathbb{R} \to \mathbb{R}$ are feature functions.
\bibliographystyle{ieeetr}
\bibliography{references}
\end{document}