\begin{document}
\title{Talk 2: Distributed Optimization and Statistical Learning via ADMM (II)}
\date{2021-5-13}
\author{WEN Hao}
\maketitle
{\bfseries Main Resource: Chapter 8 of \cite{boyd2011distributed}}
\section{Distributed Model Fitting Overview}
Consider a general convex (linear) model fitting problem
\begin{align*}
& \text{minimize} \quad \ell(Ax-b) + r(x)
\end{align*}
where
\begin{align*}
& x \in \mathbb{R}^n: \text{parameter vector} \\
& A \in \operatorname{Mat}_{m\times n}(\mathbb{R}): \text{feature matrix} \\
& b \in \mathbb{R}^m: \text{output (response, etc) vector} \\
& \ell: \mathbb{R}^m \rightarrow \mathbb{R}: \text{convex loss function} \\
& r: \mathbb{R}^n \rightarrow \mathbb{R}: \text{convex regularization function} \\
\end{align*}
Recall that $\ell$ is generally expressed as $\expectation\limits_{z\sim \mathcal{D}} \operatorname{loss}(x;z)$.
\begin{question}
Would the more general form $\ell(Ax,b)$ be a better formulation? Cf. the classification examples below.
\end{question}
For linear models with a bias term, one can always append the bias as the first (or last) element of $x$ and add a column of ones to the feature matrix $A$; the model can then be written uniformly as $Ax$.
$\ell$ is usually additive w.r.t. samples, i.e.
$$\ell(Ax-b) = \sum\limits_{i=1}^m \ell_i(a_i^Tx-b_i)$$
where $\ell_i$ is the loss function for sample $i$. For example, one can assign a different weight to each sample, so that different loss functions $\ell_i$ arise from a common base loss. For concrete examples, ref. \href{https://scikit-learn.org/stable/auto_examples/svm/plot_weighted_samples.html}{a scikit-learn example}.
Important examples of $r$:
\begin{align*}
& r(x) = \lambda \lVert x \rVert_2^2: \text{ridge penalty} \\
& r(x) = \lambda \lVert x \rVert_1: \text{lasso penalty} \\
& r(x) = \lambda_2 \lVert x \rVert_2^2 + \lambda_1 \lVert x \rVert_1: \text{elastic net} \\
& \text{etc.}
\end{align*}
\section{Examples of Model Fitting}
\subsection{(Linear) Regression}
Consider a linear model
$$b = a^Tx$$
One models each sample (measurement) as
$$b_i = a_i^Tx + \varepsilon_i$$
with $\varepsilon_i$ being measurement errors (noise), assumed independent with log-concave densities $p_i$ (or, in the simpler IID case, with a common density $p$). The likelihood function of the parameters $x$ given the observations $\{(a_i,b_i)\}_{i=1}^m$ is
$$\operatorname{LH}(x) = \prod\limits_{i=1}^m p_i(\varepsilon_i) = \prod\limits_{i=1}^m p_i(b_i - a_i^Tx)$$
If $r = 0$ (no regularization), then the model fitting problem can be interpreted as maximum likelihood estimation (MLE) of $x$ under noise model $p_i$. For example, if we assume that $\varepsilon_i \sim N(0, \sigma^2)$ (IID), then the likelihood function of $x$ is
$$\operatorname{LH}(x) = \prod\limits_{i=1}^m \dfrac{1}{\sqrt{2\pi}\sigma} \exp\left(-\frac{(b_i-a_i^Tx)^2}{2\sigma^2}\right)$$
Therefore,
\begin{align*}
\operatorname{MLE}(x) & = \argmax_x \{\operatorname{LH}(x)\} = \argmin_x \{\operatorname{NLL}(x)\} \\
& = \argmin_x \left\{ \dfrac{1}{2\sigma^2} \sum\limits_{i=1}^m (b_i-a_i^Tx)^2 \right\} \\
& = \argmin_x \left\{ \sum\limits_{i=1}^m (b_i-a_i^Tx)^2 \right\}
\end{align*}
a least squares problem.
If $r$ is separable and each $r_j$ is taken to be the negative log prior density of $x_j$, then the model fitting problem can be interpreted as maximum a posteriori (MAP) estimation, i.e.\ $\argmax \left\{ \operatorname{LH} \cdot \operatorname{prior}\right\}$. Again, we model each sample (measurement) as
$b_i = a_i^Tx + \varepsilon_i$ with $\varepsilon_i \sim N(0, \sigma^2)$. Then
\begin{itemize}
\item if the parameters $x$ are endowed with Laplacian prior, then MAP of $x$ is equivalent to lasso,
\item if the parameters $x$ are endowed with normal prior, then MAP of $x$ is equivalent to ridge regression.
\end{itemize}
For example, let the components of $x$ be endowed with the (IID) Laplacian prior
$$p(x_j) = \dfrac{1}{2\tau}\exp\left( -\dfrac{|x_j|}{\tau} \right)$$
Then
\begin{align*}
\operatorname{MAP}(x) & = \argmax_x \{p(x) \cdot \operatorname{LH}(x) \} \\
& = \argmax_x \left\{ \prod\limits_{j=1}^n \dfrac{1}{2\tau}\exp\left( -\dfrac{|x_j|}{\tau} \right) \cdot \prod\limits_{i=1}^m \dfrac{1}{\sqrt{2\pi}\sigma} \exp\left(-\frac{(b_i-a_i^Tx)^2}{2\sigma^2}\right) \right\} \\
& = \argmin_x \left\{ \sum\limits_{i=1}^m (b_i-a_i^Tx)^2 + \lambda \lVert x \rVert_1 \right\}, \quad \lambda = \dfrac{2\sigma^2}{\tau}
\end{align*}
\subsection{Classification}
Consider a binary classification problem (multi-class or multi-label problems can be reduced to collections, sums, or means of such problems). Suppose we have samples $\{(p_i,q_i)\}_{i=1}^m$ with $q_i\in\{-1,1\}$. The goal is to find a weight vector $w$ and bias $v$ s.t.
$\operatorname{sign}(p_i^Tw + v) = q_i$
holds ``for as many samples as possible''. The function
$$f(p_i) = p_i^Tw + v$$
is called a discriminant function (``decision function'' in scikit-learn), telling on which side of the separating hyperplane a sample lies and how far away from it. The (margin-based) loss function is usually given by
$$\ell_i(p_i^Tw + v) = \ell_i(q_i(p_i^Tw + v)) \quad (\text{by abuse of notation})$$
where the quantity $\mu_i := q_i(p_i^Tw + v)$ is called the margin of sample $i$.
As a function of the margin $\mu_i$, $\ell_i$ should be nonnegative and decreasing. Common loss functions are
\begin{align*}
& \text{hinge loss}: \quad (1-\mu_i)_+ \\
& \text{exponential loss}: \quad \exp(-\mu_i) \\
& \text{logistic loss}: \quad \log(1+\exp(-\mu_i))
\end{align*}
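As a quick numerical illustration (a minimal sketch; the margins \texttt{mu} below are made-up values), these three losses can be evaluated with NumPy:
\begin{verbatim}
import numpy as np

mu = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])  # example margins q_i*(p_i^T w + v)

hinge = np.maximum(0.0, 1.0 - mu)        # (1 - mu)_+
exponential = np.exp(-mu)                # exp(-mu)
logistic = np.log1p(np.exp(-mu))         # log(1 + exp(-mu))

print(hinge, exponential, logistic, sep="\n")
\end{verbatim}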
Recall that SVM (SVC) is to solve
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^m (1 - q_i({\tikz[baseline,yshift=2.3pt]{\node[] (svm_kernel) {$\color{red} p_i^Tx$} } } + v))_+ + \lambda \lVert x \rVert_2^2
% \tikz[]{
% \node[above right = 0.3cm and 0.5cm of svm_kernel.north] (svm_kernel_text) {SVM kernel, can be generalized to non-linear $k(p_i, x)$};
% \path[->] (svm_kernel_text.south) edge ([xshift=-3cm]svm_kernel.north);
% }
\end{align*}
where the hinge loss and an $\ell_2$ regularizer are used, and the weight vector is now denoted $x$. The inner product $p_i^Tx$ plays the role of the SVM kernel and can be generalized to a non-linear kernel $k(p_i, x)$ (for more kernel functions, ref. \href{https://scikit-learn.org/stable/modules/svm.html#svm-kernels}{the scikit-learn docs}).
Let $\displaystyle f(\mu) = \dfrac{1}{1+\exp(-\mu)}$; then $f(\mu_i) = f(q_i(p_i^Tw + v))$ can be interpreted as the probability that the model predicts the ground-truth label. In this case, the (binary) cross entropy loss is
$$\operatorname{CE}_i(x) = - (1 \cdot \log(f(\mu_i)) + 0 \cdot \log(1-f(\mu_i))) = \log(1+\exp(-\mu_i))$$
For more loss functions and deeper insight into classification, ref. \href{https://en.wikipedia.org/wiki/Loss_functions_for_classification}{Wikipedia} and the references listed therein.
\section{Splitting across Examples (Horizontal splitting)}
In the model fitting problem
\begin{align*}
& \text{minimize} \quad \ell(Ax-b) + r(x)
\end{align*}
we partition the feature matrix $A$ and labels $b$ by rows, i.e.
$$A = \begin{pmatrix} A_1 \\ \vdots \\ A_N \end{pmatrix}, \quad b = \begin{pmatrix} b_1 \\ \vdots \\ b_N \end{pmatrix},$$
where $A_i \in \operatorname{Mat}_{m_i\times n}(\mathbb{R})$ and $b_i \in \mathbb{R}^{m_i}$ hold the samples of ``client'' $i$. The model fitting problem is then formulated as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N \ell_i(A_ix_i-b_i) + r(z) \\
& \text{subject to} \quad x_i = z
\end{align*}
as a {\bfseries consensus problem (with regularization)}.
The scaled ADMM iterations of the above optimization problem are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \ell_i(A_ix_i-b_i) + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert_2^2 \right\} = \operatorname{prox}_{\tilde{\ell}_i,\rho}(z^k - u_i^k) \\
z^{k+1} & = \argmin_z \left\{ r(z) + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert_2^2 \right\} = \operatorname{prox}_{r,N\rho}(\overline{x}^{k+1} + \overline{u}^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
where $\tilde{\ell}_i(x_i) := \ell_i(A_ix_i-b_i)$. It can be seen that
\begin{align*}
\text{$x$-update} & \leftarrow \text{$N$ parallel $\ell_2$-regularized model fitting problems} \\
\text{$z$-update} & \leftarrow \text{gather and average $x_i^{k+1}+u_i^k$, then a proximal step for $r$} \\
\end{align*}
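These iterations can be sketched directly in terms of proximal oracles. The following is a minimal sketch; the functions \texttt{prox\_loss[i]} and \texttt{prox\_r}, computing $\operatorname{prox}_{\tilde{\ell}_i,\rho}$ and $\operatorname{prox}_{r,N\rho}$ respectively, are hypothetical user-supplied placeholders.
\begin{verbatim}
import numpy as np

def consensus_admm(prox_loss, prox_r, n, rho=1.0, iters=100):
    """Scaled consensus ADMM with N = len(prox_loss) clients.

    prox_loss[i](v, rho) returns argmin_x ell~_i(x) + (rho/2)||x - v||^2,
    prox_r(v, sigma)     returns argmin_z r(z)      + (sigma/2)||z - v||^2.
    Both are hypothetical user-supplied oracles.
    """
    N = len(prox_loss)
    x = np.zeros((N, n))
    u = np.zeros((N, n))
    z = np.zeros(n)
    for _ in range(iters):
        for i in range(N):          # x-update: N independent proximal steps
            x[i] = prox_loss[i](z - u[i], rho)
        # z-update: average, then a proximal step for the regularizer r
        z = prox_r(x.mean(axis=0) + u.mean(axis=0), N * rho)
        u += x - z                  # scaled dual update
    return z
\end{verbatim}
In practice the inner loop over $i$ runs in parallel, one client per processor; it is written serially here only for readability.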
\subsection{Example: Lasso}
Recall that Lasso is the following optimization problem
\begin{align*}
& \text{minimize} \quad \dfrac{1}{2} \lVert Ax - b \rVert_2^2 + \lambda \lVert x \rVert_1
\end{align*}
The corresponding distributed (consensus) version of ADMM algorithm is
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - b_i \rVert_2^2 + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert_2^2 \right\} \\
z^{k+1} & = \argmin_z \left\{ \lambda \lVert z \rVert_1 + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert_2^2 \right\} = \operatorname{S}_{\lambda/N\rho}(\overline{x}^{k+1} + \overline{u}^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
Each $x_i$-update is a ridge regression problem, which is equivalent to the least squares problem
\begin{align*}
& \text{minimize} \left\lVert \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix} x_i - \begin{pmatrix} b_i \\ \sqrt{\rho}(z^k - u_i^k) \end{pmatrix} \right\rVert_2^2
\end{align*}
and thus has an analytic solution (computed numerically by a so-called direct method)
\begin{align*}
x_i^{k+1} & = \left( \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix}^T \cdot \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix} \right)^{-1} \cdot \begin{pmatrix} A_i \\ \sqrt{\rho}I \end{pmatrix}^T \cdot \begin{pmatrix} b_i \\ \sqrt{\rho}(z^k - u_i^k) \end{pmatrix} \\
& = \tikz[]{\node[] (lasso_cache) {}} {\color{red} (A_i^TA_i + \rho I)^{-1} } (A_i^Tb_i + \rho (z^k - u_i^k) )
\end{align*}
Accelerations of the $x_i$-updates:
\begin{itemize}
\item[(1)] $\color{red} (A_i^TA_i + \rho I)^{-1}$ is independent of $k$, hence it (or rather its factorization) can be precomputed once and reused in every $x_i$-update.
\item[(2)] If, moreover, $m_i < n$ (\# samples < \# features), then by the \href{https://en.wikipedia.org/wiki/Woodbury_matrix_identity}{Woodbury matrix identity} (matrix inversion lemma),
$$(A_i^TA_i + \rho I)^{-1} = \dfrac{1}{\rho}I - \dfrac{1}{\rho} A_i^T {\color{red} (A_iA_i^T + \rho I)^{-1}} A_i$$
The matrix $A_iA_i^T + \rho I$ is smaller ($m_i\times m_i$ instead of $n\times n$), hence cheaper to factor.
\end{itemize}
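A minimal NumPy/SciPy sketch of these iterations (my own illustration, with made-up function names), caching the factorization of $A_i^TA_i+\rho I$ as in point (1) and using the soft-thresholding operator $S_{\lambda/N\rho}$ for the $z$-update:
\begin{verbatim}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def soft_threshold(v, kappa):
    return np.sign(v) * np.maximum(np.abs(v) - kappa, 0.0)

def distributed_lasso(A_list, b_list, lam, rho=1.0, iters=100):
    N, n = len(A_list), A_list[0].shape[1]
    # point (1): factor A_i^T A_i + rho*I once, reuse in every x_i-update
    factors = [cho_factor(A.T @ A + rho * np.eye(n)) for A in A_list]
    Atb = [A.T @ b for A, b in zip(A_list, b_list)]
    x = np.zeros((N, n))
    u = np.zeros((N, n))
    z = np.zeros(n)
    for _ in range(iters):
        for i in range(N):  # x_i-update: ridge subproblem, cached factorization
            x[i] = cho_solve(factors[i], Atb[i] + rho * (z - u[i]))
        # z-update: soft thresholding with threshold lambda/(N*rho)
        z = soft_threshold(x.mean(axis=0) + u.mean(axis=0), lam / (N * rho))
        u += x - z          # scaled dual update
    return z
\end{verbatim}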
\subsection{Example: SVM (SVC)}
Recall again that the SVM (SVC) is the following optimization problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^m (1 - q_i( p_i^Tx + v))_+ + \lambda \lVert x \rVert_2^2
\end{align*}
Ignore the bias term $v$ for convenience; otherwise, replace $x$ by $\begin{pmatrix} x \\ v \end{pmatrix}$ and $p_i^T$ by $(p_i^T, 1)$. Write
$$A = \begin{pmatrix} -q_1p_1^T \\ \vdots \\ -q_mp_m^T \end{pmatrix},$$
then the problem can be rewritten as
\begin{align*}
& \text{minimize} \quad \mathbf{1}^T (\mathbf{1} + Ax)_+ + \lambda \lVert x \rVert_2^2
\end{align*}
and in the horizontally split consensus form as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N \mathbf{1}^T (\mathbf{1} + A_ix_i)_+ + \lambda \lVert z \rVert_2^2 \\
& \text{subject to} \quad x_i = z
\end{align*}
with ADMM iterations
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \mathbf{1}^T (\mathbf{1} + A_ix_i)_+ + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert_2^2 \right\} \\
z^{k+1} & = \argmin_z \left\{ \lambda \lVert z \rVert_2^2 + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert_2^2 \right\} = {\color{green} \dfrac{N\rho}{2\lambda + N\rho}} (\overline{x}^{k+1} + \overline{u}^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
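A sketch of these iterations, assuming CVXPY is used to solve the non-smooth $x_i$-subproblems (any convex solver would do; the function and variable names are mine):
\begin{verbatim}
import numpy as np
import cvxpy as cp

def consensus_svm(A_list, lam, rho=1.0, iters=50):
    """A_list[i] stacks the rows -q_j p_j^T of the samples held by client i."""
    N, n = len(A_list), A_list[0].shape[1]
    x = np.zeros((N, n))
    u = np.zeros((N, n))
    z = np.zeros(n)
    for _ in range(iters):
        for i in range(N):  # x_i-update: hinge loss + quadratic proximal term
            xi = cp.Variable(n)
            obj = cp.sum(cp.pos(1 + A_list[i] @ xi)) \
                  + (rho / 2) * cp.sum_squares(xi - z + u[i])
            cp.Problem(cp.Minimize(obj)).solve()
            x[i] = xi.value
        # z-update: closed-form shrinkage of the average
        z = (N * rho) / (2 * lam + N * rho) * (x.mean(axis=0) + u.mean(axis=0))
        u += x - z
    return z
\end{verbatim}
Only the $x_i$-subproblems need an iterative solver; the $z$-update is the closed-form shrinkage highlighted above.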
\section{Splitting across Features (Vertical splitting)}
Let the feature matrix $A$ and parameter vector $x$ be partitioned vertically as
$$A = (A_1, \cdots, A_N), \quad x = (x_1, \cdots, x_N)$$
with $A_i \in \operatorname{Mat}_{m\times n_i}(\mathbb{R}), x_i \in \mathbb{R}^{n_i}$. Each $A_i$ can be considered a ``partial'' feature matrix, and $A_ix_i$ a ``partial'' prediction. The ``full'' prediction is given by
$$Ax = \sum\limits_{i=1}^N A_ix_i$$
The model fitting problem is hence formulated as
\begin{align*}
& \text{minimize} \quad \ell(\sum\limits_{i=1}^N A_ix_i - b) + \sum\limits_{i=1}^N r_i(x_i)
\end{align*}
or, with the separable part written first,
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N r_i(x_i) + \ell(\sum\limits_{i=1}^N A_ix_i - b)
\end{align*}
which can be further formulated as a sharing problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N r_i(x_i) + \ell(\framebox{$\sum\limits_{i=1}^N z_i$} - b) \\
& \text{subject to} \quad {\color{red} A_i}x_i = z_i
\end{align*}
The scaled ADMM iterations (slightly different from a standard sharing problem) are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ r_i(x_i) + \dfrac{\rho}{2} \lVert A_ix_i - A_ix_i^k - \overline{z}^k + \overline{Ax}^k + u^k \rVert_2^2 \right\} \\
% & = \operatorname{prox}_{r_i,\rho}(A_ix_i^k + \overline{z}^k - \overline{Ax}^k - u^k) \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \ell(N\overline{z}-b) + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{Ax}^{k+1} - u^k \rVert_2^2 \right\} \\
% & = \operatorname{prox}_{\widetilde{\ell},N\rho}(\overline{Ax}^{k+1} + u^k) \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
which can be interpreted as
\begin{align*}
\text{$x$-update} & \leftarrow \text{$N$ parallel $r_i$-regularized least squares problems} \\
\text{$\overline{z}$-update} & \leftarrow \text{$\ell_2$-regularized minimization of the loss $\ell$}
\end{align*}
Here $\overline{Ax} := \dfrac{1}{N} \sum\limits_{i=1}^N A_ix_i$.
\subsection{Example: Lasso}
We fit the Lasso optimization problem
\begin{align*}
& \text{minimize} \quad \dfrac{1}{2} \lVert Ax - b \rVert_2^2 + \lambda \lVert x \rVert_1
\end{align*}
into the form of the vertical splitting sharing problem as
\begin{align*}
& \text{minimize} \quad \dfrac{1}{2} \left\lVert \sum\limits_{i=1}^N z_i - b \right\rVert_2^2 + \lambda \sum\limits_{i=1}^N \lVert x_i \rVert_1 \\
& \text{subject to} \quad A_ix_i = z_i
\end{align*}
with ADMM iterations
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \lambda\lVert x_i \rVert_1 + \dfrac{\rho}{2} \lVert A_ix_i - A_ix_i^k - \overline{z}^k + \overline{Ax}^k + u^k \rVert_2^2 \right\} \\
& = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - \underbrace{(A_ix_i^k - \overline{Ax}^k + \overline{z}^k - u^k)}_{v_i} \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_1 \right\} \\
& \leftarrow \text{ $N$ parallel smaller Lasso problems} \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \dfrac{1}{2} \lVert N\overline{z}-b \rVert_2^2 + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{Ax}^{k+1} - u^k \rVert_2^2 \right\} \\
& = \dfrac{1}{N + \rho} \left( b + \rho\left(\overline{Ax}^{k+1} + u^k\right) \right) \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
For the $x_i$-update, $x_i^{k+1} := \argmin\limits_{x_i} \left\{ \dfrac{1}{2} \lVert v_i - A_ix_i \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_1 \right\}$ has to satisfy the subgradient conditions
\begin{align*}
& A_i^T (v_i-A_ix_i^{k+1}) = \dfrac{\lambda}{\rho} \partial \lVert x_i^{k+1} \rVert_1 = \dfrac{\lambda}{\rho} \begin{pmatrix} s_1 \\ \vdots \\ s_{n_i} \end{pmatrix} \\
\text{where} & \quad s_j \begin{cases} = \operatorname{sign}((x_i^{k+1})_j) & \text{ if } (x_i^{k+1})_j \neq 0 \\ \in [-1, 1] & \text{ if } (x_i^{k+1})_j = 0 \end{cases}
\end{align*}
It is claimed that
$$x^{k+1}_i = 0 \Longleftrightarrow \lVert A_i^T v_i \rVert_{\color{red}\infty} \leqslant \dfrac{\lambda}{\rho}$$
{\color{red}
Indeed, consider
$$\mathcal{L}(x_i) := \dfrac{1}{2} \lVert v_i - A_ix_i \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_1,$$
then
\begin{align*}
0 \text{ is a solution to } \argmin_{x_i} \mathcal{L}(x_i) & \Longleftrightarrow \nabla_s \mathcal{L}(0) \geqslant 0, \ \forall s \quad \text{(directional derivatives at $0$)} \\
& \Longleftrightarrow \langle -A_i^T (v_i - 0), s \rangle + \dfrac{\lambda}{\rho} \lVert s \rVert_1 \geqslant 0, \ \forall s \\
& \Longleftrightarrow \dfrac{\lambda}{\rho} \geqslant \max\limits_{\lVert s \rVert_1=1} \langle A_i^T v_i, s \rangle \\
& \Longleftrightarrow \dfrac{\lambda}{\rho} \geqslant \lVert A_i^T v_i \rVert_{\infty}
\end{align*}
}
For more, ref. \cite{hastie2019statistical} exercise 2.1.
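A minimal sketch of the splitting-across-features Lasso iterations (my own illustration; the $x_i$-subproblems are handed to CVXPY for brevity, and \texttt{A\_list} is a hypothetical list holding the column blocks $A_i$):
\begin{verbatim}
import numpy as np
import cvxpy as cp

def vertical_lasso(A_list, b, lam, rho=1.0, iters=50):
    """A_list holds the column blocks A_i of the vertical splitting."""
    N, m = len(A_list), b.shape[0]
    x = [np.zeros(A.shape[1]) for A in A_list]
    Ax = np.zeros((N, m))               # the "partial" predictions A_i x_i
    zbar = np.zeros(m)
    u = np.zeros(m)
    for _ in range(iters):
        Axbar = Ax.mean(axis=0)
        for i, A in enumerate(A_list):  # x_i-update: a small Lasso in x_i
            v = Ax[i] - Axbar + zbar - u
            xi = cp.Variable(A.shape[1])
            obj = 0.5 * cp.sum_squares(A @ xi - v) + (lam / rho) * cp.norm1(xi)
            cp.Problem(cp.Minimize(obj)).solve()
            x[i] = xi.value
            Ax[i] = A @ x[i]
        Axbar = Ax.mean(axis=0)
        zbar = (b + rho * (Axbar + u)) / (N + rho)  # closed-form z-bar update
        u += Axbar - zbar                           # scaled dual update
    return np.concatenate(x)
\end{verbatim}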
\subsection{Example: Group Lasso}
Group Lasso is the following generalization of the standard Lasso, in which the features are grouped (rearranged if necessary) so that the groups correspond exactly to a vertical splitting:
\begin{align*}
& \text{minimize} \quad \left\{ \dfrac{1}{2}\left\|\sum _{i=1}^{N}A_ix_i - b \right\|_{2}^{2} + \lambda \sum _{i=1}^{N}\|x_{i}\|_2 \right\}
\end{align*}
ADMM iterations are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - \underbrace{(A_ix_i^k - \overline{Ax}^k + \overline{z}^k - u^k)}_{v_i} \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_{\color{red} 2} \right\} \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \dfrac{1}{2} \lVert N\overline{z}-b \rVert_2^2 + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{Ax}^{k+1} - u^k \rVert_2^2 \right\} \\
& = \dfrac{1}{N + \rho} \left( b + \rho\left(\overline{Ax}^{k+1} + u^k\right) \right) \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
For the $x_i$-update, one similarly has
\begin{align*}
& A_i^T (v_i-A_ix_i^{k+1}) = \dfrac{\lambda}{\rho} \partial \lVert x_i^{k+1} \rVert_2 \begin{cases} = \dfrac{\lambda}{\rho} \cdot \dfrac{x_i^{k+1}}{\lVert x_i^{k+1} \rVert_2} & \text{ if } x_i^{k+1} \neq 0 \\ \in \dfrac{\lambda}{\rho} \cdot \mathbb{B}(0,1) & \text{ if } x_i^{k+1} = 0 \end{cases}
\end{align*}
i.e.
$$x_i^{k+1} = (A_i^TA_i + \tilde{\lambda}I)^{-1} A_i^Tv_i \quad \text{ with $\tilde{\lambda}$ satisfying } \tilde{\lambda}\rho \lVert x_i^{k+1} \rVert_2 = \lambda, \text{ if } x_i^{k+1} \neq 0.$$
Again, it is claimed that (note the different norm, compared with the ordinary Lasso, coming from the penalty term)
$$x^{k+1}_i = 0 \Longleftrightarrow \lVert A_i^T v_i \rVert_2 \leqslant \dfrac{\lambda}{\rho}$$
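A small sketch of the resulting $x_i$-update (my own illustration): first apply the zero test above, and otherwise solve the subproblem, here handed to CVXPY instead of solving the $\tilde{\lambda}$ equation directly.
\begin{verbatim}
import numpy as np
import cvxpy as cp

def group_lasso_x_update(A_i, v_i, lam, rho):
    """x_i-update of the splitting-across-features group Lasso."""
    if np.linalg.norm(A_i.T @ v_i, 2) <= lam / rho:
        return np.zeros(A_i.shape[1])  # zero test: ||A_i^T v_i||_2 <= lambda/rho
    x = cp.Variable(A_i.shape[1])
    obj = 0.5 * cp.sum_squares(A_i @ x - v_i) + (lam / rho) * cp.norm(x, 2)
    cp.Problem(cp.Minimize(obj)).solve()
    return x.value
\end{verbatim}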
\subsection{Example: SVM}
The vertical splitting version of SVM is
\begin{align*}
& \text{minimize} \quad \mathbf{1}^T (\mathbf{1} + \sum\limits_{i=1}^N A_ix_i)_+ + \lambda \sum\limits_{i=1}^N \lVert x_i \rVert_2^2
\end{align*}
ADMM iterations are
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \dfrac{1}{2} \lVert A_ix_i - \underbrace{(A_ix_i^k - \overline{Ax}^k + \overline{z}^k - u^k)}_{v_i} \rVert_2^2 + \dfrac{\lambda}{\rho} \lVert x_i \rVert_2^2 \right\} \\
& = \left( A_i^TA_i + \dfrac{2\lambda}{\rho}I \right)^{-1} A_i^T v_i \\
& \leftarrow \text{ $N$ parallel ridge regressions} \\
\overline{z}^{k+1} & = \argmin\limits_{\overline{z}} \left\{ \mathbf{1}^T (\mathbf{1} + N\overline{z})_+ + \dfrac{N\rho}{2} \lVert \overline{z} - \underbrace{(\overline{Ax}^{k+1} + u^k)}_{s} \rVert_2^2 \right\} \\
& = \argmin\limits_{\overline{z}} \left\{ \sum\limits_{j=1}^m \left( (1+N\overline{z}_j)_+ + \dfrac{N\rho}{2}(\overline{z}_j-s_j)^2 \right) \right\} \\
u^{k+1} & = u^k + (\overline{Ax}^{k+1}-\overline{z}^{k+1})
\end{align*}
\begin{figure}[H]
\centering
\begin{tikzpicture}
\draw[blue] (-6, 0) -- (-0.5, 0);
\draw[->] (-0.5, 0) -- (4, 0) node[right] {$\overline{z}_j$};
\draw[->] (0, -2) -- (0, 6);
\draw[dashed] (-0.5, -2) -- (-0.5, 6);
\node (one_over_N) at (-0.9,-0.6) {$\dfrac{1}{N}$};
\draw[domain=-0.5:2.5, smooth, variable=\x, blue] plot ({\x}, {1+2*\x});
\draw[dashed, domain=-4:1, smooth, variable=\x, red] plot ({\x}, {(\x+1.5)*(\x+1.5)});
\draw[dashed, domain=-1.5:3.5, smooth, variable=\x, red] plot ({\x}, {(\x-1)*(\x-1)});
\draw[dashed, domain=-2:1, smooth, variable=\x, cyan] plot ({\x}, {-3*\x+0.75});
\node at (3,-2) {slope $N\rho\left(-\dfrac{1}{N}-s_j\right)$};
\node[circle,fill,inner sep=2.5pt] (tangent) at (-0.5,2.25) {};
\end{tikzpicture}
\caption{sketch of $\overline{z}$-update of vertical splitting SVM}
\end{figure}
The $\overline{z}$-update thus splits into componentwise problems, namely minimizing
$$(1+N\overline{z}_j)_+ + \dfrac{N\rho}{2}(\overline{z}_j-s_j)^2$$
over each $\overline{z}_j$, whose solution is easily computed:
$$\overline{z}_j = \begin{cases}
s_j - \dfrac{1}{\rho} & \text{ if } s_j > -\dfrac{1}{N} + \dfrac{1}{\rho} \\
-\dfrac{1}{N} & \text{ if } s_j \in [-\dfrac{1}{N}, -\dfrac{1}{N} + \dfrac{1}{\rho}] \\
s_j & \text{ if } s_j < -\dfrac{1}{N}
\end{cases}
$$
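This componentwise rule is straightforward to vectorize (a minimal sketch of my own):
\begin{verbatim}
import numpy as np

def svm_zbar_update(s, N, rho):
    """Componentwise z-bar update for the splitting-across-features SVM."""
    return np.where(s > -1.0 / N + 1.0 / rho, s - 1.0 / rho,  # right branch
           np.where(s < -1.0 / N, s,                          # left branch
                    -1.0 / N))                                # flat middle piece
\end{verbatim}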
% \subsection{Generalized Additive Models}
% Consider generalized additive (additive w.r.t. features) model $F$, i.e.
% $$F(a) = \sum\limits_{j=1}^n F_j(a_j)$$
% where $a$ is a feature vector, $F_j: \mathbb{R} \to \mathbb{R}$ are feature functions.
\bibliographystyle{ieeetr}
\bibliography{references}
\end{document}