% to copy from the backup section of federated_learning_optim.tex
\begin{document}
%----------------------------------------------------------------------------------------
% TITLE PAGE
%----------------------------------------------------------------------------------------
\title[Personalization]{Talk 3: Personalization in Federated Learning}
\date{2021-5-27}
% \institute[北京航空航天大学] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
% {
% School of Mathematical Sciences \\ % Your institution for the title page
% \medskip
% \textit{wenh06@gmail.com} % Your email address
% 北京航空航天大学 \\
% School of Mathematical Sciences \qquad 北京航空航天大学
% }
% \logo{\includegraphics[height=1.5cm]{logo}}
% \logoii{\includegraphics[height=1cm]{logo2}}
% \date{\footnotesize April 13, 2021} % Date, can be changed to a custom date
\setlength{\belowdisplayskip}{5pt} \setlength{\belowdisplayshortskip}{5pt}
\setlength{\abovedisplayskip}{5pt} \setlength{\abovedisplayshortskip}{5pt}
%------------------------------------------------
\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}
\begin{frame}
\frametitle{Personalization for FL}
{\bfseries When does one need personalization?}
\vspace{0.2em}
\noindent --- When data across clients are sufficiently non-IID, which is the more realistic setting.
\pause
\vspace{0.8em}
Means of personalization:
\begin{itemize}
\item Federated Multi-Task Learning (+ regularization / proximal term), e.g. \cite{smith2017mocha}
{\pgfsetfillopacity{0.4}
\item Model-Agnostic Meta Learning, e.g. \cite{finn2017maml}
\item Local Fine-tuning.}
\end{itemize}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Personalization for FL}
\begin{itemize}
\item Mixture of global and local \cite{hanzely2020federated}, with $\overline{x} = \frac{1}{N}\sum\limits_{i=1}^N x_i$:
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - {\color{red} \overline{x}} \rVert^2$$
\item pFedMe (bi-level) \cite{t2020pfedme} (and similarly EASGD \cite{zhang2015easgd}):
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N F_i(x), \\
& \text{where} \quad F_i(x) = \min\limits_{x_i} \left\{ f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i - {\color{red} x}\rVert^2 \right\}
\end{align*}
\item FedU \cite{dinh2021fedu}, with $\mathcal{N}_i$ the set of clients connected to client $i$:
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N {\color{red} \sum\limits_{j\in\mathcal{N}_i}} \lVert x_i - {\color{red} x_j} \rVert^2$$
\end{itemize}
\end{frame}
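%------------------------------------------------
\begin{frame}
\frametitle{Personalization for FL}
A fact worth recording for the pFedMe formulation (standard for Moreau envelopes, and the basis of \cite{t2020pfedme}; assuming each $f_i$ is convex): $F_i$ is the Moreau envelope of $f_i$, and its gradient is available from the inner minimizer,
$$\nabla F_i(x) = \lambda \left( x - \operatorname{prox}_{f_i/\lambda}(x) \right), \quad \text{where } \operatorname{prox}_{f_i/\lambda}(x) = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i - x \rVert^2 \right\},$$
so the outer problem can be attacked with gradient methods once the inner problem is solved (possibly approximately).
\end{frame}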
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
The {\color{red} ``weak'' consensus problem} (originally stated as the ``mixture'' FL problem)
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$$
can be reformulated as the constrained optimization problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - z \rVert^2 \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
or equivalently as the following problem,
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i \rVert^2 -\dfrac{\lambda N}{2} \lVert z \rVert^2 \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
which is the nonconvex sharing problem considered in \cite{hong2016convergence} (Eq. (3.2)). Note the difference between the formulations of the sharing problem in \cite{hong2016convergence} (Section 3) and in \cite{boyd2011distributed} (Section 7.3).
\vspace{0.5em}
The ``Flexible ADMM'' algorithm proposed in \cite{hong2016convergence} (Algorithm 4) updates the $x_i$ in a Gauss--Seidel fashion, which is non-trivial (or impossible) to parallelize. The Jacobi-style update, on the other hand, seems to have no convergence guarantee.
% \begin{question}
% \begin{itemize}
% \item what is the case when $f_i$ being nonconvex
% \item inexact ADMM iterations (e.g. mini-batch SGD)
% \end{itemize}
% \end{question}
\end{frame}
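%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
A quick check of this equivalence (a direct expansion, not taken from the cited papers): using the constraint $\sum\limits_{i=1}^N x_i = Nz$,
\begin{align*}
\sum\limits_{i=1}^N \lVert x_i - z \rVert^2
&= \sum\limits_{i=1}^N \lVert x_i \rVert^2 - 2 \Big\langle \sum\limits_{i=1}^N x_i, z \Big\rangle + N \lVert z \rVert^2 \\
&= \sum\limits_{i=1}^N \lVert x_i \rVert^2 - 2N \lVert z \rVert^2 + N \lVert z \rVert^2 = \sum\limits_{i=1}^N \lVert x_i \rVert^2 - N \lVert z \rVert^2,
\end{align*}
so, after multiplying by $\frac{\lambda}{2}$, the two constrained formulations have the same objective.
\end{frame}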
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
Under certain assumptions (e.g.\ convexity of each $f_i$; see the check on the next slide), this problem is a (split?) DC (difference-of-convex) programming problem {\color{red} with linear constraints}.
\begin{align*}
& \text{minimize} \quad \colorbox{green}{$\sum\limits_{i=1}^N \left( f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i \rVert^2 \right)$} - \lambda \colorbox{pink}{$\dfrac{N}{2} \lVert z \rVert^2$} \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
Write $\widetilde{f}_i (x_i) = f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i \rVert^2$ and $r(z) = \dfrac{N}{2} \lVert z \rVert^2$, so that the objective reads $\sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z)$.
\end{frame}
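%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
A quick verification of the DC structure (under the assumption that each $f_i$ is convex): each $\widetilde{f}_i = f_i + \frac{\lambda}{2} \lVert \cdot \rVert^2$ is then $\lambda$-strongly convex, and $(x_1,\cdots,x_N,z) \mapsto \lambda r(z)$ is convex, so the objective
$$\sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z)$$
is indeed a difference of two convex functions, minimized subject to the linear constraint $Nz - \sum\limits_{i=1}^N x_i = 0$.
\end{frame}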
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
The original unconstrained problem is studied in \cite{hanzely2020federated} using the so-called {\color{red} loopless} local gradient descent (L2GD) method, under the assumptions that
\begin{itemize}
\item $f_i$ are $L$-smooth (i.e.\ $\nabla f_i$ is $L$-Lipschitz), so that
% $$\lVert \nabla f_i(x) - \nabla f_i(y) \rVert \leqslant L \lVert x-y \rVert$$
$$f_i(y) \leqslant f_i(x) + \langle \nabla f_i(x), y-x\rangle + \dfrac{L}{2} \lVert x-y \rVert^2$$
\item $f_i$ are $\mu$-strongly convex
$$f_i(y) \geqslant f_i(x) + \langle \nabla f_i(x), y-x\rangle + \dfrac{\mu}{2} \lVert x-y \rVert^2$$
\end{itemize}
{\color{red} Looplessness} is one of the key contributions of \cite{hanzely2020federated}: {\bfseries inner (local) loops are replaced with probabilistic gradient updates}.
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
Rewrite $\sum\limits_{i=1}^N f_i(x_i) + \frac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$ as $f(x) + \lambda \psi(x)$, where $x = (x_1,\cdots,x_N)$, $f(x) = \sum\limits_{i=1}^N f_i(x_i)$, and $\psi(x) = \frac{1}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$. The L2GD step is
$$x^{k+1} = x^k - \alpha G(x^k)$$
where
$$
G(x^k) = \begin{cases}
\dfrac{\nabla f(x^k)}{1-p} & \text{ with probability } 1-p \\
\dfrac{\lambda \nabla \psi(x^k)}{p} & \text{ with probability } p
\end{cases}
$$
Blockwise at client $i$, this reads
$$x_i^{k+1} = x_i^k - \beta \nabla f_i(x_i^k), \quad x_i^{k+1} = (1-\gamma)x_i^k + \gamma \overline{x}^k$$
with probabilities $1-p$ and $p$ respectively (see the derivation on the next slide).
\end{frame}
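%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
Derivation of the blockwise form (a direct computation with the definitions above): since $\sum\limits_{j=1}^N (x_j - \overline{x}) = 0$, the block of $\nabla\psi$ at client $i$ is
$$\nabla_{x_i} \psi(x) = (x_i - \overline{x}) - \dfrac{1}{N} \sum\limits_{j=1}^N (x_j - \overline{x}) = x_i - \overline{x},$$
so the two branches of $x^{k+1} = x^k - \alpha G(x^k)$ read, at client $i$,
$$x_i^{k+1} = x_i^k - \dfrac{\alpha}{1-p} \nabla f_i(x_i^k), \qquad x_i^{k+1} = x_i^k - \dfrac{\alpha\lambda}{p} (x_i^k - \overline{x}^k),$$
i.e.\ the local form with $\beta = \dfrac{\alpha}{1-p}$ and $\gamma = \dfrac{\alpha\lambda}{p}$.
\end{frame}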
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
\begin{block}{Questions}
\begin{itemize}
\item[1.] Can the assumptions on the objective functions be loosened?
\item[2.] DCA with linear constraints? The (augmented) Lagrangian is
{\scriptsize
$$\mathcal{L}_{\rho}(x,z,y) = \sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z) + \langle y, Nz - \sum\limits_{i=1}^N x_i \rangle + \framebox{$\dfrac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N x_i \rVert^2$} $$
}
\item[3.] Can DCA (or its stochastic / accelerated variants) achieve better convergence?
\item[4.] more \dots
\end{itemize}
\end{block}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research}
Because of the boxed term $\dfrac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N x_i \rVert^2$, which couples all the $x_i$, the only choice that fits the distributed setting is to update via the Jacobi method, as follows:
{\footnotesize
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \widetilde{f}_i(x_i) - \langle y^k, x_i \rangle + \colorbox{pink}{$\displaystyle \frac{\rho}{2} \lVert Nz^k - \sum\limits_{j\neq i} x_j^k - x_i \rVert^2$} \right\} \\
z^{k+1} & = \argmin_{z} \left\{ \langle y^k, Nz \rangle + \frac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N x_i^{k+1} \rVert^2 - \lambda r(z) \right\} \\
y^{k+1} & = y^k + \beta (Nz^{k+1} - \sum\limits_{i=1}^N x_i^{k+1})
\end{align*}
}
\end{frame}
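%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research}
A side remark (direct computation, under the additional assumption $\rho N > \lambda$, which makes the $z$-subproblem strongly convex): writing $s = \sum\limits_{i=1}^N x_i^{k+1}$ and recalling $r(z) = \frac{N}{2} \lVert z \rVert^2$, setting the gradient of the $z$-subproblem to zero gives
$$N y^k + \rho N (N z - s) - \lambda N z = 0 \quad\Longrightarrow\quad z^{k+1} = \dfrac{\rho s - y^k}{\rho N - \lambda},$$
so the $z$-update is available in closed form at the server.
\end{frame}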
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research II}
One can add more intermediate variables and reformulate the DC-like sharing problem as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z) \\
& \text{subject to} \quad x_i' = x_i, \quad i=1,\cdots,N \\
& \phantom{\text{subject to}} \quad Nz = \sum\limits_{i=1}^N x_i'
\end{align*}
The augmented Lagrangian of the above problem is
{\footnotesize
\begin{multline*}
\sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z) + \sum\limits_{i=1}^N \langle y_i, x_i'-x_i\rangle + \sum\limits_{i=1}^N \dfrac{\rho_i}{2} \lVert x_i'-x_i \rVert^2 \\
+ \langle y, Nz-\sum\limits_{i=1}^N x_i'\rangle + \dfrac{\rho}{2} \left\lVert Nz-\sum\limits_{i=1}^N x_i' \right\rVert^2
\end{multline*}
}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research II}
The ADMM iterations for the above problem are
{\footnotesize
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \widetilde{f}_i(x_i) + \frac{\rho_i}{2} \lVert x_i-(x_i')^k \rVert^2 - \langle y_i^k, x_i \rangle \right\} \\
(x_i')^{k+1} & = \argmin_{x_i'} \left\{ \langle y_i^k-y^k, x_i' \rangle + \frac{\rho_i}{2} \lVert x_i^{k+1}-x_i' \rVert^2 \right. \\
& \hspace{7em} \left. + \colorbox{pink}{$\displaystyle \frac{\rho}{2} \lVert Nz^k-\sum\limits_{j\neq i} (x_j')^{k} - x_i' \rVert^2$} \right\} \\
z^{k+1} & = \argmin_{z} \left\{ \langle y^k, Nz \rangle + \frac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N (x_i')^{k+1} \rVert^2 - \lambda r(z) \right\} \\
y_i^{k+1} & = y_i^k + \beta ((x_i')^{k+1} - x_i^{k+1}) \\
y^{k+1} & = y^k + \beta (Nz^{k+1} - \sum\limits_{i=1}^N (x_i')^{k+1})
\end{align*}
}
\end{frame}
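%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research II}
A rewriting of the $x_i$-update (completion of the square, nothing beyond the iterations above): the subproblem is exactly a proximal step,
$$x_i^{k+1} = \operatorname{prox}_{\rho_i^{-1} \widetilde{f}_i} \left( (x_i')^k + \dfrac{y_i^k}{\rho_i} \right), \quad \text{where } \operatorname{prox}_{\rho^{-1} g}(v) := \argmin_{u} \left\{ g(u) + \dfrac{\rho}{2} \lVert u - v \rVert^2 \right\},$$
which is fully local to client $i$ and could in principle be computed inexactly, e.g.\ by a few steps of (mini-batch) SGD on $\widetilde{f}_i$.
\end{frame}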
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research III}
To be updated \dots
% TODO: collect discussions on personalization via DC problem and more...
\end{frame}
%------------------------------------------------
\begin{frame}[allowframebreaks]
\frametitle{References}
{\footnotesize
\bibliographystyle{ieeetr}
\bibliography{references}
}
\end{frame}
%------------------------------------------------
\end{document}