% to copy from the backup section of federated_learning_optim.tex
\begin{document}
%----------------------------------------------------------------------------------------
% TITLE PAGE
%----------------------------------------------------------------------------------------
\title[Personalization]{Talk 3: Personalization in Federated Learning}
\date{2021-5-27}
% \institute[北京航空航天大学] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
% {
% School of Mathematical Sciences \\ % Your institution for the title page
% \medskip
% \textit{wenh06@gmail.com} % Your email address
% 北京航空航天大学 \\
% School of Mathematical Sciences \qquad 北京航空航天大学
% }
% \logo{\includegraphics[height=1.5cm]{logo}}
% \logoii{\includegraphics[height=1cm]{logo2}}
% \date{\footnotesize April 13, 2021} % Date, can be changed to a custom date
\setlength{\belowdisplayskip}{5pt} \setlength{\belowdisplayshortskip}{5pt}
\setlength{\abovedisplayskip}{5pt} \setlength{\abovedisplayshortskip}{5pt}
%------------------------------------------------
\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}
\begin{frame}
\frametitle{Personalization for FL}
{\bfseries When does one need personalization?}
\vspace{0.2em}
\noindent --- When data across clients are sufficiently non-IID, which is the more realistic setting.
\pause
\vspace{0.8em}
Means of personalization:
\begin{itemize}
\item Federated Multi-Task Learning (+ regularization / proximal term), e.g. \cite{smith2017mocha}
{\pgfsetfillopacity{0.4}
\item Model-Agnostic Meta Learning, e.g. \cite{finn2017maml}
\item Local Fine-tuning.}
\end{itemize}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Personalization for FL}
\begin{itemize}
\item Mixture of global and local \cite{hanzely2020federated}, with $\overline{x} = \frac{1}{N}\sum\limits_{i=1}^N x_i$:
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - {\color{red} \overline{x}} \rVert^2$$
\item pFedMe (bi-level) \cite{t2020pfedme} (and similarly EASGD \cite{zhang2015easgd}):
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N F_i(x), \\
& \text{where} \quad F_i(x) = \min\limits_{x_i} \left\{ f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i - {\color{red} x}\rVert^2 \right\}
\end{align*}
\item FedU \cite{dinh2021fedu}, with $\mathcal{N}_i$ the set of clients connected to client $i$:
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N {\color{red} \sum\limits_{j\in\mathcal{N}_i}} \lVert x_i - {\color{red} x_j} \rVert^2$$
\end{itemize}
\end{frame}
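%------------------------------------------------
\begin{frame}
\frametitle{Personalization for FL}
A fact worth recording for the pFedMe formulation (standard for Moreau envelopes, and the basis of \cite{t2020pfedme}; assuming each $f_i$ is convex): $F_i$ is the Moreau envelope of $f_i$, and its gradient is available from the inner minimizer,
$$\nabla F_i(x) = \lambda \left( x - \operatorname{prox}_{f_i/\lambda}(x) \right), \quad \text{where } \operatorname{prox}_{f_i/\lambda}(x) = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i - x \rVert^2 \right\},$$
so the outer problem can be attacked with gradient methods once the inner problem is solved (possibly approximately).
\end{frame}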
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
The {\color{red} ``weak'' consensus problem} (originally stated as the ``mixture'' FL problem)
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$$
can be reformulated as the constrained optimization problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - z \rVert^2 \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
or equivalently as the following problem,
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i \rVert^2 -\dfrac{\lambda N}{2} \lVert z \rVert^2 \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
which is the nonconvex sharing problem considered in \cite{hong2016convergence} (Eq. (3.2)). Note the difference between the formulations of the sharing problem in \cite{hong2016convergence} (Section 3) and in \cite{boyd2011distributed} (Section 7.3).
\vspace{0.5em}
The ``Flexible ADMM'' algorithm proposed in \cite{hong2016convergence} (Algorithm 4) updates the $x_i$ in a Gauss--Seidel fashion, which is non-trivial (or impossible) to parallelize. The Jacobi-style update, on the other hand, seems to have no convergence guarantee.
% \begin{question}
% \begin{itemize}
% \item what is the case when $f_i$ being nonconvex
% \item inexact ADMM iterations (e.g. mini-batch SGD)
% \end{itemize}
% \end{question}
\end{frame}
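%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
A quick check of this equivalence (a direct expansion, not taken from the cited papers): using the constraint $\sum\limits_{i=1}^N x_i = Nz$,
\begin{align*}
\sum\limits_{i=1}^N \lVert x_i - z \rVert^2
&= \sum\limits_{i=1}^N \lVert x_i \rVert^2 - 2 \Big\langle \sum\limits_{i=1}^N x_i, z \Big\rangle + N \lVert z \rVert^2 \\
&= \sum\limits_{i=1}^N \lVert x_i \rVert^2 - 2N \lVert z \rVert^2 + N \lVert z \rVert^2 = \sum\limits_{i=1}^N \lVert x_i \rVert^2 - N \lVert z \rVert^2,
\end{align*}
so, after multiplying by $\frac{\lambda}{2}$, the two constrained formulations have the same objective.
\end{frame}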
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
Under certain assumptions (e.g.\ convexity of each $f_i$; see the check on the next slide), this problem is a (split?) DC (difference-of-convex) programming problem {\color{red} with linear constraints}.
\begin{align*}
& \text{minimize} \quad \colorbox{green}{$\sum\limits_{i=1}^N \left( f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i \rVert^2 \right)$} - \lambda \colorbox{pink}{$\dfrac{N}{2} \lVert z \rVert^2$} \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
Write $\widetilde{f}_i (x_i) = f_i(x_i) + \dfrac{\lambda}{2} \lVert x_i \rVert^2$ and $r(z) = \dfrac{N}{2} \lVert z \rVert^2$, so that the objective reads $\sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z)$.
\end{frame}
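%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
A quick verification of the DC structure (under the assumption that each $f_i$ is convex): each $\widetilde{f}_i = f_i + \frac{\lambda}{2} \lVert \cdot \rVert^2$ is then $\lambda$-strongly convex, and $(x_1,\cdots,x_N,z) \mapsto \lambda r(z)$ is convex, so the objective
$$\sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z)$$
is indeed a difference of two convex functions, minimized subject to the linear constraint $Nz - \sum\limits_{i=1}^N x_i = 0$.
\end{frame}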
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
The original unconstrained problem is studied in \cite{hanzely2020federated} using the so-called {\color{red} loopless} local gradient descent (L2GD) method, under the assumptions that
\begin{itemize}
\item $f_i$ are $L$-smooth (i.e.\ $\nabla f_i$ is $L$-Lipschitz), so that
% $$\lVert \nabla f_i(x) - \nabla f_i(y) \rVert \leqslant L \lVert x-y \rVert$$
$$f_i(y) \leqslant f_i(x) + \langle \nabla f_i(x), y-x\rangle + \dfrac{L}{2} \lVert x-y \rVert^2$$
\item $f_i$ are $\mu$-strongly convex
$$f_i(y) \geqslant f_i(x) + \langle \nabla f_i(x), y-x\rangle + \dfrac{\mu}{2} \lVert x-y \rVert^2$$
\end{itemize}
{\color{red} Looplessness} is one of the key contributions of \cite{hanzely2020federated}: {\bfseries inner (local) loops are replaced with probabilistic gradient updates}.
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
Rewrite $\sum\limits_{i=1}^N f_i(x_i) + \frac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$ as $f(x) + \lambda \psi(x)$, where $x = (x_1,\cdots,x_N)$, $f(x) = \sum\limits_{i=1}^N f_i(x_i)$, and $\psi(x) = \frac{1}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$. The L2GD step is
$$x^{k+1} = x^k - \alpha G(x^k)$$
where
$$
G(x^k) = \begin{cases}
\dfrac{\nabla f(x^k)}{1-p} & \text{ with probability } 1-p \\
\dfrac{\lambda \nabla \psi(x^k)}{p} & \text{ with probability } p
\end{cases}
$$
Blockwise at client $i$, this reads
$$x_i^{k+1} = x_i^k - \beta \nabla f_i(x_i^k), \quad x_i^{k+1} = (1-\gamma)x_i^k + \gamma \overline{x}^k$$
with probabilities $1-p$ and $p$ respectively (see the derivation on the next slide).
\end{frame}
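%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
Derivation of the blockwise form (a direct computation with the definitions above): since $\sum\limits_{j=1}^N (x_j - \overline{x}) = 0$, the block of $\nabla\psi$ at client $i$ is
$$\nabla_{x_i} \psi(x) = (x_i - \overline{x}) - \dfrac{1}{N} \sum\limits_{j=1}^N (x_j - \overline{x}) = x_i - \overline{x},$$
so the two branches of $x^{k+1} = x^k - \alpha G(x^k)$ read, at client $i$,
$$x_i^{k+1} = x_i^k - \dfrac{\alpha}{1-p} \nabla f_i(x_i^k), \qquad x_i^{k+1} = x_i^k - \dfrac{\alpha\lambda}{p} (x_i^k - \overline{x}^k),$$
i.e.\ the local form with $\beta = \dfrac{\alpha}{1-p}$ and $\gamma = \dfrac{\alpha\lambda}{p}$.
\end{frame}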
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL}
\begin{block}{Questions}
\begin{itemize}
\item[1.] Can the assumptions on the objective functions be loosened?
\item[2.] DCA with linear constraints? The (augmented) Lagrangian is
{\scriptsize
$$\mathcal{L}_{\rho}(x,z,y) = \sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z) + \langle y, Nz - \sum\limits_{i=1}^N x_i \rangle + \framebox{$\dfrac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N x_i \rVert^2$} $$
}
\item[3.] Can DCA (or its stochastic / accelerated variants) achieve better convergence?
\item[4.] more \dots
\end{itemize}
\end{block}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research}
Because of the boxed term $\dfrac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N x_i \rVert^2$, which couples all the $x_i$, the only choice that fits the distributed setting is to update via the Jacobi method, as follows:
{\footnotesize
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \widetilde{f}_i(x_i) - \langle y^k, x_i \rangle + \colorbox{pink}{$\displaystyle \frac{\rho}{2} \lVert Nz^k - \sum\limits_{j\neq i} x_j^k - x_i \rVert^2$} \right\} \\
z^{k+1} & = \argmin_{z} \left\{ \langle y^k, Nz \rangle + \frac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N x_i^{k+1} \rVert^2 - \lambda r(z) \right\} \\
y^{k+1} & = y^k + \beta (Nz^{k+1} - \sum\limits_{i=1}^N x_i^{k+1})
\end{align*}
}
\end{frame}
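%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research}
A side remark (direct computation, under the additional assumption $\rho N > \lambda$, which makes the $z$-subproblem strongly convex): writing $s = \sum\limits_{i=1}^N x_i^{k+1}$ and recalling $r(z) = \frac{N}{2} \lVert z \rVert^2$, setting the gradient of the $z$-subproblem to zero gives
$$N y^k + \rho N (N z - s) - \lambda N z = 0 \quad\Longrightarrow\quad z^{k+1} = \dfrac{\rho s - y^k}{\rho N - \lambda},$$
so the $z$-update is available in closed form at the server.
\end{frame}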
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research II}
One can add more intermediate variables and reformulate the DC-like sharing problem as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z) \\
& \text{subject to} \quad x_i' = x_i, \quad i=1,\cdots,N \\
& \phantom{\text{subject to}} \quad Nz = \sum\limits_{i=1}^N x_i'
\end{align*}
The augmented Lagrangian of the above problem is
{\footnotesize
\begin{multline*}
\sum\limits_{i=1}^N \widetilde{f}_i(x_i) - \lambda r(z) + \sum\limits_{i=1}^N \langle y_i, x_i'-x_i\rangle + \sum\limits_{i=1}^N \dfrac{\rho_i}{2} \lVert x_i'-x_i \rVert^2 \\
+ \langle y, Nz-\sum\limits_{i=1}^N x_i'\rangle + \dfrac{\rho}{2} \left\lVert Nz-\sum\limits_{i=1}^N x_i' \right\rVert^2
\end{multline*}
}
\end{frame}
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research II}
The ADMM iterations for the above problem are
{\footnotesize
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ \widetilde{f}_i(x_i) + \frac{\rho_i}{2} \lVert x_i-(x_i')^k \rVert^2 - \langle y_i^k, x_i \rangle \right\} \\
(x_i')^{k+1} & = \argmin_{x_i'} \left\{ \langle y_i^k-y^k, x_i' \rangle + \frac{\rho_i}{2} \lVert x_i^{k+1}-x_i' \rVert^2 \right. \\
& \hspace{7em} \left. + \colorbox{pink}{$\displaystyle \frac{\rho}{2} \lVert Nz^k-\sum\limits_{j\neq i} (x_j')^{k} - x_i' \rVert^2$} \right\} \\
z^{k+1} & = \argmin_{z} \left\{ \langle y^k, Nz \rangle + \frac{\rho}{2} \lVert Nz - \sum\limits_{i=1}^N (x_i')^{k+1} \rVert^2 - \lambda r(z) \right\} \\
y_i^{k+1} & = y_i^k + \beta ((x_i')^{k+1} - x_i^{k+1}) \\
y^{k+1} & = y^k + \beta (Nz^{k+1} - \sum\limits_{i=1}^N (x_i')^{k+1})
\end{align*}
}
\end{frame}
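%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research II}
A rewriting of the $x_i$-update (completion of the square, nothing beyond the iterations above): the subproblem is exactly a proximal step,
$$x_i^{k+1} = \operatorname{prox}_{\rho_i^{-1} \widetilde{f}_i} \left( (x_i')^k + \dfrac{y_i^k}{\rho_i} \right), \quad \text{where } \operatorname{prox}_{\rho^{-1} g}(v) := \argmin_{u} \left\{ g(u) + \dfrac{\rho}{2} \lVert u - v \rVert^2 \right\},$$
which is fully local to client $i$ and could in principle be computed inexactly, e.g.\ by a few steps of (mini-batch) SGD on $\widetilde{f}_i$.
\end{frame}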
%------------------------------------------------
\begin{frame}
\frametitle{Mixture FL --- Research III}
To be updated \dots
% TODO: collect discussions on personalization via DC problem and more...
\end{frame}
%------------------------------------------------
\begin{frame}[allowframebreaks]
\frametitle{References}
{\footnotesize
\bibliographystyle{ieeetr}
\bibliography{references}
}
\end{frame}
%------------------------------------------------
\end{document}