From 23068f56d6ccf9d3fdcd29771c09a37dfe456371 Mon Sep 17 00:00:00 2001 From: happy2048 <2270020588@qq.com> Date: Tue, 14 Mar 2023 15:46:48 +0800 Subject: [PATCH] koord-scheduler: add gpu topology-aware scheduling proposal (#1116) Signed-off-by: happy2048 <2270020588@qq.com> --- docs/images/gpu-topology-aware-scheduling.png | Bin 0 -> 55487 bytes .../20230314-gpu-topology-aware-scheduling.md | 275 ++++++++++++++++++ 2 files changed, 275 insertions(+) create mode 100644 docs/images/gpu-topology-aware-scheduling.png create mode 100644 docs/proposals/scheduling/20230314-gpu-topology-aware-scheduling.md diff --git a/docs/images/gpu-topology-aware-scheduling.png b/docs/images/gpu-topology-aware-scheduling.png new file mode 100644 index 0000000000000000000000000000000000000000..6885bd14e789c485321d73f3f3bcfc937f4813e0 GIT binary patch literal 55487 zcmeFZ^;^{6_XP^WASKc*4H6OxNOwp|cOxa;-KeyH#DF3x-7THc3esKD-Cg(KTR-3D z{ss4!`#i%q&xmJUuXFZZd+oLM7^)~QiH=H)3I_*=E-fXl3+Q0a!_e{R2_GsS`nFn!Zm%Q`?!iAth&yevnq@0&houZnULbU z6)v}IsM}NfAZ)Hj6EvL_luZzS;`qLJGx#(I^36|(4NTo%4~Sk9JU5_uAne5bJ%FDw z0i%rSbkV~j7O&>dBzx3M z%c>!`-u{3yUy+!oq?>$f-?e|49x+>VhPFQ>RBeR#w?3AEmK&U6m?GJy%u2}*9&DV2 z)Npe>(asv^#(nQZ5mbPf{G@k{M${Va;B^2KlZa`B8ZVFj3R=<6 zxNJ`-9>-GsS5% z9!>7*@16^L2gZv7IcZKtL&vB^WqtSn&5fS3~LvrufGS&2xg;!hNw6;V%-hcR; z9~04W(eI*#=^*->$v&>7_}za0`wvJQXH3s|epRhQ=t> z(v|iCXY?=*Mk@jZpGhZC4VHbIKpHtd8C1YalvxMwTL$9~@|rtXtDBpcsiZ6AR}&w{ zXOp{KpNJn3r-(6yY5N(Fn-&s@otR;KxL!}a(L1>)jk+Af)`=i(>o=k+L(`;IM@pPP zDI~LMjbzyWwdu)QTKAx_1LJ!vf(N*zF0~6)RpnLXouQ3GEgl$aBCKS>eY+=SLs<#- zEVU=l4hcS%dX)#mysoyU79q`joSxjkjtF57^PR&%Qt>(|9Q^g|?QN%B-J65T<#c` zm{3Yej(3rSRAwPBeuvah3QAF62Z>W3NHNAlnozzLkNhFNN}bisXo66Vo+s)n5=lk( z5lxE35y|77GnHMm0)3FOlHLfkN<=bEkRI)WD9#|t8+=|wW^sbFzF&@&*t6kd8FYiP z>*$rFKB1?Ea4KDdi9D4tM8n@ab9DDinG!1#6YCVXjm9hRpqo5 zret|V>SQ)4PV(~#KIPTsy)Veh3mpv`=^Z)ECDo|UKh-UYDwL>xa`n-;$NSLv(DCq* zBf zOYS38P8WVPdA*mj5;hrg=}lRKZ#XLrwTYQit0UAU_$0ifwno#exUK9t&WW8k7&$Co 
z6k9e7(O79)m2=Qp;#*JKr5g>}N&nEvde^`i~Gb@O%7I&nQ6y-3??+q_-DS@l_4 z0a5{sMw3PhPdv}A<0&G-_MldyWi}Z5BQ{$$!HuN#ietn}*<-8Yl1;y7gl3r56wm0N zPuSVFFT_#~Kl|~l%Rb8}c@cqnrn~h`#GvZepD)cL>Vh^N9g%mEsIzS^l-76Vmc4ao zIC!|U)5+W^@Fe>Q))Va5lP5WHb#g@6t=W^=R&r5EuAc{gLVk*yS^bbtylEXVE>Eoe z;kasH_N}(eBeQnKJISNKBlUfeM2y784&QJj7_KaNpIqMy*iYE3x|#adA~S6@o!P1< zI#-%#MdMnj6}N`QRW{v>*AHs808ar^W^I3 zqujSSZ-1q?EtXSnuCLHL&MZ%Get9U$H@}_5b;|vT zZHMXDU7jhYkLDF?1gv(fc^V%ya=yj8)t$ZE(-Eu_bP$yF>A%AE7V};^=e+biK053> z<6A#VXpZ%Zyd%3Kptg}jeYs0`LDNFzPg2U9CJ;@ONLKTrRG^zbghZ9-n0taPk-UNZ zoTo1({^wMmC*E&tDs&#~!+@Qxlnz_e4pC}RSy7#K=eC$O&UXS2zabBxmOR>v+>c1a z8pM?ji3rmr@(4F>TXe|hp8eV&`IFd%H;gzQvx;)lp;D}pqt1wf*K=fj^y^-iW><*u z=4%lqI#-wNx7^b;3=Ykal#!t%D$iH78mz8=oI9Z$zmrDs!k(iGqG3^JNH9sDQLvOB zlOs#jjlZU(FP&5z&(`u*veroPwnCh~ z=g=j|r*PK3`c2OOt5i5hDtDFn#vxOu64d>qd-KR;QPL-TM!re$HkYC_EhEj%@^D0K zxH2a+r%KK(vB~7}sBOP3)qH{7o&B@<=YcO=M}tgbg*Fu*Cy6SO)-?OIp5++ZexI2= zqCaw6y-{CQuFPMby^gy;N0Y<eS5~S zeS12T6Nbu%24p>CS#OswRW1~6`q&bxSkAPYJP;k;yLbGI7kTPdGNi9ov)QvbSYS2& zY&@>ef@H#Y$v5rh*EehEZRlK#qKv0@9CI}$OTn+MUuke5 z*%eQ|F3)oA%PGnz`u0u2x}c~sk7-(dM!)oCGIE)8&G+=8)-q?Xe<<0&HPo$YFLK_e zuD3qlGU^~@W%Nx|L!Hy!d85YZ``v;Phy2Ov)UxJdIBp7|{Vvvx<{5rMm(q*qf%MPq zW5HZx>EsM~5|%mRuHz_MQ(G!q(~*(9Q!@+m_RAV;25v7tbE!Wb=BvKuUO5k3x}if= zIOdT4Tqkdq>^;PvcQKb&mQ$Qc=2UkgH}dJoynP^rw~N>9w|C1$Cz>xor2cWOrd!XJ zvQ4;o-^Y1=Kfa;OQEjiuE5{qW`KRbM)YFj$=yjfl+Ur#n<2>W6vmv#EuIwADJ332t zRr9O!z7BL}{af{jZdbcM8x?8{4u!WwCdHP5zcC(3_GZP~y3Gno-8yaMVk_bi3}(;4wL_cAayYN@6hC$a}4^Z@@fjT+gR3 zYjD=Abvd@bQ|v~z_u1>WCt1^g*Zf&}=6I>wnPh6znqZc<=N#7w`-am6!4)gI!Plm_ zy@9R01pBO+00sDmoE+>p{#^BNX;~2A;5E)(?iDS2Va$~a7r21Cckyc-@CT5z2d-fQ z?yfj+NQM=k&}!epD|)xlJwOhi&X;FK4vHVy1ssc_JAev;v2T3GTA&pT&lVD551J2d zQ%z~JSFhk0!E0nV_+Se-D0l@Aeu%*j=uy%j|K33uNJIGd8iI2Fms`AZt8j3_aMI!; zs_yXH$tVqK#%Em`)SqNR=))fL>i8*}uzaRTrx6oXrE>Zt`_7~n6aAfB7F%{eT%O!V zD#o}6PI=cS?MzsdikwtCChXGwP$>##XtniZ49rs#1tl8p&qLT1i(aqJ z+0`YaaWV`peyu2aABqw!%z{I&7RDOkTBRln`SwaJ>uLZ?E?8t*^% z|NXF_6JFuj>DHqBxBh>h0-l9OkQF^5|NnpWciaEJ-3=5R%___ILj6YG^y8GZp4Fd8 
zI5H%xN^HF$8iFMs|6HO;5Qv5!_U1+ZJ7ETzXEswD7L(;!+jU#T%C(NBJcr%XroX?3 zI#_uh{p89r`}x^nn8p$7~az!n6sp7-_2({HRh-kUeLJlUf2yE#eFBN6g#7%9@xaGm#Yt34i) zHEj<=Ie5XE`$dUqXo`p zRMNInT&FA?duS87V-*A$RC46;=!c^Xd@m&MnKUV)_zy!=^HrrUk2j3h=oC_z){@;) zbnROa*}Tu~iC$R78E(FSp2R)Xkbm&#F@56Yj>*ncRTEW&lhs&Zk5hzGsjygUkijYY z06(?j1cu?bu3K(@e zULQO>9#-f>z~DD@(RJv+b3B`MON#LvQIfns-T0L*W@Guys#`@tz!ZOr&9paBf^aQe z&TVV#=KzAlbwA%;9D6^v7(S~$6S?ns`h~`LzFJ?Lfrh%#w1Uu$s^IOpu~xZBsPING zB1sre+&-gPzUJw;zSk>7mcN%)0m8r>CS*nT5ACGb;N@C7oozHf9JE%x(X_Lx+jzor zI$`{3vg6o_u8Z8SVTY=l5LbQq{dUb#TT-r{eC}jnTr_@Z#LiVlykmjv&T%_I^o$HqKT!0O5WE&v{0 z&M(NIJXYXzLd>?|;ANhA(PxJKldbWDmA-`fKC=s-gARh78t;q4+lz%qRBsHNe!e(d zOZA&F?+C+VGwyt-;z_`)z0B(H{zxPMqY4Mg(#MMTRJ~F~abIt;RNQU-7p0#vS?qtg znFtc;AqHH*|J^@JK=!Mo&wGbAXY;Iz`dFjHf_;kCMnr;(?{G{&qy)n0Ya-Q|52Qfu zmfay+9Jt%$e%DG1;ojh)rg80=0uZJ#w01QG1V)2mp$T$o9R#H>ngg*FDPc~dy%b3Ey_jZ*}g^u+nlzo|LL9neDwTuPBl+C(ABV~&^VH(6OuSutT$bPFfNy#qa2^pk_lyta-(CZ|yb@T8N*t~k z!<{l)^oCJ%Nqr>S(LP)zVguCJ@=wCO@#auc(PIPXlgqZ8)jKSviQWz5gH;^E;l`2#j(>^$w0!aA7e$5P zVk)zc31$P5;pF$5gK-1DPkT?Z$yDwk@y~-z!5_BvBMM#zRAWpsNPatw-xQ}e^`b$j zu*CjHIp?r9?xm%)pQuXBeiCm?e=E5^b~d9nXr6YJW%lPb&wFy<5B-hVa)gj$(*++# zn*j2wN4T5>fWI_lH1-P7M?Wm+dJu`Nlnk|&)Gihww|$!2XAWrq`SD)K=23sx1q5WJ zq+xW-FLA~JY5#62uw;3V@KC(DTy4kk^kRFkIXaCi4LcN*nRxKaUj1Iv4-Wsou=t$@ z@P;R!3+uoi1wpaNlVFD&sxxo!CPjX(Cnz=_i8!2on(zJN%prO3uJnp$y7qC=)h z^dautPgSxbtS8H98$Ouy-5($Y5(t6*4(E< zW2PPILfn7I{7dTpQM_mpa2acz+!YD`BL)9^K}#XLQVZQrjK9D53^_O}M7z0$T1`FS zxxJ<IRe@^q=tyl9y4R!k@ItLUC+vXeB4UqXLvPQ1|b|Ju; z7F?E&BCJh1A#idM~*w?+;d+u33TlP_)kVu$}NVURPej)&FS&F?J`xE z4^*4goL-+VMI9au2%$}((UMpgZUl|;Doq+k2|!Uf1Y@7PDz|o8>pvNw)t0B)__do+ z$^4P+Q_#P|rtku%)t^#kq5w9~@tyED+u7ypB-m{O zJsLSU7Jz^x1JJ!_wp9D>WZXb~05Aq-v7m=#O&QHNbn?erVI5rxX?zAgC-EFsV?PKv zEr!I+`oC-#_%6OD$W;3J(sB8tWG?`As~PnYeP$_w<*|Az&H@!lGTrR}$Q<8OvR6g&lWqtMXOvn!Hx zdp<{Tf(7oVt`#FBZl3A z-L>wxKIkGp{3V9b<9B;RuEF%}e+l&WMf3}(YLr|Gef%=sshzZ?&xyYGfrQ<~;Ywn` zP9ENf%KKlZmacpH>Wo)uePuw3MjLH+SH9WwPfURzFNU*g~#2^*$(k#1xRpD5T&lV 
zf|ebpsy(NwMAfHx7_|nck$k)Bd2G9Y(>kRz(d-Xw9eZGmQvlAHrct6VFx^GC*dTB7 zZOPuzQbUQs47MS9HN1Ou3Sm&x%2QC&lA=1O{#lVXjR_ zl&b|)E0pLr21?r)#9P+~<<9J@1i6p|=FMTsl|AZB^}A)L->tLSSDo>J%J3~mi(PZP zX->+BjIFX7ueMsp$+k7#bm12)rqJwFL%~<`on=s3nZN#K^b6I`{ky6*T(dY(ST0> z!fi=2ILwd;-_{1a&<3s9oWAtj8w3rfz$DA{6_Q}5BE8;-^*KNFA(x<{$x*nr*Uo2^ zg;^K+^5_AfyP4#2=1u+ue}uy?;cosLD*DnrqV5BH^+ZuF=v)!W6~2t%aX@1vJ1z`5y|}<*-QF4;0QcBmBTZ%w)a)_&!gNQ z+z1n)xN88&PcD==R)ab3&>Z>XO$2U0HkSb~aG7G!EuIw&rz6Ucd%H+Rk=eioFOJr( zZfp1Lf+zwxabMGS@zPEe4!ADP3zaU+fz(}U}63zKSkno~+DVr1R}Gwq>4ZuyL^qUE(tL zYh;YOx9&^xTEF8#N#?sa29<5>!XGt3OIj6XDEgfKl8XJjTTI|%qFM}y!#Msc-v4e7 z++_Jj;g9^O3wy-J;XhNg;8%pyejBeHJ!V4ot;6aujC~0L|<=M*IMcgErt&+zV% zH+1fJYR{^Ocl`0lY($6>dq4PSR-U{MEP9SEYp?kF03}<*H}3t$+O*BLWIgQ~`*gz_ zOD2`~b%sTfxK7_5oB&YyVMn6Uojl-D`Xgp{$yZd>ZpIDwKTO!iY}sRf9h8fEld9!j z*9yZG^?$2_OrZ}tuxrXWFY>Js_;#QSb1`Hjw8F2|JnJN=nIs9S{ zETOLr6mimX#XfZo=_1<(WtBNU2z*21)OtCPxQQ-7);g(7S(BQ}pA*Q`2k0I}Kn#E( z_V~4!=Mkc|i~~x9wACNMe(FJw{S1EhvJ=1W9&EljO{&6>!ND~5?wjS|H=Vn(gQ?iY2^~*$ zaB@p>Y{v)uj@e%d*=?9N`TCA14F1sh8~~)mXh#SyLj9Yj%)~mK=oKQtUx{pTD?BU4 zFcm!R6(Zkl#WWz)oPXIA;`^g-7~S)wLh9HV*BjO_ImbRz3o0ADHtw65+|*MjSPl~% zB2mTKcN5BMdoE)?Hqpd zm#GFQ;w}NV&b@4I^G5JNlq$nF-J-6wD4j(r?A7$d|n+R^Z*B@q4u= z#qs6C{gL0FIx??W#7KUkqel9WVh#OYsm2#FRJi8RZfdkJy;4>uQVLn5bVy zM1Nv8?+rjABmD9C@W$h4RV&U(6?{H5+DQDNjF@ICAZt&6+{BJ{j708@*@j;({jbXF zfkUD5k3r-48Jr+;*-LB3gb8F^&im%1o_V%=D} zshedySZB;VRmh9kzPVUSMYdia%~R=0;&sU%FVf-SQ-2RGY~r*lf@sO2g#*K5s?uVq z@oKj|r8r=F2k10&prUQ(z1~iE?zk+q_knA!bm}!7ItujE3{ylj$lHo6zwnj@Peb z)QEret2AF1^$rm{)*sDR6Y$Y08-0=TU+hTW^^e0d4fC8%apk@N?XYk!3j;ZUkh1uIXymYFsfz zC542`);g?S@0XC`*dRd;Kfwq_0w?3NW-t4c5K#OSigxFKj6@c{TbleQ-Cuyn3SKi} ziDzD{+*uwjDvP+7>b^Y$qHeZ`m?~v;k@kC=ty>CzjYiLt>n-kipm{0vRnUqw z0YRX4Iit7O`rE9lYeW%aW3@q@v&B;ck%@b8HRm)RS(^|`A-Qi`$gcC=+^lyimN?U_$P86j#J?a3puz%;C5a<8C!ogy z`G!fak}Wr*PG;W%pP^EODAa#XJa5vcJ@5BOvgt-4#)hRsgQ!R>CrVVG2^Kvo} z0U#)7IU)CgnKl=e+(Y#x29@jy2c{JstS5joKeyrxjM%`nMblo5bJEPcJ72;mc&a(p 
zLzimZ$3>Yq(fNU7rFJbPcC1JTDR%k$ZB~Llqkz|G93adnW{8UaktFaOBj(@hEQGiSCPipoBk*PIX~wF+G!_*;;=xvkYf% zd)yw3SqBwn+;*smMCu%k7mU0hA_BFSW0(g)Y?5SCU3MMvl2lfg{Iy=xp@%rJ5MbG_ z0$#>B{<%MJi_yY8C~=G)SWJ}U@X!Xzac@O;DF_y&e+#X&6=Ph52`r&9C@VMZh1J+Z zr3L!x$`waKCSz&)$P)1jOJv*wz<5!!y)|+bTMPR&gL*HF>OBpUgGPxr7M8x)pt2EgAMIXd^SyQ-OtY=_)|mO;AoZuoSG0zl`*i-G$J>PO$R@mvcs#Qlg%!WH z%x)S=6U_n&c2uuZonb{v1`d7xVFa;l?F#-UZiftpU-jGNvV%`eAN_~MqCgp8!BtYH zS*YA8Bz)-__?LyEKKWH*Aar!|67#v{9>ZT$6^3*GRiV~>CNcc)uThD?jji2_hZ*); zb>kl`xJdq~8b(y0aLIjbt-jx>`^^eNZn?VvjoAxC-aGAChWwb0zhaB9I*8jZYVAG! z|9;gP@)ijLpXu&B!;I}l$wZ#V{{@>AIP~ZTXMfo;MIXwH*IuLYH(NcGw2Fp;ZL+^2 z4NW{C(V4$ZjQ(CZrY{78f8QTSgMvd%K3Dr(Fy?>tC>nD?S*?d0nMEz$JDcevbV$Nq zRxK->V~YVl;5U;RUP%TJSo;9VnzX$`If$V>nF2_}svvlg`6Kc9y~u8DGt!iJ+=U7+c_1JALenF9cI1S$GmaVmzB>Z6twXMo_Sy+?FaAQP^NeX zeHlSW2?U3@VFe)p=Ay01iX7+NnMqtG`fXs}5b)a{#*#jKmO?lVB4>izzkZl`g#554N(p$!q!pzrip?*6FU=;;z#he&+J`Wm`$oAvu~oencCy5pH$ zmE|av-(5CT@ezT{3QsUnsvc18rI-VjqJ;FH$bU(klF{^($8raP1N2Odae2}_waOwF zG`X^t2f%e=>!W6JjhmtNcSz;1h-uN_4r{yO3i}Fc;55#D=<6nQzVLP#uQ!J36^Ome z;HVE^ZDjTU5K7Zbcvj=Z=uYgD)ghQIfY3DA8)V4<0HeDnuE4~hwoq%y|FBVCIYEnV zk%=0Jh_MuPMG+S0Y5)}C2EVV|?$U3Zfc9W>jkO#t3IP(|r!)Bb5Y7yN8yo82m;kYw zo%;YkNfv5P4j6`qGH3+jT(3F^4B~*b(@I7dt2B7LF`Q*HUEM<;W-1;B*GYaRf{VCw zZ%kxNI9KyXL!aGhlKS5-9(=kykMxib_BYHa^5ENbzn1Cth0>+abIsVl8h;n02ZZ9auP9x4T2~rjxkrB6ZE(?_Sz=D-SK?&NVXx zw2JcBB%S?%mCVmAK%lZ{b*!Fs)d`;n+xWKEa3nvJKqBKJr0MI4ENtX^weu}YAyv?| zx#cxdxD#L@!^8f5RskL8sK+^}zKr9#POr4L%uN3Q#Y@PS>f~#s``Z2&N*O*?lUiT5 zy+>Qw$Vr6(N#v8-9uTE4^j8H@y5;$HO!|~4^Ia2I^ptm(bO{eZ;(QudyO-`jiPtge zInoVYDVF=hX6DoH6oAAdnFc(U>c`q~y!}^PGGF8Cs-Jk0Q;?2($j@@W%Q6AY3ONdR zGVjHf{?iJPH?@HdEr23mv_K;XG7H>VqGyJ}`^qyn;^e$b_-)}@A|Nc)9~#TMQVq^O z&-{A$<>ni*PfCSf#H83Cj0aLbRsaYdEq#v_yragZlQA7xReW|}b(z3~ac>6LWh1I$ zTke25dP|PB`=2`vLhw69DG+}@L2la6Si4ExX0WL76k>&KF$sAbGh05BUAI)hi=2i% z)qJuGjObZLzW($SrvU-3fhtLIAJxwwk=Asnkm8?ez-pb=p2Tq2eg^8W4se5M0i;DG zu_O}=;+I|mo;{?+ywyv2FPFb5omBgG#VGJl=C_EKAXDRkU{pXO@thS!{4-qk`s<%A 
zH5}JK$}WF1NTmf_c3t0#m5$xUJz1rH&0A_Oc-(n9--@fKCpx19FQp8h zOOjP0vcgfgGuTg>CQM!cqg`j&rylcsI9&{hukG{xy%gttGOFR*Y>1W0<77s0;^#If zuj>UwEX0+J0q$FX#!Rh1jWz0`$FSi8Jo%b&ruHPW(Dz@A!;MIm9Zy%sfEa3JZaz{NoBFu5~l;5@0z1DCd+UNliTzkq&>P2u!PF%?#O0l(rVv3V}I*s-w-` z#SJvL0u?-qJz$xfC~{35fp(&Pxms;y>~MkDok^$qN8}(^2;+|HH2_8u@@E1oaVpm2 zHf}pk4N9Fr&bDSMEg7@#i<+mikBT=i|D}1=A|CG~>$dvIT$|+ZF4T=2I!r~nIg9~F zO&T)vJc|Ls^n^Ab8VmWBd>Odk!j#aF=HUXRuR&+s)KTXQqXZ+vE7jZFgTeAb>~7^S z|JyOiVmq+2^aJZSaT4B-z=#1~6X;6|Q^6p@(TMW)84+r={Lv|lMbC0$e@L3S#r0ta z5k!KxSJ&5}_Xx~F{B8tFE{uEy8gEWND6jR>*Bc|fR4i1SFTGOCdDfLL=+GN%xfby; zPM|Br4$vdkfi2N@c?{CYtLD*r!Y~WP5gD6}^mX|Z{>jsHalU}SLKd>!U)Ay+>U|ZY zTXm-})|A7C@|Yt5Y?O*A$y6VV?Sl_>q7QWyZ-JLEC5yKFexUi4ilEyra+oEU#Hlr`9nmmyHpRQ8oJRdQB*U zT~0jSX41BP=eps-P4&~alZ`<-x?D6S)ELkkSEdc$j<~uh6N7I@YSm&53)yW(-QzJW$vJeMAzaztg3I3=4{>Ex)e-RkcYL4R@QPOmo zFEKxw7k5zp{*)MYpd_aHW;$UzD81QIOU|{7M~f`Z0y!|4GUMdMxGptv{(FqL5zxdX zDm=$EOolKI=O}RbnL|;ZhEa^QfB}^4rNdToLKkik66dKZ;n2<|ofO8z2e(s}WvA1* z>V{tMaHb?z?pP192t|ISZMWy}z!oH?0MkQk`i|)x{6)w~#3QW`sbfWqNQA+l$$S6V zz`cic_?W;M^QjQ7c_X*_nHXVR0Om`-7yn41e{JTfWlapI6F(eYc|Ay&9?U#?@`++JU_`Ff9OnTN{PV|cNPO$ySt z)THivfQh1&2=@MGR&0g&$Yen+hNYa;$YX#FS>v12Tl044IOkU&j*Bjog62y54%7eP z$9gmz&A`#-@i?6Xe325&4>~zQI>KdZDPHp0-)xy{?u^jMIIR;KM^GcH|7ii<7v4ip`6`se6>`;%zl0BD`-j^)@>{su7~ z7mNuYLNGm>vJH5@%!d+=Cm4cSDB^dh$2^sO3HyxLzrvWOR-1XD$5(@n1dl!zZ_a3+ zQPkG>NCHnK`5+*kK;|&Xu8E=4RUc5O48bnW6{5j3tXIUn^tk5qRf&!fN5-8IAGb=H z=kg_pHp8EW{+Oz{+$<>a9@iJo>I{DpSO1f9dg4sc$GwQzzQBLQ9dIVDKDIcu%4T3^ za^74e8b_8fOe*T8JBx)%X4;5=sdR+=e3YNd<@2h_57+MY+cB68v4)&ChB{P)l$!NgNt5-h0unG82$kabTsBP-u)ave zGlcs_u&K6|waDfBYYAb!LkTpLM1JtzP)S7TPM)s>Cb-2l^?^c3Uf=h^l&>T#CTIBF z2l8Ud&j*0T*nCVbQ$zbA@uu?yG)(mBR|=TP)3j&!61>hP*a;QMRn04}zCDw9?dM?d zPRUu%P$B-w-jr2IRmfl~XD1pVz8rxS`Ne3tMLsCpJ?l6#zxVm4^!>gDs1EH_C}v=lV%?@#OFqQ6anxV;mO~EkYK-$^s=BWQ9t;!z@3iHma zUEYsF;Re7pb4d7iWYS==L0az;W>;$OtcT$JIjN{Q)h75fdbw3~e! 
z1tRLIoYH@YiUj8h0{Rk zqvAJw!B&O{w1nyw$d~UW@?zD_p|>Kp zrD5{y)lFYNl!}Uly292onyyi-$R~>%pd7-Va$3;^lJ)JjG?(6MGo>FE&OZ!O@E05q zMBN3O@ZTdY4$hke%TzNb3jZnQ==$K-pGbHg!xv>yfSzD(JzZujIp|`^oc|0GbKk}W zhScxJiwo17YfxQs!FEFF-e*Zh)6(r9siKwhRoXPr`Uavbmgs?bj62oa1!-Fg3L3?% zSs30S&RzXiR;0t!UMF3k1@NiUPE_xENnGi|(S4Z#I`B3zOl63n?)MbDesRhE{Cf&s z_G`sOp|HwzzCfwgZ#O{mX7^KPYJHa#5XMN)9zhOmaD=4lJbTA_3?#=bd8Z^(aNa3k zBm)2Y4w%L&o&UhJMO7b51uB{f6!t&Gsu-({W}lR_>B2QAP$ z@2=RVR0vGQ=&<4|1(j7k${?gB4Wtnn*_(`WsVbZr5<>3ARX~*`k8qDL^?f9Cz0cp# zq4S$A5i=lZBxWnMw|TWlJN(89#o}NY-GtVy3oxA0vt$6N3j3&!XeU+2Z4X;3x zRcV9O&>MQ0GgYh-RAv&TqmS}2@qR3V^{F#wfJ9)>?+6Nw-S}Ktqp1Y~?H0+xKK0*_OqvvQ#C)7;MYOC)ufAI1q@!*?u?17BhCX9B z)MoNCnSiI%(dy5g;YKevzBO1A@7foqWuR8$6-sPn+_m-kbW;W=zQ==EFvTv#4Fqno zGBwH@b8ej#YE@V>>oM`+#(~U+4!2&0SEoIu&kA|@I1bSGkW2c$FFa1xKGvG9v1QH} zRHVimH0WU_qXJjk6+^{(e3O8k*%aJvR5wIlhFn=8)R(o%qG)E8=@RIxwp>Oy_oJej z68Cx3=5Ds$iaU_^vFCaDOK1lT6}GMx4iG*y{~7hV8P&?kh@BXm`xY*D>9`;01$zbFCa_9seTDjgJQL z#?9m*VILY=uXE3OpOzdCw{Q{Owp<{3I1A_>k-qf$C6xI-jZBZMixpw8!=<-WPmHV! z`(s1dUWT0xv>9AWFYL^!6Pk~ znwUH8P34M%+V5$gDF{Yb-x z035ng0w*40cf*02MdaVlC`};^J-jnmdkzjb5Q-UJ2bjw#MbW&Z!cFpUK0n>WkIE-+ z*)Z_A=G?z>0sF1u*ix9o>a9x3@KoAE^#_lXVxKV%D4t%O?b%tm8Lz^P(Yf{t>DwC4 z+BVdPe)lLe-8Px2alp?bTQf3sTM9^^*?eGh_5>E#j_rUbEwUA~wDRyUjp z6qh7}ge7{lGo>$C!yh81G0)T*qmsxYanho)fj9*TxhQc&2 z)QGD2^Cxut$b8R@l&q?B!50HOdVd2YQ`i+}fh1>}%;1!z9>rv{gL8{EKnhzk!wSM> zInq6b=Su2-m&?*>5*(^+t*dWak>G#36{5S==6=_vj5hBm54Qa2mD&# zkwOF?UUl71&PlNZEKDtKGE&+#p1_zxgQ^DAj82~p`#V>EPhzb34P$7R5La@`2rJSm zC~F|1S~c#e1U7KLUbRi3OZ!oYUkaVdLR#r{>zh+_63$AgvxpW_xb9g+;CnpIUotf%PoGM;Wc? 
zM-msvBa7EE&x?5+wtSZ~5r#uw77)noG+iI|+2rw{KUs^J;vB161oYf6#kuSeYG1SC zij2@x*vEEdd&DAxUAo_oP)wphfAX-+eDt3T`U0}l6~%S;{UfzxFCu|1X4Ps%BMqhp z8BKwRD=W3zRB3lCRO9K-=9>4hioq8Pj*4FRnSOh~J+}40 z{G8Xg0Q?66wekU{yw2+jYZmiOjXO(J-Y`9~ ztAr;n^x36yUSnlf=?a}gk~*^!a=xFC%POxjCfX)w;$HsT3U4?{qX>bV&+7(mg9+YA zSwk?%n9*B7!L}=cL>i|kdEnD_EbEgiM=yY5${L`YB_rY$sfyI@ez%Ej51bcT5ofFO z$3VK6^GJM6d>V%LR6Id+y4URj$bzvp-)7apmnx$8Jvh1(7d{t{vH)q-C&j82L9-4R z`sN$GFlLg5%F(7`0^8g1I^86SO@11EU(I?KJr~ep_(0EO5^3(^TPC4ex^Zz+IML8d zMCbSXh1M^T`*E`WX0XyspbOIxv?lu^{#5t7ZiZqgoqrw1t_|%VP7S0Pb92i{cEyGg zJe510aT;ignK%JMRnk#p0@gygxm|r2S%T_U*!gjxQWW=-FF7b21%GEt1L(1TS;RxO zumayLQYPICggg+uLS^7`)RP<=x>N=*mf?)`OG_g! zN&WoxYELH(A8Hf8Rt{!8oKM0}#3SIc#a~aE0?x5=FR>Clq@SH#AIXb-mHwE|)w(QS z-nq`W4Anea3aq&!g-$?1LJ_^bM>ym#xWS?qCOub8T&XcWDiYpIXZ}GQpr5*6roU0RD{tDv4yvV-} zzj*gn^8dSPMbal1ur**)%K~+51&FRN$Cci!0@gx0S>zVKQUFc(&^L$6Jq&wz5F{5c zJEC1}Z86Lut4!IX@Q_d)J8&{K0k*bH)Tg^8ozZM8CE4e6I!_90lRLtSV33DIG;rbU zxaNhSvK>5c5wI5X9p)Eq2UiT0Z39xkG|FS3d)^NsCull6aHN-WPs|J#eP`D~!$HdG zj~d?p<(*}WK6GzBllp^6fKsC$H3}c(L-yh00Fl=?KyAv9h7;^B#clg7M;Q=ydmcTE-aL6uYa61zXEUt@3NP|l_R!)!TU5)8>mn+4kx z_Ih}76-<+!OP<`m!iM1E00)Z)?rr@&ibTsn?oCo%;B32IH=9`&jA2DCc& zT1gl?(+n8)7Eunj8!P+-zU;t|XIftR1}yg9^6nNAJ$ewrJ7D?L@>y{9QBOY&HIl} zC(X$fas=yCW>zr+fNbnpM^?rKzOf~B`uWU`CYagcgX>vIKh4w1AHaPQMoJ^Y+*xZ8 z)fwevQ9YCdOL+w=DX z`fqM8!57b@(oGjH0^r+GJ8&-}8Sqn_$RESg73zY~~so`CU>Q13Hfq4^?* zkHAAw-cGA2?HVu+NQge3{0`u6`&ZfGU))#ZhlD`x0CXoFetFbERP+Um61}JfOFNG1 zx%FIM91*>*cW)<{(p5}+UYt1S?+}E?LV{}!H{f>%Y?y=wKQS*l{1T(qt8c(I(Q>)N zk3x3B69-?O3CxY#e#|^TyVNy96D@r|bQ~kN#S$mX#x$TAgv#c+Jt4X!D*-w2!Gj7U zJvRL7k?wAhE~UG>Vbl4}Wt@L~yW`$Z_Z{PmGt|BI+-uD>=P#envKEcFeMkLOV-ebu7}JFp=~lA%ck= z1@E6U_eD%2h`NpZZ6H1XXhLBt>q?w85@q)mKH(PgIJWG)&CMIP01%;Z3!Dfw0E!H4 zgem0Z?k|R>T!z*5qxdVEkNT$nP(U9bY9y4HzZfI?)Ac%uZUoKTp7}ro8L%@DtN!`f z2nZ8=r_oU5&xL3Vo_h;zT@d_j)E+R`a(iCY{0$;lA(9O1^A?}K@CpND(O_aaey4*_ z&KZ|*e;b4L&;EaW0S2{Dmre4=KkQFn6`ms~mWl=5(>LWJ1MByv376(yJO>4hUT2Aq z$_fu|2MO)I6tiptd(hjC-D`o{zls1tI!IuJ#MAW*WJretn8IRR`@=ssOYV6Gx1G)W 
z9KfN&0JKI0SWMNFXW?NmaA^mv&ue@kJ*a!^c6o-Bzl#sm6Y%330XiTcflekEbUoXu zVkRXu%gw@GRR~rwN!6A@bOg`I$DY?z+}2$CxUVE45>?)RKdCVQ|-c*@#X! zpa3WXXMw}1UCdm6b+*9@fZPn*#a+zd!d^G2bBv<^%@%|LPz76$()V1cKu6mG&>~NP zMG8<_haxq=?_*F%Q1eS>Pt^vgtZXio2+s_lz~1MvHJ;>yLMf1tJ(mm2h31EacrSlmh)muXmkvnJ3sj5EWHD73>}d#nO>n z^@oKiR{&ZH&^XTOpvf}h1uC{Z{qs$?)?(RBC@~3{OE)kH7(-J-1jchG)NJ+vN}w1H z!32a(K~Qv=rrY4=rtW@yS#hxCN?{fLR|{Z5Rqw9J;MXvSt8n$S;>PQPVNJ`jS_lND z1#mL}_*bAdZZTDj-b--}HHkhK2l|RcEG>faPszphy@?CgSb&lOeCCleucImx@4|!X zqj=B=UTC z5JGw7N7U8jxn^t={$%J1;HD6&4Fd3oh~H2v0AkAK*Bxn;Ab8%+<>YEr`&Nx8iRG9Q zl<0PrHH5k`0Q$|w3mA_IIYJ(A_Ug1JkvIx_Eh!|`8rJTH2Jg!qJq+P9Sec*VR6)NE z6>3MD1ORD{@EZ+w`c@8f^a-G4FQ2WHWiTUh!EDH#3LrU}PfpBREm_}C&T$e;+E`14eKuuX#hjXZ`m(9ByR0v5KLy$M;9%kAM?kgs?mi&cv-ItUl>=j zcEjEy-D<)&X&yus0En`P7;7KMJVYj zxT|_K>xFGbPe$qp!ei{F;`=zX`i`?Pg6{PYQ(F#eaXx^hoNnQ>$}Q?n^awsasyfil zeg_;<>@x^->o8Vs%#acP{J!^mpf8%5f}SuL$RdaQj*{Zl%5QAhNRVL&CGO1<;M2O~ zTRJ8^yG~|r0DEp9fH*q90({${udH3~QbpGzM^wa)Mx`RSm2xHg0D~T{lRx(x)TO5T zHtXpt@c71!%(&<1(3;n_E^oO(j&Fc=;s)CC8MX__IF}|v*^##EgD-cjK$bM|*&MkB zYAnqPj#OG3Dt?$FL_;CHblvZuk-jgm{Mm4m8gsn{G-;MVYobCOjmsTau?hYmK!0+s z@_^h=iM2494>p+v2GobPWO=&#?$NZX7Y{8J4hIql4Sge+wKeoSr$J7ET%w$k_M+wl z)ff)8jyX%kF0W^Fl%u+8@-vh#0H|?kKU)(R4axw_Wzl(ZtWN?^8boAv4X{`w!cZG5 zY2s}^MK0wV1`wv7xO?sI#olyZ0IiqUrBdhPZ6t4j6Qm(f)_8W2*km~mER8& zQsh@=$K@YaOdk}8)^W;G&C_@)S%8s|skmh}b1A>n)ekY_>?O$Z`N|BOdFr}*bB4xy zH*N%x$9__CZJWE(^l^MDe#LrepPd(;y zrC0+T=*IZ+8QDee1JLImh;}*>L~QUqn;%dldn?FfzdI$kS=fAGvwf@^`OXrs-AUaX zFz>{p7&d4c<6WU9$sB9S%Q?k8=iPI@TOdeF_xi;nR>UC|mT5qxP6Fx)A({O&x$PWIdgNeeX?}h~PCm@fz(S<8kCf2k5|B4kOfl zX+s@|2B5WB=C(=Db}zTj15m1R?;)h(%cF@2g4e=v>Gugx+p%~z^)xRaRf8mARn81C zgKSGVJ@eZ)?>(PoqiJfS`}p{_sjVB`s@CJ%j9rD#cs;$@Y4fq-d{Np^G`eQ`X6Yr1 zI$uod=^ivq7GOjXpHP3xPL(iC7Piy6EDT(VRx_L%|2T#u{4<*1O5ZBVlNVj`Hsh5Z zc8)!t=MqLfqRc+L+kK|e_WmfxFpuR>U+kHBKABW=rMlpDTiC>=;T-CZcb=sf6V)>; zbu8UX$SN}3#N5DeSkn(8Rp4AK&w4o$TAepK+?$bi6*g`-&%bdZh!{<%Y}Iqv8sD~w zPjlkxQ$}+2;D*Wc=+f)VOZH_V`HWQ3H7p;V)pM+|! 
zl8BX;^RzZoXK+?x3{ff0Q)M$aVlne&e-nP7mvDglI4>K(VlpBq)G(11KYI-{K7M-Z zJyV8=VK!OD1b34)<$GT`zR!)Bgh=ihe5HET1sW1Pf&|zoyODP9KzklN!cuC*>-|ml ziHk$wFQSEPwE2$W?*yD_Zwz$r;T!;62sI*PQJYTY3=TsLo($)Y9{-5d5fYPHc z5@w1_c8xvC5uTfej6o!9))Q4jdlhy4o01L9=d(aJ5E5s8$6hXKr5qJi zwba*qvC?w6+js-J9i&a4jWGD|v1##yn$ zN$~E0O^I9KFuUMbN8`oL9&f}d%?T?yOa0)!**0y{TDMcf2Zh}`m``wbtyx%p{(#^o zlk$_8GKE6KH!4GBD{in{$5lpqbBQwEiqCu1&}>!GbWKnhkBs8~REsOF^5SGGfLg9{ zZrlchz=_EbNe<7aR8 zNeC)=^{&u=oA%S?DKc0g{1w(j4b)9hup_AOsD$?jn4=vj;he z?Oq$RT5?~EuR(5mxumh1w0fohR7!FPo9u92Ws;+7d?9hDuv^oP`8K}Vk6OW8!0Su+ zpwPiXY9w;4{B&ZgH`5Wb^e1jo(C`<#7?B=PI@!4ez!Ll& z|3N2()*ugN>mkjspTO3dgLxy3a#Yl6@(0W=;h$%)$*NM&To5s{#hAV))E}}G(#G=u zPf^@xx+|~fJ@LF|R_t~n?+JDDin^EO7u`6Gfr?H{GSdUPsKuQJ>#CR^Gl%a91_+Ql zgUZUjQu0)GuG-}R+lI^)>zrY(WQ*fj<6E*5q-L5?Khe9K4bL33`pqfLrAo0*Y}sOhGitpQ*x`?UkNiu)|Q z7DS60uzcz4(G@yQDO9~=7MyQ`ROP=x{X*!o$EP#biq*-l7!cXe7lK3Hq3S<9X}CTY zEU|4&@id9}NF30f^>Gj+u&WVv*kP5pTMq~}U!mplseMDa^c+Hd{qQZvMfa1}q>8QK z>bk7salNbu$$Rbt_de-9$#~m5?_#+h@Wv15bfqOeKjCOrMr^o&f^IQFdtGZl#PO#3 z>$|E*YJ!1S+(d*DvG+(bCR)vm_=FA=j#FFB{Af`+Y0bGYFOUaXa8Z$#zNG|VtC+X5 zd?jP^->*sjz7ZmDpcWVes<{)1Tj33pkBJxDz6Fw%M*j|TfKy?Ag8~Q}$=7Bk4 z^Oq{LefN05#PXy$hJI7hm#TW6#CF2l22W>Lt2FfI9v+oi#uq$!YAErL^wyK9AN&Xg z>s0UI$gYf%<5ypoWRv#;zl!Zzw0#QwpJW(E^>&NVW6a7Mm)s0jtA(*L{j$Pb&%&k z5((LJfn#6#pA_sOp)V46HkZ|~z02v^WcrI-`a|bPK`s`BwJI5%U@D3yeMVk4u0L(* z?lSum-TmD!>N4aE5k`zQ-mmow35?Z{5$i!7dN*phki;8rR66`Z$&oeVHy!Ms|F?@E z`FV5Q=Dz=?Ju>M(trdVV6yy&N)Qz=UaU^-83S$g%`zrV z@6|D*c)KVFP0DQ85fx0rtDy8m&63_xUYtB@2D|0U;J(^D^wqQ8 zgnS+$oZm}|;vp0;&a9*=+2ehzUAn))07{!GP>qL9@g~DNAc{CL^EGU5zXEw%0q;OLtQ{m?Oc0mHRekncBefnByC4;)rI^P5+E)-$ z>IvEl9T@Z1zM3tcQCR~F+6WCAIVCS7k%128n$3Cpit&7%y73H~92Ct#CnmWoAQ)839HcZW%{_2EG|a6$73Lg`=<) zc$JtLAL1mRf3_zrfoHB#0*N1xR*Nl0 zM>z!Vox^>W!+gB53*FMrsWjac=*6M9^%OFrqBM1qLl{vWbC+Yo~VAHu=1A} z3;?+JSIDqAs8{~4>W(-PbZ`1~-sC3)$OLL?R!@dbA?Tt_lVG~!O3Q49dkuRbJCOI# zgaT>Bnq+c_+jG^g1|&C1*M=dXw_8N)?f3@KoR$1QmVuEh<$X>=2_$z1n1p7EIYYkV z85RqC5oT&B6!|Qppb1`I_WUv2KgMItU&;6G{ZUqeV0%7>Kk`rC{E 
zOvERL%J%kEvz%x$U-XMv%sK!@9mjtwwbVMeE)hxZck>b6iH@ldi>JFS-&ifIL}b8+ zQYZPDH{VN0RYcL>VN~CjZv~Ir1DhkmKK1q(;~DE*gQe)FnU^l83d;8Tnq7AkMaZ)R zeJiy~PF9{O8#caMz-hevKv+xk`H>904nztCE`k%VZ?8>57vQg|=Q4}xb5#=*d+vC4 zCh;}1O26a>z=CoE6j=ayXnXVDcW^^1NgkZt#*d%crM=4~e@N1wO;93(8 z;`j9Xy{cHkUMa)K+IRea=B9EOo3Ws$`5_?PV%DmBEL~OD5CekL^}T6-(E(h5wS zcU_ek8RW|r@eySvdV}VmUV(MS9l(nEw;?*&swYZ}WHGnd4ZeTeJx*Y{_0$tohkqbF z7XX@!$rdyZ#Gpb(6+}n02;>30%MiB6!bLV|F+bu8TRALul9#FpKbJtMyKCBche?fr zRDO>RLbLuCgHEKQv-G!pGlBS$(n)Muw{Q5oc0^F)2ZrDX&*Pid62s#qd(dQNb-!|H zDz?MF^HBuh73CgsR4P)HYSpYW?eA} z#=iUsNL}$~s#q7ure5#Ow`k>zc+S}~AUTe`H{OC;E5IyDfP#F5L^Iy+8=9j4y#dEo z%1EYUFu&DG-#+!RlTg2#>=)*~uuD^d_0G6nq^5@o2_xIy4SX*HC;k;62#EtkJJwnw zRh7DJy(Fs~!LNDpyOZ=jf|f4h;jgs^*%Hrgyre#x^YVYE@n3WC zYmNLrULc)s>e0=FBhVb_<=)MPGE@8Mdr!u1DW5y;8^Gn0Pz2nkMN^6%}a6t zPY6%}ltt!gBxctEQHuhKEh3m=Cky5QCmlKQRdlRSG-cw49ehf`51aon$_untWrug7 zR#Mslcef5y*zZzHt`0z*lzR(zDr_1(5LJ4CEQi0A3J~|H#TLAH{H7jaN&$+ne)mQt zPS}oObu79mpWv%TXY!b11g)RtyS&{qkEd_cqO!VR8y`lPQHN^TsFo`jH26#F0iV{N zDND8nEJsmS`*2_%A3RVhflqxMShhvx;QMT%z>_L3x^&CYBGxO>;^X3l3VlT%mF$qK z2o3sYx9_6AI&40Ud1_kAtX=I0z5oZ3YRh)3KbkZ_)KQ_|d{Z{^6|ehl6RIGtXmTmeTFDQbS=|N6_fK@03o9T%x5iV|2aW|uAs%Nfeo|T0(wOITCTj?m8t2F z@!|$gQ0IECLb|^L2*RB} zQ`svW2KqPrYIfFFKo@Qwgdv=~g23qRfX-prQA!@qxIL*Isr7eU`zv764~GkMEmyqU zG3W=>*i2M^;!- zMx`4d9xi!upWQh07_b4C=-L_#t9~i>jtL_~t+*GbGHjy!Swo9AJ#$qOlF6Ux>(n|H z(N*%Kk>YoOxSX3xRtgKp zi`xJ;80_u+if9B-n#!xNUJy0i095rYefp>PBTZhYnkuxtazB0wmfk)c+8OJIxP;SC z{dNl|4j2?>e|`<7C2{x~G!z{Howqj}^>EsL!aU*Zxigry8sGXQLfR=ZmtD`j7RmP@ zyKF^UO4MGr2)FO?A@olazo{L&!+dOJycmc1ERMJ&G^6;e-<)tgPj+6<%H*T1D@W#Qpg9fIK9LzpSZxEHC z1TW1QDCo@tjHf_6A5aC=Io_U{N;Z$*zqLe&)Wsap3z2?@CS=yh4oW~vU>dZMa}(uC zDR!TVHc%AeH1}gsllj}Z`6)l@7e;LD^1j#ykpxsl>s z=+aavpcWyl^rU=yP02Wb=n}LW&;+u7yaY+Dp}jU!jRL^eT%Q57-R6#=5g;Fz7%JBT zo{s%q0Y#^saEfs4#Q+c)I5<{riiQ$ML3>`~ihvhe{fxTxTDH~Yl9c(((KCm3z9ja$ z1A!#GD*5$iZ{-=Ph=Fj(a`UF{V7%Jvn*)n8s$KP~m}rQ2%Dr&iUYlCmH3bNteQA^5 z!&L;<$LUHFP&9FyawUh5lkF-<>X%5ony5jL;v>1|SDe`aTP0C?sx2jkuR{8RS*`mZ 
z3-18-Aon$tMFTN0(ZOmuxho$m5_+E7K`}oN^jZCdNqCsFAaWQWgH;}EPifj#SLS>x zFY4?Lr?pBNr~(5(y3FK6Oqzp8WR=y2`5G5_+HdosKmx3;5%nx3fVueHwNAcMc7vQR(aA(+1X;8W1oVgoC~!Iw7&DC7DZxbCvMB5O!q$I&v>4yBR*5_ zYEGtDF{c}3vVKaf^5ttqlF0R5rq8V;_>10sY%ZBku*AJMk{i&bIe=Es zMl^2-$3t^DcK%;Qykb8D6`a!@%?Y#hyXSvXA9X1N38`U>i;!f-hn2&i&HM`E!BIT9 zRg{Jh*jvf9+3B2p+$Z;;Mk|3WCI&}@?22SOO)_bc3-a!>#jParZ1!;+eA>Q`H-p>z zdo6h!NzP>`#A*iD@dtghkK8e7lPDBy2$xOLsBRXMx#S~k^F)@*!%Ilij!r#BL`b2GXu(@CYjIA=?eV z1P}iVNiJq&-Dm=TxGCj#Cid<#LaO=7?rh1%AFKLL<|>5b2MM&?qsfqXf*60>LXtkc zl8E3Bwy6&JaV&GrLh!4dA4NvcH-R9TQbOtz8TBcNAbfLyT2)eAt3})N(FK9RAd~NT!R+>{ngZvH>D&^ov}|%A647i?&ALudif>|@*1lUY|UTU z1o#DEE2u!q%=ATp zcFl0q?6bPn8|$9|q(ql$JmAV8eiaC8sZD5>8qXaU0mAt-Fk)|A+%6AQ1$B$<%d2f* zzXW(vq?6urxVx6tG!Fk(Bo7s`fW)1Ux3-}&8>#z+wIZV(I-Mf(q5a-X&>hVK@=&G# z=Ij8_6EL~rB5x!NNQ+;3>;f?PrPM`8bN3)I1h0l;pp&p@(gOUGW#^M#i1(Ng=nlY< zQK)Ol-&5)#+Pz#TjnwB^;jr7CK*5P|2GWDYC{XnI$&p)g0RkTwRNSJgNfiq@f|jT-iTMPyj~MT>7y}LIF&d8pk|Sw+6U|0jpwoZ zx4iN8K@b%~Sq+1~;q^HLl5;aUw@u_AV)6z7mwen7lpM8N2@^i3_B6Z@`;?J>ysO%fUopbIPHC!1<@aC*Ab9$$=d=l z4;8gv$y~KS*cFM+k@d;!nzGm(EI@$v>$HK;eo{~!V2!%bLw{|lXZ#L%Qk1IXAqc!i z?E#6lB9FeYeCW{6yavQnr){8r9Sa;>YcS?nYSgdO9YpS0a)aLb{s~+2%($P%dy!1> zn&!N6!?xhN$Q+82#gOl11!OXH06$Rz&i{viiwxi13kblNNNDcW=v!^Q&T_TU;;^mLZa>cpcD@HvzCyQ^I|Q$Hyns+yaeDi1&mBxaVXtM1ECWGwC#b4Y@;sgBjb@HN-Y5e8NK+Fk_xZEn zuV6A)3ITQnC3M2UAW+9=Y*BOC07&#A)?J!W0<|c484T!V(L&Z2^;}LidKFIt=fB5M+P8qUkmiTT{{GYXckPW z<@}ZjN}h81c1>#euF_L_OFN1tJSedDt(76* zfds3$dFPRV2-S0(5Tre39Ntub((j<{@1&J^Fu?x^#ZD#K{Qwdo|B)V%DA~%d5GP6* zg+3qqJ$%+-bjo+HpYT|t(miX82ZVw1ciK%g7%b?z#gOQzKpw;^4%C=jP0g%x=z&lh zfKpENVPO{kB1B}Xn%-x=jIcD#+I=jovD zWT6=|KfsjP!_o@HS$P-@i9%0~rh#y0KdwYf06HkUqnElMY z_b`^^k*5-#&@DvdZxaBBxRoBM_;xl5VZM>RRQ9;m>* z>W$+@;ZPnCs)&B;ShT!<6OXc77}V{Tpx|zwrDl^=H|)n3#&XH_%@CaeW%jttM~Lc@-bFW zrzLF2|Lx_64UF**qnJ2lW)dJv*z$73kzucH*jeufIzkPW+dSoH3Lo!b(&Aar5Dm>R zw-MnT;1%85-N$8qEcXbGiA0?}G$NB~7D%x;q^f(DmKKOB1y@az>wE|JSO!NwBpnF3 zSGEGF#T-;Sy{~jeT3UUJX)kbsT12Ns6tSX)GEjM`cJX+a_J$$qw&3+e$!j{}-2Dn% 
zL^A7{Ie{ubl5(kduVy$u{+?<3>7-#hHWmao({8QvzZTI8eFNOt%Db2k*untd8N}96rn~^g!yok6(zLi)8fiYm z+_0Kzn0*9QLsL^*axe6zX}W5bLQA=t4ao{zb|SdUqDFalhfFz`u|QyOd{DVmEb?v* zk?G|LuzbVK?AFRwd{R%vri7c`IjYHEofa(;N98svI9OFdw70ScrWL5vVdVzeu)#wB zWF6HwoPF{3-3B0+Q#4cSpzd;dNNy z@!EYBB}Ug*eeeX}HhSnAo^EVjCYHUiw*W}By{GGE)nuKAeqET$XVsISE6S5esNV`Z z{S1fr;_uGNd?%@+weEyGXsZ$IU| zXWIM`sZIBjy{hRmn!8=~E9+k3}c)|4?-R*XacXhLMFKwboD?o3| zjnAG>$JXZIc5F#64@3PjW}B5-xi)>Dg59p6-kY@Ou;mNdXxN#xXIfUlV>BTIB%H7B zJp_>pdmjol*y$w7FW}$Vjka^o8r0l39K+p@cI1^VPlIo=TsOsu0*Kf^uG~cn_xA}; zdfSA=SvJA^TBNETRDgGVs0 z-o?1C%kHGt zf<5vjvGu;g!Z{v;-lN`$2KgNafy#KdnfD}q2CfE~gT1vqSpEZqJbn4$6SGFaTNjmr zTWGM2SWJQKdTfF?0y6laflWg1gk8mZHnLuHP2LcQol-zIWBg-(@kA~euH~N?ZxOwa z*E8~ALq5CUlN~!-G~T*#W=F7DmVh5-`!)KAPUWL558ra@}HphUqOY1Y}lmkJl6T zH8KYs26w9%rJetIJM%1CU{=0AU<;1&&iRnaI z_xu?3l6EuI2qIFIs4wLHV5mNF`v=%y1jflb@WPum)y-~xOddj4IX=P`h9Vyjo19- z@&uHbS^K+a}_0`S7@inW7u( zGoi=ui8nBicD3|X@#t9I4Njanu!(Z(9av5r$830DxpD$9tuATzJHP&*7-lrNNY|Ad zG~_s)m#`>}liPOFD*@z7daejgqxJY5yI!J_Q}E#Jy_Fdz1gyaL4Iu*CFXDSKORFi9(??k5wp=fJ{w2-$&sb8CYS=|FSk z@cvU73Pcuu<+8K%FW(*S7re2bR}SiUJ>kaj&8Q9{p*!Ut=F>_(a}TWk7N^w1u@Ag&$u_z)*d~Z6i8ff`W)Fch*>&+@+~o&zNJr) zdg-i+Sl2Lyn|VyNgU4yDXk#0$l^^}YcWEa5j7r_z#_IZmO?*ht`8H}rhrZyBC0xON zM{_QQT~oI?*hc5gTpJaJGz(9_z=_d_gf`rJ<%8>rcCJ8r{^{HNFa<*j#vrWM&M)2G zjoIth9!nl>^nsI6Ds_AM`sQK_PNC%d31&#y;ytVp{VYDop`NwFl=4c(W6$erN0@uR zt*btc2Ldv=917w!TMJJJ!sRf`RWwLZocwgjI!T|G*52o$KUHCQMRe;0GA0WZ=J}jt zJ$%0;)jQ+M_ml>*g8S)lZ=5j0`${ZPdy7xG4X?54rZMUoq{Henrdc34B7{{GMU2hCyRytIo;&KRW*t}*z-={&%(qitY2O%l_qs~9*_x>; zF+{pJIQ``Gz9w(fONJp;Z@UA#I~K2~`dCJ2+@jP&Sia|K2a|9M(pZ+1tVe%5b*T_E znPwU8+h*g<%h!Bj1ABv)JNT3=n5H6fU2!v}V3~-ng;U+WY);{Fz35&7;<3%t-1SGx zZg1-ib2OSRhDak;Y1YkXZ+@Uq*zxts3genzks3GwL}VwTP*nA9$wrxbS}U*n(VY)8 zX%6n#N%14P+ekg9z6=+N3ER3xPhlVQ;}L4?`gs|c8?9i9B6Jb1?Zz?ESG$on{c}C1 zAOS<%BuAD$X;2NFiDIQjFQ{7_%TtEDvC+bXW$we%1QUjLX)_waUszdX$(edf5@Ca% zM#Q^3+4|yc+G8OKa)R9dxUH|~yy|TOe$Vnu)3oD4&;ns}dh$~+EYasd4b-?54Zko_ zB%D%KaEsuBRTm_#zW5p6QfRwF|Lj@C 
zvhiy93j^Y>zsHCotW=p8cij8)>F+Or3yYs2tZ*~%zvTTh(3G@?Q9@14wFy5oMh0FZ z%!0=Nm)bLoELO&RcMO^6< zBhS27+3TDBkn*A%e%VjjKN$S=Nu(K}XI!rKYenXN*7n2M9TcydE$~k#h;$pVqAI>$ zRz&-$3S?+@D~Sp-ZN0wVvduhxbm;qUQzdi*#JZd~kJMkkZ`^v@(%N6%U(&DMPnf<$ z-5Nly_wR3TRTPPl!Pu;T@UAn$02l;*IXba zo&Nr!Co*D`IgQ>cu8-j(CjbD!7|U z;=k5e{Dvo#ir6fl+@<+#4Q%nCE02%zI^XU5`|l`8>Veu<2mzqe!r~^usb}QtYinP` zVlApo4gz#9QUoU=RW#_TH;cQ@j>A2HN*k?)^Hy0ZC|`&^_r(gbpRyZSwRAt%WJTBi zb#v{Hf8Eh`UFZH2zZrB3K230T<*#?9M70Z~cwYWUtK)_pSu&jihc2dy7XW?n z7W7$IL_o9uBT(X`BGpMa!v4#2U3OGXUc<18c!<`%MRor9Y41ZTiI@-nANJB8_%_OdF=w3V7f*hkK+6+``0;52i z+1UH8%1114hN*E;3qb(r)NtrlzxobNP{iA@C~Ugvr6oWJT&(8kUcPlNX2`Fk z+z&i{o0CT%4YW4w^8r{SyuD-s97hP_Gj#hn0U+c?w$recmeb=zsoy3=LqbnnQgS2V8>y<|afC(aruen^ zKeiWy34J#wD_{AE-Z6cBF?8rpcO^7QPFT|&iA|yh{}66i&SWumnjni zv}>%G?)msg3OKt+RMV_T#1;TKTK52cL(CR%#6$(J_CiuYI>>+p4%Fe|0gsakDAg

ex2l z=6nH_4QwZ^+wSI(h5gwYGBmRl7Bh_e46sLM0sZ@T*sk7t2q$@=8!OWQ>zo$m5oNN% z9v&CV?`$uFRqPH87jzY=3c*$TT->;LZ+L9`V_U+Sg`4i(R+`2cLakngWGp z>uOL}5(wVe*yyUV0LUTNZGf)#3%R`9aSsXELj{hK3^8Qv;`*+6VeXwqKU0f7r<+cM zCc0~QYcJLc`p*>#@NeGNRI|EmJNz_h4)V?>RJ&ceSF8yyqZEnqK%L6^E@rd^pf3!C zMeA$2ZJ&O}yWG>7<~!e58WlWg!lrC6*9^8T1!s4+N;d0)J++#Znc%j~Mw&?e9oO7EBIxrb=InDdgq=47UpFf-St@$ihU% z?_}%l_IvFo9U`lhPZ2G<`F&P4uJgv-1Rj+{xRH|=6v)V5BVW8P1P97|XE`i}o0Zr& zL8&DRTFH&K@CRjEXi}D1FaBx)j2O(E4OgT9`KrhEp{@xt)viGL;6=k}kEMapc%OWt z)i?CZuFAT}@xI5XaLszJ)2cUuA?9LZ!6ki%#pbbn=!dpbP+142^M+tw5A&B}$)joP zw%PciNcnf0N|RdwW_DZf3OK6g&guO(U6%djZ0kk=8%2P5&2fdE11;0AD5F+E&n0z> z*eD}0KxYZLxF|J><(Gu8@@*{m3@Xb&ON8vS%9MA)&24;nCDz>j>hf7Jus6KpufnD0 z$5u`d+&ZPuaekEa>;-NkVKV+mj5i8626&niYORj3nJZ-Fo3(>y zpuG~Xd2%MAcNK5MyHPRiWV#tg2FjRW3EsicD(B{7*TBxqe_0E4EjAnJW9#E!xh!`* zDoG3i?emxuZ#_(=Qn`zi5~zNeFej7nfMyx{*Ta{3H)fiC99Q7;hQgv%aU?gzmtrb$ z65wqe;_klN1du}tjn|RNDFA=i9scE-rS@b#U@y1SC>3@7VBOmwOoxczI#5Ob3Xb23 z1p*j)B+_d)RnJ@3^ve7Lqn$shf|g!dfMS6Ih>W&_-7+HoY{+c1Sw2>pn()CTg0?GxrS#Q$+sX24bTJd(QlpAYuHbo16@n*5t9 z{{2VxSP(drpoa?rN9TteET2xe@!en1=wEKma~WqgGdrd^Qi}q?v-i=iXZB8PfM7)L zr|HN~t+L7>YWWii*>WSgWoPp;BfLqdNli2XQMcR!PrS%lSso21?cy6fQlE zqRK;uL!+X4qnxN2A0QKSb2j%qD@X`>^b_OX?py=yFD(tK zJ+@I*scs`pF-^N)Tlp;~0+QHToOk*x`?apOVI~X#W{vT8D}h&c6Z~?Vr%W0;6HQ^c zM6K+&WOdzAx}qKf+KKsuSNd-G))y4-^_#%W5kA%@{1dU4fuGzC&Tsp@N`k1ZN{Y|j zKansn#v)&s!CBItWpbAitZ;|^*}~u*v)tPFWC}d5Bu~s6jzAAxEc?9Dx6SXUQe~(> ztn61O8viF8k){OOJ-ji4QPXXO`!&fkI|bQB%b}yLrr)*tU*RA0)B#{kI>;Lh4Eo^g z_Q_98g~qib|3{p;dJn8Hk^yBUc538dC4N%bm)c)%{_DY^aETI$4?aox13&yeQpwPr z>kqgM3vpkVN#ve8F2jUIxzK-)7b*=di|cV?GP}#x=lx>TR|tkQGonwtvLDEW@ykU0 z*(Tk%ddu84+7ezdQjLP!U6m%Ri&*UmUskYIsj|_2Uub+6q`aCUV2Z&?(YcEF&)6C4 zD-t0vNjG@>=>H@>|Ew(`XD9}Y7EJ%2J%);NuOHDa%^$i81+k<|U$Gg*5jrKb|CkN& z+lUR1Z+QuE;;Kmn)#>aLfP`|HdKYs`1B85!1rb}@E~GwsLX)Bl_FqkQjY%$A9Fyzk z86+^PClMZ&v^^aGv66}JD3B5H^jxgKeS-0YKES(0en=xn~2KE5bdzendti1~EU=vu^QzAUUZ; zkM!|t{-ef|)%ESnJ?L=+K)!TAhxu&mk=)4=Dk{aqrS!(zhoQAa-+R)>N5go|k?)a> 
z?HFo6vztT@-nX(orTk4ae7Dmk;dY_S{trGlwDyL6v68oiB_G`hKuNmI84`?X1SOqBLueLHZL% z_?6`T6ANZ^q~S<#%i3dG>`>k;WCCtEeawm~7Wq@J4Tq+94rGY#;&nLREr%1Eo#Ns(FWvAIRW(u0vM8%}?pShx*Y1t&mE%3D1A* zCSZ${#K0Tokx6_1+hG4%BEN=W21(&$OT6sB-<;#0ZOj8^V91*C?tc$u`vlxC^R`RO z|Ll+<+(uY_FkaR_JLAufB?W*D;$XcpVEfM${mnR`l-Dj!{6Di?XhlM~Kmc!I#{cXo zJMf4(3oq#Zd#F`VQ0dd2JY^^O?|IAyIlmSOjl$p3%b(x&5F&(J;QyDe1T8Z6u8aE0_3zWkDyb({wmC;ozAYVDR$p3n@b>P~b_RQna#+6dO9BPqnNWiK zWFg*6ZWHOOar==_*En19!+pOEak3Wfv{me08-E_J?0??nuO($p{Uf#CS6SXmCw;8Y z-!@O8_#;LPBhwQPq^ae%6WXRnhl4r8#W4w_D^+VB>HY>zzgOc|FAtJ$nrV1&*k}MBQg56Q1x%CY&JLe+*f~>E?xc#~Poe zb9HIL;zgU}%Lygk{0*v(!%BG~2I*+6%pHj)dzBy%tajOdTj-sA78KUdFCbY)-4qQWaG3azecP zn-fz`r73l0jpVoxMtLR@wz$QD8%1%uy+aFk{DyVu;)Pk8c-^N0^c8pS(#lsg&&Gdx z)?O-1?41^l#RQypZ5;(6-vR|&1Vk3|CUVF~BKD6+-_FRx`RXZD(0_)>3X+*_*(6o@ z!$53a9(xUb8=#(+y-3|?Naap;9#;D0l~_;PCG1eTLnc06mN(blM<&`wT%fH@CelaJ zo^L1O=#xu*i1S!^yg+726;`mI0uc+hVB*f}UV)jpK8cq$Jc)*4{jdo?_?0}2yEa8B zpJZlaPcmI^x_zYxYGeq3Ak z#0oJD2`#twYC*u=qbJ7=a0jw4H9fsYZq~-zWIHwPF1NWudeU8~R+b)+1MdS28ENP} zSjZM;nL6^h1LRp#BePsY3M!YgQBx@k3LT^dh)NsZNWY1q~` z(xHMO4;zU&=X2?;j~csb4KMGc=@Dq|`w!!Jr_SJO~u z&3vgaI}fi2dyl8#pL+g%$-6vAVHB&dCpgmxrcJ|}CBvyCF>IRZazQ^nenHY4yg}OU z`RU6|xLFOHmOa`=ld`|0CwbaFVMmJCX1oS|o*rj(oZK}g!y0m!wv?8#jP?D$j9FSx zV|Mlr!t_XLUXJpiv^4mI0Wq&bR&jjeIsAN@k0tN(=(VYU*~+2GTF}MS!&CFQH_{?f z5t~}-_8%m9RCowBpQ23n?KnE?3QT5h)qI*z$R{~nyo~+!W%2&$!``m)ACuA@4LfV? 
z$^C8psh_wbOB9G*y}nI8o9I4Ha}l-wbj>^##?M5iT50V*?venDY3uJ-_!K|loTlfg zK8mfWiyh)iV!pLX!vxzzAsJO6V!S*4!{vL>xDK6k_;keN%#OH-x!g}R%Kx(WrO_Vo za=bZY5}C_61}l9sT7^GP8x6;u?4PVL79?NJEQL55`u4n8>1liaJb|yR&1>unkC+A( zOXdR_W!4c+ljO-)T2=O3+Wm8yPhlzF67e@x$N4qT`6lZ9|4(=C8P(*r#qTPj2nwhm z7OK*ugGdd{1_Y!ENEH+WLa#x(h=?FHbR>Y%6r}eqpu|WC1Sz2-B_M>}1L422_t|@# zz0ZgH<&HbpFwyQE1s)*wlXlgBGn_-Bkk6M zVGVxk7CtirsuO)0j7_(Fl#QV@O#2c`p#+wb$|0Kjp!eoKG&_ONl;VW>xeRo)Mg z@clX~8~58V^H0fH(q?NbYN2Xp)F%+(ICe&uii-?P2Tk{@6hW{=(+w&WnnFnLAE4CW z7o_43KD%*m2o^bh9qp0}BuE2x(G8hq5hxUITEAh)7Zqs0?@r9MaysI3Ts-DIjtWxF~>dpMFt%zt=Pf{DTlNX;u-e89MBXyFd5BUAp6DCkDlsU&+`mOm34m!(`6ER08XsDR1&;262fFOz8;)6kzdo@MsYJZxhZ>L;OlyRRqot^kX@tkY= z!x4GC&s~&0`lM{h!JuE)Vhku6^9{vOp~&kHe|RoJ(h!ox4ap>(6|hO4GS%Qu>P8vF zni@Qjg%}AJvkCOmLE;2dL!KG%7YuclV~kXeiT!b+}eAG^gp=3h=ddlscw2 zZV5VDSC!5USB|lPlH6P!Io$R`>R(UVUF4Wods5~)Z z9X7%5+RgG!F++OW=ent&VbnBHwuQgqO4u75jY$h)D>dgksx{~(;mk>5NIg6MFQ3?{f0;_FlouKHTjh0zo|LkY zkJNd`#`d_o zrZb1fb^hc2f*8+BLp^Z@vWCNER|S#R_qux0`?s7$>El=ei#;+oYbxsEZ;6~)PVkG0 zX1|70o5g3?xdkwA=LH(bs+Nt3dNJ%>8OzD0%~|Be6DHKhmCFiwSlL5A%%5{?=GBNz zbIPA_cO-@7*mI+ZvqsFSlFrB#S8N-CK)JvNdu>HGl~=$RQ5citU1JI%CaDJJJUzJM z8;x~Mk}z9$7BCm8eLPMVM?<)Le^ZTQW|LQc$Jn&2kbAFdSe(0jRE;cqnrbJXLx^B4 zfC`pc#(^;Y+nFhp2k(EZ$VtVCdVAdrdaNIy%PaMw?rvrnrB8y_)%H-N1*ACL`|;!b zTq~O;7uaVj-gOVyo0%1F1$h1~Q?*p&BX(lAl}#5+1)8&NBAwA5m5=D z9mb9(ti3CUGx)w(FFP-{9N!L27+$Pf;T{v(lf!KLZjVki=G>__Jg+cEayj&mYoXh{ zgCMp`QCB67suZnYu|E))S|wQ}w|5QEU=H$ee);miC{!Dn%di^iX5Od%{@7_SiyrIy z>e?6}jqheI02^?}xE&>*-$B}A8tJXzWa`w+K~`KtG~+d5!c#Eid4*EwM^dVBv0DxWeUzwC%_<}sWEa9IA?iT|*^ylM`f#fad=&LZ=T81p#GkVjKMpD0;r+eZ&?>;QeUy5dH&|f> zeeWxVu`14pPZ2U#<%M_pb*100B+$}ke+7YT>)W@)(?2(m9D>lCjWg&{sbPh4__vLe zm#af1lZX#k!ap!qHtMGGH1cW0r)hB$gD<>dUPu@wK2<2k4J%v=f%^u!kPN5V%llg% zYg?GW;^XDa>w7t|H?eR8j;daSGPRNQ=Mu$t_J0};CAHJLGPvyWhS%GHNgquDv#4n7y{`D z=_ab0$Jv$ojF^1>Jab{sGk#J6O}C|{>r&&(xc#8krpaIY3+iP^%2vrlg;2M9Ur#Q- zrAUWrc}`;h%=(e@*g^D1>^rtaq2P_bEv%C9>D_O7=1B$#2Mwi3$0zvo%O?0+j$`?; z!wBR3~;WR{i@h`fd$25%nX(DX?8Xl_#3o*7N@g_sd>9 
ziobQM*_>d_K`4yhX%=YtfpFG*5aTC4ErK9$pqpBg8aT^0{@C2B0u0;&tG?zF;uG~= z-`S@Yd>=!FnL8A7KQS@T?)Ah@LZrj6xfLpGUqAy(^{h$Vmnc|zy0;&du|gr)rfHD- zFg4coI%bWAh>KUNO1S`zATuOjIeRD10dKJTlC0SX^`1*3^U?HJ!T5uGx#rldqB2#) zU$$9PiGt+hSZ}cI6>F0tgFvFp)m!e=m9SR@G!1PxSfQ5BqSXrpX7ji^uxw33f%29S65+o4Y~dvO>}x2m8atF1su*WF@d@F7ZGz*ucAffu)@I#i(<@K0 zAnu=VbJKU1M0caM<@*CB%1tsRyxiUod z>%;lBwCtL`o`_gA>uj~pGc?G%fjxR-?9b4p9@5$Cb;*%gwd33S{iCI9!GkA5H1<}F zCOQ*AiBtX5X!@?8xi;@}->(Nfmax-hVpe`fc4r!Ya28Hmmi(49zRI`fUc_Xg6DDto ztO%OJuQ;?cob`@9h+_32o~TU5Y$5Pg>EF6{B7-ZlpS@-MeA6kDAe7k1>)M^TbgN#G zri*2?NOS=q*Ax!_4O6*UoZnhV!kX&msOnuj4zWO~ryu(0ue0Wg?)nF5{Kr*(N%-(Z z9)U@@UdTbZ8kuk^8`_7 zdYI2Q=)ELdD_4DFi{2_><=j$}f(q}@z+BNs+6|qY(hhIuc9O7?usnMkdGr>m$i=F= zI@ z^6rcoWfz{9v!AiXr-HQfx@@uoRco&n93ZfPhA!Po8@>Xu(akDdKUOdU z+k~&Bb}tnM#c^o4w@4|KSS__;Mu_3FQKbuy#kM=?@ePE9KQ6(oskvfh>Z2?bdZYc#5nk5i*7W;v|2y48KC>TvAL(NK0CZ~iFGBxq3QUf zTh)eU>!_1>DUwGfa-=+O=6Pt>z8iWrP?O&wZ*fi*mkaG~i(37S82NPwjH_4Z*a}>) ztIgknSUOYTG!cO4OWC2@D{&s)P4&3CV<`i^qW69Kj!!}~5u@$2Ic|FkF3ky7FKKH3 z_E|60b`C9fXmTQ!jVcYF3X(m&`Fj-VnTGjv?mN&I5>achvusWeM&Npq(^umz)_Mp8 zO%Cb1rAKQWTTeQ8$OVyt>n^>($*C_KZsl<2jB$_XB3RiW$Er4R-u~(t#pyw`NLYvR zy9;uOTbixy0!z;!u2mvX@#!8@UVdrwC|Omv^x5A#1iw_Z+2t$`xN5LBTr}REuF3UW zCu}7#-!t8Ryba+jH?(w?XJl%-j#WU7&RG~|Y6BUw%qi!B)V3_8$r4ogq%qdBHQ%x= zoZ6anYqwe7Peh4hUxtXWQWEBAC1l z>$;zdFC6sIgwu2l#8oHsTum>+neo+PkE^v9e$g~L^Fv$3k=pfT=de|Gn&_@YnB&BEc3-aME$c)pUhCY= zR4aTza$&A2B$U+Zz{?xYz~Xj)>)i(RmSzgcJN>Fj( zFnh&1n#~>En0aDEEf@$`=yE*urK&dQSd>K|hN=^&QvozVRTd*+H(`wu9Qr{ccde26 zdWEOC5NCq@godnwa@oVWw;nm0rFz{j#TRa^s5V8>6qX<{0=LVMM`kXqXZ8+pdj!ip z0*m9N@1g#O&%>(vMiJ2h)t|l=roy>4isQ{yO<<;a?(!iGe`S%8rMXo+2s6n-2D)Qd zKVdHEITQT+-aMA0tneJJ=7PxEtG38IkX14_IN-z99ITaB3Ryb~tSg|q>nmSXR9fX# z_b3QPd|y$_bOa~23tqalV8BJJ6*{`2kTmFqpnHiYV@dE4RN=aGuRsXp#i|g3i(56P zVU1Bw)@gA!b~*mY_?Nie%9B)mQ@q@R*M6e##k_mUO10aK+0%iE@PzG{qBwpdTi;rb z5HL9|cB4a-0*J0H0aLEk_Te+QU)NEhBQ&tIiB1Y#%1Bd(9K7DLoGDB$u@Cbh+JqNH-zucsKiU$)TRTz4QrDfrt31k zbT%YqR*|nEtfJH^Z~g`vBjL+aYFrPcZWb=&7rH?Z7~@MQFt42TG1yVZCSuF5P+cs^ 
z3?iW3q0YbssTb#V$30x94Qk8LwPu3vI)Dd|HgdkxU-aI7s+G#HV=_O11D8@&JK>bD z#7K2UypM{n6<;qe3Be=`dnI7gc>OtWu}-fpR*7#8Ds5~pl_NFHQR#&3N(%*2218NS zc^clrt4`?6Z>`9*lVv^Yh0&kb1meUigazp~SI8 zxwJdZGkZa;n^CdgIizK3x1VO@NhvE|$nFBoi0kI(TqHCD@87W}gJk_uJkxf`&Yl>= zxW1Y>+#P#z?zj}^c^jJn{Ls=*o<*BH7Z`g?LViZH{=*{HtcTo#9kaL;+7m}lLo$A# zh(jk3BaRy~?=_k`L)*mI>EVeKn)pB^@FW^WNZzT zr5~YK6=Q9+$?L5U{1pe z?ua@Tgn}4zyUfQ>4!m^=FpH?&WMb4+Ex`tUjaY*ihfvjtI2ODmNqBMP3u&AHjR?a| zBbqnZxlWE49Y^`zX%(P4M0&)}c#r}v^FZM`H6^319?SxfkCKKDLqv&c%Pn~$Vuf7_}(d~Z7g1+{S(x+cAE!bgOs4zc6skm%@ zdu<9W5~#H`{SvOXf(YQkX~rEiuIs_H&FMk=rQ+X%glS8t*y`6%M?Q44$}p}H=e^zOQc})Xqejzh zx2v$Y;zuIw$)I7R>nGjQq9fzs2)SBofk#d^&9L50kMQ14`9akGxn!-W|27C`G$U`_ zGX_5LkI%KShBg;!vc(I=N{lD+(mvm*&l3FrC9d#KxTnTfmkaeby?bZKV}#Ml)F13S z;AS}SpEFXWJci_)Qw|(v*KpJUP#+x{rZlsaW=`5OR27xeb{k%rBX_WNx>#>IZ=9Hx zagJLY?Cwz1Zpsst5j=Ns#aV;T3Xj+Qb8cz4$%`b(l_QOq<8g_-|IAr9MnT|QeRh$6 zeTI191Mvlk2RSQIaa4b=_34)D zJ_9$#Lo2?)W29Lr&Zy11+hGt}!hia#{?+D+AJds822Y+ZbQxo9a+Kd4xk}n!@y%LP z3;0X$DrMp*l|Q}U%?{1MsKQ05)K5q>?^sj87X3~gy(bi6Sw3kQ>#chYyUp_S@xhvD zXKd8H`f-nD+-q)+o~R~1nhvm?2)lj z8A0&kxK1SfDoCOG&xx?dwIKVS){r=7HH>P&lc70mi>_F?rOA&2)}?C;@C7Y!UdW8S zg6KjykeUpNMy=|0UmdUqYHeayZMY%RMLWegx^w$AX_eg7D~{-iMc?=kgAC5EX_!;AssOik|9nnZ=G{*Y*&MSSgd4w<`O+4S(J)#=sF;o zuM=Cm**^JKw>VxEsioyrmxOfC5o<7t`ab$~cgkpE59qx(S1eMlO)ln*hI(6HRlCa} z3j9A=)X^D`e`^sy(HQ>Juk&yrxA*DP3bbkpTOxgHRWbcw)vGkd<~y1gP&GV#rv`1dHm!8HsanWWr$fonLkOJZ-5OC3-!8>p34v|+4x;` zXt+M6G?I(xlNsOIL1f+`zP!FfNz8D-c6NJvTgyme+7Q%?@Nq>9s`j95W2yuo-|_PN z>}zI@wbRL=v9Xbga>Schrqts>$_(@Ruj8&F`SxyeZ~`+%*RG;08)x0gEqeOgIa{%y zFn)~YXcD)`$=dtttOcw&9XaYbce9;U(`Pp??Pz45$+qW)luyJmB|6lrm*;qK;U(^@ zzoj|A22*iE`U6Oo^a~sdchGZkM@*q}q$^pGtXqfct^rQdW(y3kB)_zcQh8aghifZi zYEST9uAlUXX^%>`YnB|jAm#9x;<$k8{$~49= z^kg+K+bB#`@rUTl;Pv>ga#>tKc6w*G62EM~gX~<+s6^X3DYY ztarW{XcCk{iKoEV*l@a^TVih;U^`_%T9^GnY)j zMf6;QIgo=($M%P8(N)RdOpOyY z*7Ln{@LP7rflvAqO?$_uF8m1gEZVWMwn-Qxz6ymvAt6!j>osl5R2e2Ws{OBl2SsC( z&bs0bcUPLjd95T7S3P6}$3Sk~mO_l5`?tYI$=(Y5>?_O)U?{OK6nRO)hdRUeekbri 
zAD{H7-mI9xfZFjfRgilU*{Jyr>(cR@6z|KzJ-#8CXKe|9?$NXc|^vU4hUPPiWCbTjIr6z+( zFReP^EYPpO?%otafXZS9fA5B98d^lTkI~FBGeFNsQFZdcNHGutH$Rtj75;|!inN(4 zet5!$&T-ms@WO7BF{$*= zOdnSKJLJE=HVafYW@s1+(Jh;Prqa<}a5Oaxt1yU>M7-oA%7g^_LytppQ0h{1oNeXR zaI3p)%mNN7B|YA3S<*@M`BvrbwkitAtk2lJ6si^w$On+gD3J=??hWVo)m*<{x`d6X zrvz7=aEIo?C*AKVEzhgk#2fR}chofPX_KlDOz{XN>)^H}7istl3a1?o;A6J6wj699 zcjTRI`<8}p^XS}SNz*E~4hZZ$PW+WO;vf4oG+~=AV-zj!u4GeqSQg z`Ft2%xv_f@?OiN%Q5JJ%Hac=}z#5N(c_kct!(U$z<{PSJTy=#kjWZd$q!ZGHy({`w zQmpdEw969gGNLcqMa?%a4Oq3n6kIb?JprU_#ktFQ;X_kp;DyoZJ;}#%rw+F4t|!p z0tkng>=Jpf9TZZgUuiO>c{ciCRU2)MlKfgv8j&e`gKOFGzE)U$Nwtm3#3lW1le;B; zJKEW9w{{SfGqV@iOR5I@y*dP5Q|fR|H2{p_yK)9Ly#3sbhT5-@IqfdoNe<^M{H(Ir zVJE?1%LNKjUS0ak>&N~Pg~2w>=kL8?bt9(pyG*$oO&hBYe(Slx)6CEM5_*dwe?<5VYjW(JJdMvJjNLw| ztIRQK`m0^{hwSU$AZKfu^WvBiH`zzN_4)^`oQce^>GSb>wHu z1(Mc4U6IJY)5gA&Uw8l)i2R?=QU;&p^#8t#kkwd^0!8iiMGZ9H^_YTwYfL&4f=_dI zBbS9Z#olLH^L&;1^^0HE9_U!ymEtQjFgSQ=pZk82iHd#>Y7kzy-9p7Ks*rk%jpk@j zRAE>){W7~yUH6${qTS9SE4#eByuV{(>y3s5a`N(@1*Ub#FQDuq3vh4beLz#TEb(q> zPM_I?M@wdzJYvrQBr6M0SLFXFYmgW)7U=6-IV7O88@twTsZ<8EvF`5I95Vo6#K zwTno>*M-6~#O(pd)5HmI7is}Qwla13nnp0SsKW22ar-V{WhU)wQgF0Zejs0aF@^)= zV8?iZlt81Eso%qo`tg-XXNxPaYSP{My>^eU2OhsF<7VXq5JVTxw0IXe9gBzRSGrH; z470IIcV86g$mOx{i?2;(4t%rL`?H{SEoD$v!6ltp(H5J?*-_Ou!1)shknmwXamU|ZUoQb>S8t8$^z|L8L^g`$XwaKf#r*L1qrQiqgg;YHb0U%5yQ(V9 zq6*kvdRYF+ZueB5;5uORQ?9%8X-M4%6+nTx!#_Vox+>NvmTUsMlbt7WnSEj5tu{lu z3P&$VXgcfAKPLC;^S#qRK8QFQ)xYt>POX@>V5sW;C9_)rGRf73zSIq%0;m@h zps;(JbdMH;Qlp)OEBINZ1O;m+|NaR2#(qg3LgnCf=hE#97Jd~Giql!oJ~BVj7MivM z>>N_Sml><+)jVO|CYJ0X^^e*=Kn`bKL$I-)e4w;}pc+QuoAb}2qKRy%mXwkcjV!jK0u<-lY^=?f^s&&9dyJHuuD*vO3x_iqJ6{*s* ze;r)?os@|JNuF#|6Mv7R0b~H@oS2Ma*lJA{#!(#SQw5z@CNa*##=?ylq3~zr{PzA z=nMb2j4C;umeGq&U!6M$mf!#Ap^-TM$soydySvlTBij7;?_`Rf7FCeV%J}%q7yK~X zhSXgoC$zFFb}uVXJSsqss4jY$_rY9w&q*(2$9({_<0fu>e}D-2aFS`&ck{zM0KSE9 zPXr#7&331V^O9=-KL8+cvx6+%olGN;^W1e^y~&82fqh2=``VgZRxbHu9dk8^zCTku z$Gv<&bU2QdqSjRS_oDrw`cwg**T83^-mhT{SpZlJVJ?B)N1gvTC+n!3mCu*7)_Zn+ 
z#^_9tT~VL$dV!>)#OzIl{2XZA-yeHDi2ajJF|;|6>BrL(u!oYTjkZBFCbmpuk@-36fz2xci!v;eJ5G{0S(*rF2F9KZAWWddqIOsHb=lS>CHGgz)O=r z>&$R+clLR(^4<=Z2QEK-yI9<;Fl?I@`OBkuzrQ&ACD~Yg09;ClunoC%FD=>;^wW4# z4KQw&oBNB!s3#<)Of&E(2-mf0k>rGJ>c)GlBey{gGV4!1@iUYS3R|=b&;@PS6Y5|k zyVG)8KOgpjS~*t#y$Zt~%x58|7qs8G(g-SndO`n_i+EoE;k*ZG5cm8DTW4DVyO^bM zYrv#e|B=~6$S}D+Pi|z1JE&@WeB2fk?`x z@gqp-6!igniSKZ0BR@MBH83+;faI5W`9RHf(GOf z$luR-kMLj{iU2Uirg;G4j5aWOmS77gkC1{~}ovl=iY zd9+4{_Z>~NUZmg$#Phy7oZrn#El;Yb@yPcm)?08?4T&nyT=2^E7xZq3pZ@KRVCtsE zhico%(-O51FZna$;1v(ww32Y>7S2y1W zq)*o)(np=Fr+K(?G@$5(vEp5N=s@e;eKu`I)QD@<;aYTH1akekN}wdpDh?2f6v#~s zhxT?hH}8Qu_VC5ln}mRSl*13eJ{lO}~jRK2i-)A0$8 zH#2MmdeLNI7ELHIgX6em4>$WbAuXiq00;^B){!UQ#?o1S=JAKd0b~37fL!(_BaBV% zxy#J6!GM7-^@hVV5A4#tP<#^WQKvYDiyk#q!ZL!9_vIOxGa`VRN*THhDhdK9->~(f z*A7SpdnR@i@>7Tw?C_d*MYO5@aykGZZZNWG1G#b!tP;b6KJEJDI(rYWc9hevkN_Sv z3-DuQ_pwP$jLq7e_ldREM{{#XukkB`Uq$EyDvJGP1=f}}0D+X1+;Y}6AOv>cSMK;D zp6`zW+`O%`l^O511uMERcP-D^UIHSo{`*5GUa-Y^+!4`H`O|JH*_`j8bbR(Oj-S`g zB^>YtuYnp@)VJCupao>QNOQN;(|v*9a8C`}zIkBiX6tU8%9=++YXw|D$rMIwrOU}g z9rb097&F-OKc>v5jVF&~gj(PDRT7=yJ(p4o`ece|`gBPQZsxfL_=p*Ozn*9FdD;?~ z5k=y(@%bK%*K9>Z!&4p2B%5qn$zv32mdpuCk4D%N)<11YPiIM&+7W}-#9o{iTj7g{ zKCsa?P*mKM>63C!Do>K2VEYPkvB0kLQ~Aq;5-1O}2`GR*63aTpy?WE3`Fn@R;<}bW zZ9vPxNXO%W#^3n`%4bAONwaGKi9Yi^?|VTnZD!33=f#N`4kI&(VN#5`xQX_$25d;O>S2>N!Fhax4lDdGw@ zNqg@xR}OtUO3SKG27fZUzdD;~Hp2&s#OAFiNG_iZk1Av^7X{L$D3?Xj@QEHSC|N3@ z_i@-eKngBiZdd8hmMY3LZl~&YpRUQ1;9ZW$uC)1h>EF9(_G_@c+SR=b|7(l>dlNS% zi-rIH{YlRnC6kd=>^zxpybHH_e|(t=s609fpqYD1p0s@c%)OkfJ7$m1w5!w!9|QmG z2d~8zk!-yjL7nQ@iFkSt@l5>Uvsid(40K^kaWvx{rOcOH8>tccBRWhs#X3=Q=+b z-Z3{f*G$R{p^Z?A)-3pr`=e{~bH7n=qxw0#I_^MroPxinUlnXuaTlJ>oi$$E?LiQm zjz4Bj0}_<{<$o6p%2)gdc+=aXJ@4+xf8-U<7+`1rkK@9(?9Q*KPXi(x$zYd;A_#202(0ExFsVkk!c*$^Xi%(tvio6{_hoMg#;^Zz&CmOe`NC*(3y+Ms;B;2!IH)7e|(@P!&Lt{ zP5E2kdIF_a#C+`!7|Vaj-WN&(VDZ{ChQiqYef6IjKoB{-vifV8|NT)$06CPl#g^;; z@2hJ|0df0(&p`gVsM56{(?T*&8V|?@(WAQRLJ1G8Kcdfe;u^XdnHCop&-gza!Y@5i 
z$wUo$6@u0p)nW>%ay+9Vw4E(o`iP5KI=9NnM`H>`8)pg;l)3iYh3y9G#?G!xLO;WB&w z<7`@0)b2;3EW0h3bf-+4yShK5gGna`Y`YBJ``n+0;M&Z`1L2Q6+ks}i+6d^%y}+k7 z)AzX`k@$iL1D!=8l_M^nHv<&Li{f7Uw}XH2rtbQERxPlla9=Ed`^~p2-*TUO(5|8r z-xY1;cPqp7Hg!Za1qW=L_51E+5FgR(STl4;g(TMfIRQJUciT!oc%w^f(y)t^3&)O~ z`3SvtTMsyVs}AeQVwP@zx?X5_-Wp7DbvPZr8=OL*~a;4uYxY3>={1b*R;TXF5C zF^WnYQ+YPD`#}dXwtGp+4j_X(jsesY_PU?=D|4d^Bz4C5B3r7(2`qk%Ob}wg*9xZF(HYFL z=#8-VGpeTUQ-dGCu#LnOek%{{pS2@fqr#O8gQV|e1OtXVxj*&E-*f)lJQ)ncI`~ZQ zf52VFokH&np-I3H|}0Q3JBuCE@ym92NO%rjvgRiXsDxT+Nh2 z{?2Rt&pR^v!HPH0NL&BM`@tg5oF>;_nVNq=PK}^Bt+cZUMq!?%VeKZ=^o|+ zhyT_^k(9hw;C|F(U)qsvxGiH4KuO%Q2c!cz9NCr8(bCfL0No$E7&1o5%{l*oN2CI@ z$IZH@yMH^C3&0dJ0L}WW7lba%C0VJi^*Dg(W6i*Gq)9Fqo*Bv`!qld zOlRRj>h^Sp|C&?VE*H{}=wh_Q9bvtPJHAo0cSsQ`b>`4)U>Xm<#(XwDsi){M?arRSdi z@03Aj zN~29M0)@Key2THGH!401*ctm6*DvBsfTivJ2cqSXnWuh5uQK!fKb=B9ER^>f%-7d9 z3-l_*xU)gd*;0YE0r~jQI*+Wb)wFal1`L<6zbXDRB~!H~fxr5eb&?#d+n=Hjyi5SU z1C`!?*s&wtNrrNOW}6Uz<4EBGMVHN7T28K&_ob(hM)3-lymMuS$Q&QA zDf{<=rc&&MWk2|P$?OhA{L<{N>GsICaX0`D?7vJo3<{kiUU{@Tq&;9HCAkMA^HLW00TFK4tIHe(IMAF%9nn$B-l%N8xjm#1quke+(Qa2?W1lHtSLjAoI!SqF&wzh?U%+xY`*3g%%jK+TG- zYpI(B?9KL)g9R7`XVAM~4Y354mg3_o!t4lemx*%Xy#Y_K(w`+N`Q-Gq;L^^$wIcIz)d}yqi-w;ti9TN3S>C-rlojZ{N-N-d>{zgp`p>jsv z0gINGEAa93KlcCtR!RVN`v0?k5+gyOE>x*UW+B-W8nNQoc7ll~)Tx5f)6?h0Gxi>A z0(^V+|FzpRRBD+8Y$azoVKMu4E&C|Dp$~9Ouzi;zqBl41e*~fR7~7-7wO*1!=ASCK z2S@4FKLi3$=_Y{7Q~*^=<+(5qKv9ub#CfDZubc23=|W+RGk?y>*m8zZAQo#q2aNLa zE4b{~0C(ey@aFOKZ@aeRIivAu_N?s25M+iSYW-t zJTsU9P!!o`vZSub*anu-_UoJTy`TrQshE2n`>h*~)%W>VY#9OhrEclQG!Nt+ibI%#1YQRnjDrAhe^tU;?(w z$sxd4>!A-ogGJb6cC^?{iYaDE0$`RcIf#>J_%tk^1%HZ3%Rv7^%?v{Q1GovfW!`p` zgr2K05o^82-Uxkvxz5ytNF;4?B&xG%a0Nr2mAFUgwzWQxHO$)fZ-ndeudqW^MUgz zHg>@gP3NOq`PPCXPK?lXGXLZsx~P+U(g!lF{+-7KWDMAZm1DWs|E<@)3xXFq;X<_k zqp_61k;DH37rBi681bE)PX$mfWXB_S4r_4wCotDK48j+@u7KggA!$urP7|*a4+9m` Xo|@jUdTMtJ{Da=tyjOI`;^qGVU2}H~ literal 0 HcmV?d00001 diff --git a/docs/proposals/scheduling/20230314-gpu-topology-aware-scheduling.md 
b/docs/proposals/scheduling/20230314-gpu-topology-aware-scheduling.md new file mode 100644 index 000000000..98776809c --- /dev/null +++ b/docs/proposals/scheduling/20230314-gpu-topology-aware-scheduling.md @@ -0,0 +1,275 @@ +--- +title: GPU topology-aware scheduling +authors: + - "@happy2048" +reviewers: + - "@eahydra" + - "@hormes" + - "@yihuifeng" + - "@honpey" + - "@zwzhang0107" + - "@jasonliu747" +creation-date: 2023-03-14 + +--- + + +# GPU topology-aware scheduling + +## Table of Contents + + +* [GPU topology-aware scheduling](#gpu-topology-aware-scheduling) + * [Table of Contents](#table-of-contents) + * [Summary](#summary) + * [Motivation](#motivation) + * [Goals](#goals) + * [Non-Goals/Future Work](#non-goalsfuture-work) + * [Proposal](#proposal) + * [User stories](#user-stories) + * [Story 1](#story-1) + * [Story 2](#story-2) + * [Implementation Details](#implementation-details) + * [main steps](#main-steps) + * [How to identify pods are in the same group](#how-to-identify-pods-are-in-the-same-group) + * [GPU topology resource reporting](#gpu-topology-resource-reporting) + * [Node Selection](#node-selection) + * [Pick GPUs of Node](#pick-gpus-of-node) + * [Record the allocation information to the pod annotation](#record-the-allocation-information-to-the-pod-annotation) + * [Container environment variable assignment](#container-environment-variable-assignment) + * [Works with the Gang plugin](#works-with-the-gang-plugin) + * [Unsolved Problems](#unsolved-problems) + + + + +## Summary +In Distributed Deep Learning Job, each worker for the training job may involve data exchange and other operations. The bandwidth between GPU cards will affect the training time of the training job. 
Although the k8s native scheduler can allocate GPU cards to the workers of the training job, the bandwidth between GPU cards is not considered; this proposal will provide a scheduling plugin to consider the bandwidth between a group of GPU cards when allocating a group of GPU cards to pods. +## Motivation +NVIDIA Collective Communication Library (NCCL) is a Magnum IO library provided by NVIDIA, which can realize GPU-accelerated collective operations. NCCL is topology-aware (it automatically perceives the connection type between GPU cards; no manual configuration is required) and is optimized to achieve high bandwidth and low latency over PCIe, NVLink, Ethernet, and InfiniBand interconnects. In the deep learning distributed training job, the distributed training framework (PyTorch, MPI) combined with the NCCL library can achieve the acceleration effect. The NCCL library can perceive the connection between the GPU cards. Different connection types have different bandwidths. The size of the bandwidth affects the training time of the training job. 
+ +The following is a matrix describing the bandwidth between 8 GPU cards on a node, and the unit of value is GB/s: +``` +Bandwidth Matrix: + gpu_0 gpu_1 gpu_2 gpu_3 gpu_4 gpu_5 gpu_6 gpu_7 +gpu_0 750.48 48.39 48.33 96.41 15.77 15.52 96.40 15.74 +gpu_1 48.39 753.38 96.46 48.38 4.64 16.93 16.98 96.39 +gpu_2 48.38 96.25 751.92 96.48 48.39 17.57 17.59 16.72 +gpu_3 96.25 48.39 96.43 750.48 15.45 48.39 15.88 14.62 +gpu_4 5.00 16.81 48.38 15.98 755.56 96.39 48.38 96.44 +gpu_5 15.80 16.93 17.50 48.39 96.25 751.92 96.23 48.38 +gpu_6 96.42 16.75 17.47 15.89 48.35 96.28 754.10 48.33 +gpu_7 15.65 96.20 16.77 15.71 96.25 48.38 48.33 754.83 +``` +If a distributed training job has 2 Pods, and each Pod requests 2 GPU cards, then the [gpu0, gpu1, gpu2, gpu3] combination should be selected first rather than the [gpu0, gpu1, gpu2, gpu4] combination, because the bottleneck bandwidth of the former is 48.33 (bottleneck bandwidth refers to the minimum bandwidth of any two GPU connections in a group of GPU cards), while the bottleneck bandwidth of the latter is 4.64 (the gpu_1 to gpu_4 link). If the latter is allocated to the training job, it will greatly affect the training time. +### Goals + +1. A scheduling plugin is provided, which considers the bandwidth between GPU cards when allocating GPU cards for pods and preferentially selects the GPU card combination with the largest bottleneck bandwidth. +2. The scheduling plugin supports allocating GPU cards to individual Pods and groups of Pods. +3. Topology-aware scheduling is a best-effort behavior rather than a mandatory one: it tries to select the optimal combination of GPU cards currently available on the node for the training job; that is to say, the GPU group allocated for the training job may also be the worst combination. +4. If a node cannot place all the pods of the training job, it will try to place these pods with the fewest nodes to avoid node resource fragmentation. +### Non-Goals/Future Work + +1. 
In this proposal, it is assumed that a training job can tolerate some pods running on the node first while the remaining pods are pending. If the training job cannot tolerate this situation, the GPU topology plugin needs to be used in conjunction with the gang plugin to implement All Or Nothing scheduling; that is, this solution does not implement the All Or Nothing scheduling logic. + +## Proposal +### User stories +#### Story 1 +**Single Pod requests GPU cards:** There is only one pod for the training job, and the number of GPU cards requested by the pod exceeds 1. At the same time, the training job uses the NCCL library for communication between GPU cards. The communication bandwidth between GPU cards needs to be considered when allocating GPU cards to pods. +#### Story 2 +**Multiple Pods request GPU cards:** The distributed training job has multiple workers (or multiple pods), the underlying communication framework of the workers uses the NCCL library, and there is data communication between GPU cards. If a node can run these workers, then these workers should be run on a node first to reduce the communication delay between GPUs. If one node cannot run these workers, consider multiple nodes to run these workers; when each node selects GPUs for the workers that will run on it, the communication bandwidth between GPU cards should be considered, and the GPU combination with the largest bottleneck bandwidth is preferred. + +In this scenario, the following situation may occur: some workers (or pods) of the training job are running, while the remaining pods are pending because they could not be scheduled in time. If the training job can tolerate this situation, no special handling is required; if the training job cannot tolerate this situation, the running pods occupy resources and waste them. To avoid the situation, it is necessary to ensure All Or Nothing resource scheduling. In this case, gang scheduling is required. 
+ +### Implementation Details +#### main steps +the main steps are described as: + +- When pod1 starts to be scheduled, the GPU topology plugin uses two specific pod labels (will be introduced later) to find pods that have not been scheduled in the same group (including pod1 itself) in preFilter extension, for example, [pod1, pod2, pod3]. +- At the preScore extension, get the list of nodes that are currently able to run pod1, for example, [node1, node2, node3]. Select a node group from [node1, node2, node3], the node group can place [pod1, pod2, pod3], and each node is responsible for its pod combination (for example: [pod1, pod2, pod3] can be run on [node1, node2] and node1 is responsible for running [pod1, pod2], node2 is responsible for running pod3), the node needs to provide a group of GPUs whose bottleneck bandwidth is the largest among all combinations. Update the given pre-allocated scheme (including the pre-allocated GPUs for pod1, pod2, and pod3) to the cache of the GPUTopology plugin to pre-occupy node GPU resources. At this point, from the perspective of other GPU topology groups, this group of pods has been allocated GPUs. +- At the score extension, if the current node to be scored is the same as the node pre-allocated by the preScore extension point for pod1, then give the current node 100 points, otherwise 0 points. +- At the Reserve extension, update the GPU information allocated for pod1 to pod1's annotation and koordlet mounts the GPU device for the pod1 by the GPU information. +- At the Bind extension, the bind operation is performed on pod1, and pod1 is scheduled. +- When pod2 or pod3 is scheduling, the GPUs pre-allocated for them are directly obtained from the cache of the GPUTopology plugin and the allocated GPU information is updated to their annotations, and the binding operation is performed. 
+- If pod2 or pod3 finds that the pre-allocation scheme is invalid during scheduling (for example: when pod2 is scheduling, the node list in preScore extension does not contain the node recommended by the pre-allocation scheme, and the pre-allocation scheme is considered invalid), then the GPUTopology plugin needs to re-select a node for the current pod. At this time: + - If other pods in the same group have already been scheduled, it is impossible to find an optimal GPU combination for the entire group. In this case, the GPUTopology plugin only needs to find a suitable node for the current pod. + - If all pods in the same group have not been scheduled yet, it is necessary to find an optimal GPU combination for the pods of the entire group again. + +![image.png](/docs/images/gpu-topology-aware-scheduling.png) +#### How to identify pods are in the same group +If the scheduler needs to select a better GPU combination for a group of pods, how does it confirm which pods belong to the same group? The solution is that Pods belong to the same group if they have the same label key and values as below: +``` + gputopology.scheduling.koordinator.sh/name: "xxxx" + gputopology.scheduling.koordinator.sh/replica: "xxxx" +``` +The value of gputopology.scheduling.koordinator.sh/replica in the pod labels must be consistent with the number of copies of the job. 
+#### GPU topology resource reporting +Report the GPU topology resources of each node through the following CRD: +``` +apiVersion: scheduling.koordinator.sh/v1alpha1 +kind: Device +``` +In order to meet the requirements of reporting GPU topology resources, a field Topologies needs to be added to DeviceSpec: +``` +type DeviceTopologyInfo struct { + Name string `json:"name"` + Topology string `json:"topology"` +} + +type DeviceSpec struct { + // add a field to report gpu topology information + Topologies map[DeviceType][]DeviceTopologyInfo `json:"topologies,omitempty"` + // device information + Devices []DeviceInfo `json:"devices,omitempty"` +} +``` +Each node will only report GPU bandwidth topology information, the following is an example: +``` +Bandwidth Matrix: + gpu_0 gpu_1 gpu_2 gpu_3 gpu_4 gpu_5 gpu_6 gpu_7 +gpu_0 750.48 48.39 48.33 96.41 15.77 15.52 96.40 15.74 +gpu_1 48.39 753.38 96.46 48.38 4.64 16.93 16.98 96.39 +gpu_2 48.38 96.25 751.92 96.48 48.39 17.57 17.59 16.72 +gpu_3 96.25 48.39 96.43 750.48 15.45 48.39 15.88 14.62 +gpu_4 5.00 16.81 48.38 15.98 755.56 96.39 48.38 96.44 +gpu_5 15.80 16.93 17.50 48.39 96.25 751.92 96.23 48.38 +gpu_6 96.42 16.75 17.47 15.89 48.35 96.28 754.10 48.33 +gpu_7 15.65 96.20 16.77 15.71 96.25 48.38 48.33 754.83 +``` +And the CR example is described as below: +``` +apiVersion: scheduling.koordinator.sh/v1alpha1 +kind: Device +metadata: + name: host04 +spec: + # report the gpu topology information + topologies: + gpu: + - name: bandwidth + topology: '[[750.48,48.39,48.39,96.46,15.97,16.15,96.41,16.18],[48.39,752.65,96.46,48.38,16.96,16.84,16.67,96.23],[48.38,96.25,749.04,6.02,48.38,17.57,17.54,16.95],[96.44,48.39,96.48,752.65,17.27,48.38,17.33,16.8],[15.99,16.8,48.38,17.27,755.56,96.44,48.38,96.44],[16.14,16.74,17.73,48.38,96.23,755.56,96.25,48.38],[96.43,16.81,17.6,17.35,48.33,96.23,754.83,48.39],[16.28,96.22,17.18,16.88,96.23,48.33,48.33,755.56]]' + devices: + - health: true + id: GPU-04cea5cd-966f-7116-1d58-1ac34421541b + 
minor: 0 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-3680858f-1753-371e-3c1a- + minor: 1 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-95fe7a8b-bf9b-73cc-2903-c6e65883f3a7 + minor: 2 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-cd8d5d8c-7334-4c68-587e-04202daa38a5 + minor: 3 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-511dd579-5044-b716-e08a-841f51796a59 + minor: 4 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-62460a09-6838-abc8-00f5-31d2c6c101ef + minor: 5 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-2da27328-f395-a226-7486-c08e6e98570f + minor: 6 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu + - health: true + id: GPU-8137b226-b69a-1f22-4367-da110c8ba6b5 + minor: 7 + resources: + kubernetes.io/gpu-core: "100" + kubernetes.io/gpu-memory: 16Gi + kubernetes.io/gpu-memory-ratio: "100" + type: gpu +``` +#### Node Selection +Suppose a training job has a total of 3 pods waiting to be scheduled, namely pod1, pod2, and pod3. When pod1 is scheduled, the list of available nodes filtered by the filter extension point is [node1, node2, node3]. The logic for selecting available node groups for [pod1, pod2, pod3] is as follows: + +- First try to use one node to place [pod1, pod2, pod3]. 
The conditions for a node to place these three pods are as follows: + - The number of GPUs available on the candidate node is greater than or equal to the sum of the number of GPUs requested by [pod1, pod2, pod3]. + - Call the RunFilterPlugins function provided by the scheduler on the candidate node and each pod to run all filter extension points to determine whether the pod can run on the node. If all pods can run on the node, then the candidate node can place the set of pods: +``` +// nodeInfo is the current node info +satisfied := true +for _,pod := range []*v1.Pod{pod1,pod2,pod3} { + status := RunFilterPlugins(context.TODO(), state, pod, nodeInfo) + if status.Merge().IsSuccess() { + nodeInfo.AddPod(pod) + }else { + satisfied = false + break + } +} +``` + +- If one node cannot place [pod1, pod2, pod3], then try to place these three pods with 2 nodes. After allocating GPUs to the pods, the combination with less remaining GPU resources on the node is preferred. +- If two nodes cannot place these pods, then try to place these pods on 3 nodes until the number of nodes tried equals the number of pods. +#### Pick GPUs of Node +After selecting a set of nodes for the pods, the next step is to select a set of GPUs with the largest bottleneck bandwidth from the nodes and allocate them to the pods. +#### Record the allocation information to the pod annotation +After the GPUs are allocated to the pod, the allocation result needs to be recorded in the pod annotation, and the allocation result will be used by koordlet. 
The allocation result is described as follows: +``` +type ContainerIndex string + +type GPUIndex string + +type Allocation struct { + // allocatedGPUs represents the GPU index number that can be used by the current pod + AllocatedGPUs map[ContainerIndex][]GPUIndex `json:"allocatedGPUs"` + // visibleGPUs represents the GPUs provided by the node on which the current pod is running for the pods of entire group + VisibleGPUs []GPUIndex `json:"visibleGPUs"` +} +``` +and the example annotation is: +``` +annotations: + topology.scheduling.koordinator.sh/gpu: '{"allocatedGPUs":{"0":["4","5"]},"visibleGPUs":["4","5","6","7"]}' +``` +#### Container environment variable assignment +The following logic needs to be implemented in koordlet: + +- The GPU information allocated to the container is parsed from the pod annotation, and the value of visibleGPUs is assigned to the environment variable NVIDIA_VISIBLE_DEVICES, which represents the GPUs that the NCCL library can discover. +- Assign the GPUs allocated for this container in the allocatedGPUs field to the environment variable CUDA_VISIBLE_DEVICES, which represents the GPUs that can be used by the current container. +#### Works with the Gang plugin +This plugin can be combined with the gang plugin to achieve consistent scheduling for pods of the training job. If a training job needs gang scheduling, the pods of the training job need to add annotations: +``` + gang.scheduling.koordinator.sh/name: "xxxx" + gang.scheduling.koordinator.sh/min-available: "xxx" +``` +User needs to make sure that: + +- values of label gputopology.scheduling.koordinator.sh/name and annotation gang.scheduling.koordinator.sh/name are consistent +- values of label gputopology.scheduling.koordinator.sh/replica and annotation gang.scheduling.koordinator.sh/min-available are consistent +## Unsolved Problems