From f87db4b0e8908e97471c5c51e8d7dd4bb5699360 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 14:51:54 +0000 Subject: [PATCH 01/11] add node centrality functionality and docs --- .../basic_graph_centralisataion.drawio.png | Bin 0 -> 44159 bytes .../evaluation/clusters/graph_metrics.md | 22 +++++++++++++++++ .../clusters/how_to_compute_metrics.ipynb | 1 + splink/internals/graph_metrics.py | 18 +++++++++++--- .../internals/linker_components/clustering.py | 23 +++++++++--------- 5 files changed, 49 insertions(+), 15 deletions(-) create mode 100644 docs/img/clusters/basic_graph_centralisataion.drawio.png diff --git a/docs/img/clusters/basic_graph_centralisataion.drawio.png b/docs/img/clusters/basic_graph_centralisataion.drawio.png new file mode 100644 index 0000000000000000000000000000000000000000..418895957c4b20f82b361d19ca282a79975dfaf1 GIT binary patch literal 44159 zcmeFZ2Rzkp|38inPWC1%GaMs(mU--Cla-wjj=i_c?2-_1kgXEQmQ)&Mc0$OOoz4F` zp}Wuhz3=b${{Ft}{`?=ePvyL?_jRu8dcXGT`Fg!S(Hd73@o}he(9qEEm6hOHXlNM4 zXlUp-P%JPq?BaC}{6crrQj|q2@1UJSLlgbtE`QD4(ffv-gEbnffZXvXR(@Vvgqu67 z0GySd-`v@m$I8y!#?{=>jmOE_9Sni{jtDEe8+O)K$8-33`GvT7`MLS{b@&8X1!Q$W(*_e3Wr^cCgmBwotVLr%@0R5aAIJ2E**CN;+y+So!6^ zX9v5R*5H?-wdG9&>ML@#t_VjkqQEC0z{89B4-9IX-!ONz`)vu-lX+UZy4fL|P998< z2TVVH1Z#J5o8wVccV8=c4{Z?-HDLoWeMLq68y4KhbKHHLt&iu}AP_bVU_;cBZkFcY z<4LLXolw|8)b^U1tkO*zOL-MbZGB!x ziyOAbOF5Z?=lZ#}1|Tx`&kcTG?Q~5?Z$(#gXInLd)lceLd7m7JR}>r(bxJFr;}KCl zvEu<7*VElkPK_GSw(~uGloPtxc-UE4yPd3d{7rWR!ol6n`HzE^2q!0N%M&3uS<&3p z72);AX*UoKC(nE`#~D1VX#X3MqDJ-YtlWVVp~RMtPx$09P&bsU?QCpMwie|V zIXA$j^C^xr~t%bIsfxEJRm6eT(6I_IkTi}!s?mnjiYi$LHeRAo9Z~~l= zbwzkMS^ZRfAgi(nS9e>44Z_LX;R*ub42Z`E2JNlg-F;3(#oWUk0fubd9UV^Yx*a^$vik=5otQH$gK{+)RO z$6KHXbv*0m2LHZn{6#_(t$|%~cl7~M;%e<+?r!J#`zrjr<|o==^J^aee;_-*OCrjm zYaoDi1&05dl5{!4vOtD$M`=>LkqjSL!4p$iputD#R-yD9kS`DtxMK zrziN+suTae)afto=!6#kp*uQpTYnaH5)}KRJK{UNK9%180sZ+a4dxf&;}H_$<>eO< z;pG(-I3e`u{(oNWIgtdne>tOn+!+b|xq>-i(LZVr|LOIq^#1p2kAR36kJ#@&e~2ap 
z{`vraTJ;h5m-_t0`JBXP|Dp5wlc|r8;K_kcVmU$H6Q=(Us*mVNeE9da{PQZz$%FmN z-vIF7A61w?8F(Pf_eXyt@IN1|i3kMEX)Yne#pDqM$9xn|5lHcCzAPX=SZ0ej|d&c_Jv4LffXKqw39 z3J!7z2gEVdEg+yE#3zda`TmI0pg)9vJX!EW3T=Nwl26A@y~Xbs-9G}G{g(i!zbR5D z&+(f`oe=QVH<<7W!*$pOzN>blg5kOn&1wsBQd<+sBo% z6SoiQNXIle)$LR8;ZMhH(7u4;?Qh)vH_W#5aP|CIbVd2k--@J`4(4udc9vl9w?OQV zCEDX?>krWtFZcmp`C}XgVyM3#Z~fN+<=?m>^f#iZ)3N`ZGU#8+4nN;tr4|4E$eAFj zO9jQ|)4)UIM9EL%$N%Tl|1`J-g>!RMCHZtgXH*H*4Ylb%5*#0|a=O;PDI5y|iTVBb z{HU3z$NA6IhyU*S0-}G1JpZ}%e+JdZg!o4u`hT%x{5K=^6RGn z-xHPR7dVOHPBrwu-^z%H0iFEK%J837yH28`YSOZ{1l^7Q6}$=HIw)-Gk73J6Gnl!9oeiov(-L&Sp*kr4 z05P4+`7IDap+t6|S5W5UYmRnSRwx_yH<7GAMooX$M*TM;PyDC($*FxfZSD9iRyxH$ ze@C898}&|g`u`qzIt7ORS48ZJozu;K2^a{xe*5#cp-oY-KUTPYg}MLC0w(6|>Lx6z zE$nJ(YAI;v#C_A{3jDaS`H!&lpS(5I%G}NNS4MYiZT>3!0dW5RL*wc{+9~ui6#2(u zF@3`>4 zl;>~eWB;h~{4}fow950V4d+yO1pmkL4ndJW2G)Wo=Y+~bPWAOqt2`%VmjAyh4+x}P zeXfC!nwLlD_yr7bZSYsua1`W*oI_|rN5ofoQ39p_(OUHhHOKkuHtEO&h4tdG= zXlM**%5WJS@9T>u*xqEi1CoMv=w~yfXVqdh4$t-wYxpj=DcWL$>fo*@BSIPElSWGz z*cnWGv5PQz%p=Iz32o^$_+1W52J(x^w^dSMRu&L%yI3^l zW+*R4Y%m{j)LVGpeVVt|-^MjB`Yg;fC*5>kzpfP%^>Li6Jh;`1-L2L}=Mh|Qez3g! 
z)4~phLNKuzgbE$}>Y_)#P&{n;QNWux1fh5_JGX#|W8W_E^E0dqxcL*|B%cP(e135m zYpe~m(eZv+%+2sVH#{E*9YZj5KB6p#j3P;12+;!>$rjfQ6KyG`MqVO1J{cNwHXd02 z{mMj%N0jvPp}^esYqR!CoauYwYSc8yq3uh&rxTl@q6)#$FsXA9^5Z+}Gh{tCg-se# zP1c31RK3D)T>Y-3oXpbg&y)4*wwH;};5^So$w=8(&wSdCvYSf;_aX_d z=ru%$s<$4WdVCqo+}vVo&R#%J^0#v)>G*jQ8Dxq)%aGButUiaL1Bx7$jNrkp?Y$l) zN$M9X>X4p(plbtN?fiA$gUAe7Ob~x5CY4Cq<7mHXwL#Em@l$0PWxG9^V3kP}~$FKX+pZg|k zIBQmQ!AomN`NG+g{OYLd_7u8|ul8@@!4Q;r^)p7Mr)@%hM!|3jY@2r!WypX=jgsHha>O|as?pc3N&lpL~(oApCw?d6WX_upRBXth0eX#~p zhN|tL?N>_nT`nb`EhpbhSD9+1UfsMz9fr7ZtVAf`0nd)-O%t-S9z9&;Cs9Ik%cE_j zrgGiZxB61Tc`5pXr=|CVLdCs?Fl&i-Z#C}L7kbnSU-s|%3@OUQtcH>tjAv|s*l z;Lm)@jGr38T)b~(d%L+rAMUeGV$$d!kpRE8_k-1U>x*c>ti&vvxU1wdqrm=>loZM3 z{U6s3C!YstwBFIL^swfRIp68J-{do+MlH5;%V=`)5y_5I&n?X(kG;Ugq*Jb$p-3W- zjfqk1M4Tz5PIk`Dr@@n2nN~J9 zy2pZypd8nGnpQWvnY|RN8ZNrsU?LiJ7_x|+ae@o^RNSfzl)?8U70pk*m^*hEHe<2h ztP{ceEa(WYtD=Z6ti_}mJJEAMG5O$x!%W$!NXkq8N|lz_-5r_t9=}a^ zOCb9u7C!XMC?=nTEq=#$hEA2QZ@Ec=o>>M1sV>DzEgTEA6yx=B(~)@Irs|k6wJys8AV#pIm5l%qKbXogWDHQ&1N$PP!q5V8>PcAd}HPzyw1z05ouvsu{Fj9(`c4e zB3^CPpG#p;9zipIFj8YVT^QPYxYYzdgH2-nj%0@FyWYXYqo+9e>I(<8q>coi8h)22b7D2c^U?h0 ze4+qj<`NHjMY_VSBKuE0hW)g9Q-w_Wy3w!u(_orN_y$f4_Ej$H_jiRg^`l!Ksb%yJ zmF>IzBKHQ$0v=F&=3Z$=SFJ8&Rvabi?Z}K?(fZt#hBHOBzPYS^cbKVSIX-;jfx-;J z%4?^;%B`fkj$@2?V65gSHr?a7&fe$Gp+za|qU<%>o;tlZi-%e684q8((UJNMr+vRM zqh0;+^#&De0@`ftf~(O3ra=etlozkN4|y5iXH}($g#4h(yYl3h?G6k@Bh{4}vVHmy zpC(G&lcm;gfJCM*CR^oxXkmZ(60uB5MEYgK<#tmVQ?I(d=hp>29P|f@Oo@q!^(J7% z*ZdpYX5v5jeWN5}*N`T<=JWlg(-hjXU#nk$11_Hx>HC`O+&h*N_Dyk?Ec}S(BXwn# z{8#nMZK=9G4I%Gj?b=(e(#KI#Q_m+hOZxrz_|4@E21eQ-9e(MStL0j$<>}j_iTb~+2LDbnJ z#^)_&P3odxx)m=%WgjImdwjiDcU91Sk2zevmD}dq_d%8Ks-(_gdhH2m1D}hu*>-nk z9&*_AV_>9hDi z5`8}tR~^_5Cfc@BBQK~~AS{c2JfMy*NHx|~tSRzZ9?c&gH}eeqcrz@izn0Tt->4LhGriz*6DFG`{1iUx9>+3-D>0mEf z^5}u>1l-gwcYc3K_EC4AU-<{!A|apW?)tW)SB%79ik65kwJC4nXcRN1;>e=lqw`Cj ztKaGKYbPZo5!|kJ8hMceMPBk*8zj00H_cIM{zbdOXduEu-Myc#C0>4iKM9(do&9w# zPpv4+*z0xbOy@)Gx8L2^1utcArk1#>X^iWpAIM>AX~o`xPS1Ad+RgRkZ>)V&r51N* 
z$3MppClBhSojN?&(`(oFp?;aGm|&cRHDy#T&X7+cP5KvlI%5NVeb2se%kOFy3ry=4 zyFL%Drch2uX~ahQQc4?)sYrm$O9!s?Epe>gM>`MVzuBjM_d;CYUEBwkDUSBn6JBBP z*lfKEisvzkhjP`rKK-RKE~wD0wDawBJ6z}{Nx(K^x93}vw2u|;Wo~+wAC&uWn#O{T zG7~#-0(34@8@?K*M0`D(ZOd=Xju>gm^OjqDuBNz@JXZ{!-SF^$HpZXCb8wV3>%K9}B~P+*UNjMU^n+ARI~WAsm*>sG zBcmx zm|nIk9-klar#pFuWe}&%ODtApp404o?ig)Y$zWK2?Q@O0s5nz8cD!+J##XWx+zP9J z9*d<>F74ChTz=LHU!hR@WvZC+#J9eHw^Wh?WQ3=W5Mz;c({zv1bDZqW;^wakOH|{pBXA-QC@?LFD44hn`ab2fH4T zb7$6g%sPZ;zIcs!+&{8sA;+@0*}Kpg=i8E_e`)BVLi0mFQ~T*J5tqoBN(y%sH=CmI zeG-SBm5iZ{kJPvc1|9l~`>e5)nKq5uK(E((Yr?N-ItY5q>pa}asg72EC`lLAjKRqo z@x1y+&ExH+$Laj+t0v0>)R%3Hm$xf|VuPg^fB=82i>1|_Dvm#|-dp|HPQTXqt87x; z5KSL-3Q{exIP-1x_5cYkHO(iv+-NzLf1=TJmdwY=R9xB5SH2pEH__14CBI8EW(Sdp_ z?gTb}=>Wz9V8XP3-dUKTwEz_|lD)Ovur;bp$NUU`dt**J^0iU5(-*Bu9e>*=ok2py z^iIaO*vdEZhnP8$#Nt?yrSa%3_4OfbYajGl9u)Oo7K;%f#b>NijurXk01 z@(xxf-|~%=>XuOCsBvbvLb6|s7X~b%{1-i*$y-{Xi*0u~?tHMEU25@U{4_g%dGq$U z%rXI^T24X4BeDG92W6Q-9nfuDpeOB(pRC_si$7SE#R-naQylIh3m3BO3f6mky=-Xy zO+20I87w@aN#@t%z>Vc?wWQm0W}SVFk*Jr*`7RM_WxZ4CW<-h3G}_|f?FQxg2Jss)<1(eUUslAeys~nU0iNZ9@-^8g}W@#AFAgL zWT%Mk!F1nK^BGq=(FN?UzD=(=nC)gJE=z%$rt+BB?rzMbw-@|q{#=cq+^K3ln|$@g zr9eFz0rj`HV`D3$8&y;wT~oRiK5J9aNk_%08~8d~ognmj=PFq9+_Ay-q37LCR+^XC zfq8@iCZzAkNK2Y%5D`zMUwy#ar_y~xr}+EhYfqV$MryRzHyM2;d^c?8hn^L9vtE60 zZC$J{3ynn0IHz2=`2CwYujQ;)8|eO2u6-+uVm*QF$*i|az%n0WEso2 zVXzqbOWx9RMESxsGO5qyA_2MA$qv4v7>enm+vb?`=wtu zs&c0axcnsUk=x2RO2h(CB6i+wxqUDjz58MlQG2@l#$3T@#=vcXoMw)j{iW9*bnk0B zj@GBy_m^@|80M>x&ksC4pM!S0eGgjizth9$lY+Bnyon0G2-Zza8pgUk0Oo*PDUjhI zQrel4HxY zhXvUw#fI^9@P+2ifq&!}6IwD(1xy64*R(x>Yd=(xZ#(&Z+6Y24^tLE80J}Vr@O)M0bJ&ywBv*3V3W{Z6iHN?}X4_06N4x%G~k#b@s3 zja>S+(_38n`Bb8fA;evLK&CzF#hhxMo1f7vxC*aE7{%|oEOx$uz6fas2F`Zmg^TR_ z_t)9T#7iF#jAIo_zQAPZOZu^3ZObX~6vT<7J%lT|PCtQ?HoOj}>YbVgg@+ZZu(B_< z-Keh_ zbM4ILfK_cR_Q%b}y(wG^*(OjJ=fI$AB*b3QZ@ZA0z9(#^E4srE2GDB z66~BQJi2XVemkhVxa&NI9&&egbAf8~^=+FS_YaTUAPmz4TH=-wue%XVth@7^nG%jX zTMS}Tg!X99ZnQJe;)^&8b{W&Ol{{q%mgN=;39pD4x$)6ZRW(R8kLu}Q6l~NchFnE$ 
zjDVdHS@PsE_HtKhZUTGO84||bQqVOnZ3k_9F=h+Zg|D0 zuU%#+bWqkTpI|>w?&wG}7j?HzG`5lFRri-lJBnl+nIPLo@l-OsjSY?aIxlHx<@}78 zd&sL9=MRCCy*&WZKebGWXZM($C4f_BRlRp*>r0*SkY(c=iV((EGd&%-bL6iq9_GSc zD=bbjYHB`~=j+>1J%WL=>A(V0BLb`&n?jsIUW)KxT#%L~>JxSx?gC+dCXda#WY*q# zAGd;mxz$i=A^WcLLD%Aw@ziy45^6n4!m~%~Y#0j}l#UJ_e;d0{38+cR;wgEsp$)tC zB-=?Zia=Muri1Ko{LGiHuOdArGcMBY%;qN8etM*aG@_a+J6wNwwB19OWh4*TCD@IQ zf6!U#4cEV2^nEnwYUb|uM~Vri>{)XA^wV|9qzVTEf8iWo)ch0kSEWguQ@lAryIGh0 zZp}oMGby%5-LOSc6Pq(W$`Cdv-p*%&N--3A_y1V>tO#prK>-V!0d~m5P zpkl;KCA^7xK?%jk2FJtw9mgL94nq~8PlSPA5zSYkOep_+b>q9#op@6`J^!W$g-qC+ znyspQ3ZdJp?A9?}>~_hJem)s3DZ6U7?Hr6%!O;F zkfe1PC39G;GFRSR)z6(*bjE=Sbm>F2>uyzEBNVjDnmHmjvw&6Tc|V z;mpIJ)-7pU8RwjX!AiSNqB2`^-BR)M%cHEaR))L2>(fRSz6Z6IV6tF-h8EHy0_iDHs8nTLM|EGTcoY1|3PN;Y?jo1e6wh={SZA7urKXdmnGf=>Xu$h z^87whR1#oiV=N~fN2{v{C)e!rSRd_|xsv6P-oNkJCsbgi?-E~Z(YB4HAUyzWWhkC} zn^65c4-aHjsJIYh1FcL%-0~YFh>t@R)}&)+qTz3;ZI?h$I;0}V%*4d*GSRA&r$kIl z7tTnM&M1eKF6@}TDNeg^kISHEx)z5V7FBEk?}(?rY|n}Wzzz{h-{C9~$61D&PX+1) zivSEfdj|QOy*Z@Q6@!*jlTJ8xHq*<&G@8WV@Zu5dU~k8biRk8BIJ2+fQB%+nH=l>R zw}d)9QZ{_Fu~>&uB|~q3Fj=o7VCP$^pzD+p-=$k`mn1IO2|ey{QH-UMF>TN^94{>9 z_N_*YQ3TQ4_;_DNYla2h=F;=>#wAE^`EDu^Ro7&|ZlMYVCqtKJcvi92WN}b_)aoXP z7>J!~?&GkKB6xmu>(CY*QOaLBIzQJ+e9)C6_qqwdQd}{ni5XZW`j6=kLL1lNq1#9( zrm+9Evy1=H*U0lRN0XvKoNIhzR8c&zD03w`w6lq%efg@XS^{wDXt=Ia@t!&RL}ENW zV^!#n;N;iJI%C_9ho0vvg;<#)-RHPW^YusT5z%%Nt?6orybu}ZCu2D|IW3NDQ`sx( z30O<-T)(~V4#%auVu;*XU0gh z46c^*(G!pPL|aY*00S30hJqXBnzQNP{bsceeX_7P%PS^eoeW)FLR%~&Xz-S0Wk4sl zP3r4yF|S8jc<07|`=JjJb<@!tfmEPjK+o9G3fYr+%yxxLcVu!lVpq(z{Y21ka4M#s zcBBa?(jG`uM3b^X_~FXmaMrn-LTbVy;70y~=qxCf#*;DK;Fm zk%fGke|z1!MpmF{M)q=b^JE4UdM49&<$au`b)xoAM#7Eiu|QI>ur|6|UuW1oU96f8 zcR2&r-my(663HpbOX0l@WPcONC>tZHeCrMxs6$C#zDTbXy052sxHZKz&j6i?di2fIb8&cP_+_=SX$8 z&Ql@A&gx?Oqn+coimEP{-tmt@J1oym{4`puR|Y!_R4QhYbnjj+aO(1<2$$+p4@>$P zDKk>e?i_@@L%UCaUH~WeDJblX%$YrFG0Nb}9xvP4oAfSWD#kNL)?bO43XJQfQmEb=m7gceR*!`I}bLME@3^8+~KE=(6zQcsFgoE-v9 zI|r{}*uE%Ic}<>Cn*k>Z%02t=EJn=oJAA`}HY0f{=XV;!i-jeG{&Fh8RwSBL`q1W1 
zO@omJKd+kv+>MdOx=Pl{juvT!)GNx?sRtAxz*3Q1Ra`+QYZ~L-4+w#^N+iZxuAjpI zmLuping1me*6G?RTaKc$NJ>#v3*SMNi zB%eLHEE~L6nM|4jO>C=>cko%B-W_i6+iLY!l1gN1QF%MQ01^>tDd;!?>iW6cgW&B^ zGFevOCBesRwi8+)IwY=4%((8VD<;xc-%0;CmU!iUA^$B9XEZ~-Y0%qx-r=$|TyARm zSRHf}7@Cd~WBU!WxQg@f;b||x+UC~JNM^&Nx*l2c_*33f3LiOP5C}WR29w>XMBKLu z5(*tg%6rVA*>1J*C8vTVZvii02&xm{_ts<@nt)Zt)MV% z`R44QxZbY0$4!Sw+1ZM@96rrWBI(9VV&@1nmXhpCNf%8ea}r-Zx{MKp1Hs0=?fKsB z(<5@q@VZX{et|-yni<|GJm0{wmyV_P8<$=3&nRkHJzMc2web`)rD@8UTsU{JV&U16AZc! zl8Z{8^&1%%@Qd*lCuKBh06ykYLYZh=qEnQd(7=DRA$7Db;4yE-sS7#V0D|7zQpkF2 zzK7_x6wEO(3Kw0&$f%&j&qQbw`x37|YalU@XYrVxiYHKp^#{wE7Pd-rN5m1YctP$G z^{J-vbdNUpVOniVqzT>BlV9P})P+!*IlxQg6^>0Y=(2 zAZ{a6{-C$&=0FS~bFw+U#+|P`E~CEb-t?E5vTqemEC+7{hR5ebwK6q_jxkd;V$=EW zm&^!OpB3l}gnm|vdPVaFU#C>vRcbQ^qP?iBB^|2aD*bT+p)Pu`HY zhnYwwW$ojL0Y@*IumirXZY>Upjz=+;*wQ7;rX)7Db6RIsE^wxby4$Gv6Ed6W0IKOK ziWRqQrG_fnHnNZ#DSHR!O#~3U*8c$)0x-a>=_`r}^yH}k*S@b1OO*pvxlkN4K5@HB z8jbw5$9ylb++>czgqOpbwc*F*IKEX59s4<4kmA3eHi1Zge1hGj9k@B45><%;2J5cQ znmuPz&x(T1-{-Tm@DA%AWGL#&)O*!MHa=UVlpuaBXHsWXHkv$ohdFt)EWy{X*-p3Zh;m%Te$K!TyGAPXNpdDW=d4syS>Afy|wwPpQh4XJoW^1+MwQFvX2cP9(P300NKI;F$%th(CNPO{sF#}FH$?_be#?9PA{|~8 zbadEHSi_KM`vmeTQjSPdOf%rR`uF!(UNzsr{xL-|fVU)6`lE^L#*7q0@=iK|FPXSI1p zU2?TQ*@s@CIW!m!mr+=)_3c~J^=(il)P@Xnbic6|qXfTtq1$WX{o=8A-wVv8p| zM~q$X?#a8I_Ol|NM}Z+~Pb=n{M@8`P%}UY7QqOPi$zcG~L?IAd#E5%8Qs!=$)DaJa zOOTTO9*vS~Vl^8y9P=AJ)ROqj@;|Kb&0VZ)}9aStlHXG_c-q&)UYe=hu? 
zA0M*TV?nP4+n6eTBi|R-22Ng62J*(+-CsJ$7s07RNlbIkVr(ssvFYsOTJzw&CA~q} zs@O>2$_!$dg+BIs7ietPWMnjw24DtnV$X$Fk|VK}C-CLpMi9zHWvlenC6|Eoo)prT zRq0bpW(xuA9J8Qmc(m*=T&WD#{{|z9x;J^o5}-_%!-JQ%ydWBQ^+8hIUthr$G}?26SmEU9WwQ=Kh|xFxQcA&j41BG-L~;sy_bWzRq<-9=sexvKM&@8GnYi6&Q0mRIML^*yUJ4{{&J z5m^^vB^b0i{errki~w4qpQ0Li?}JZYo;r;+a{q8T*xl@BOtgno7}mI2dqsZR+b0k4 z+ltbA{JdTm@mwAqUMY=^YzdFf&X2njammL&^zMVg2|?E9cgV;N$_=YjAX;WUd`$h?{A!;_-5X409Fm``*r=pF>wClKx7b z7k}Z(eKWP8#00N}e(t{LhjoXt?Pw6)7zdR}21-(h?(I)FI-9a3I{dkL&&K7V8sHsX zV*!f5@q1DRHg-NFLgPq*!rEY^7V=EC1e9F1j0G=W*2=)I>P))gu)yC2oQoL+M@?Q3ySA37%Uk|0a~98 zQs!Meno4mvMECtsGOdiR1hjbLMovoDnh49s96eJ+*}jJ$4-(ilFA8G%ciFNS6~*$9 z;f$M;H9BxK5>58{JC%{RFXz+NuD zzD>y#I$v0aD@Y)t1^N!|;uUQh=3UATR+-nZ+~6^7@ZCf%Hyw?}oPoTP(QEI0Z?Jk{ z9iIa79Kp7`dlPxtkL&1Y?1<~gW}O<~rwv#_KU?(q^)q&f^i>SC|LBsKE%ljJy5D@y=-oi|U-a9S4%PU`Bz3yc|bY0Pvz#gDna21>ZrUec$b_&BL^t?|~O8Ibz zFoj5({HVej-gZ&(v$pU=KMlSHUpXgTaC;!au#yxa=>`;DCFaGym3+5Wc}nE^T^z& zr#_x>wxA5iR15Ryd^HRVDl@WVKj-rm32r0YK&gC{LsnI;z5@qIc~I_D|GM??ohDtZ z6WOd6VTNRYxW`=2>mW4eiXP9UVFTtOd}=`=I+?+bk!Uy`&aqNXBcd{JpdmIUQx(=9 zTKSIb>iTcb=HrJNK6fy9h@n-Y$IXaEAa`{Uz<^)1gX)ERyuDvAl&o#w`d>pcz`Wpa zLC$RP8WR0XR0IJlHGiwsZGmam`-RkQV|X({@uHeCxN`9{;?~*ZtQnZ-dmW6RDATP| znCY^083XjHu-axZT|5WTWeQ2Q0l^9dM3>MfX_lUngv0_9zq6!iF4iJ};T5Y$Wl4Az zPewFRTXOj9v>K9}Gm2V7xVt5`j_gA5n=cZtybpIbZ@O4PWt^#wT)gT%7}gjzc=Hqh z8vB~p?_)enaU7I%C!&r37Wljd{J)Deh;*hD0q%RRi<-Ps0XVl5bz}?OtlpU+&Z+j6 zeKB>?80@I>3AqR>Zo^8V?!KvacS?eV3BPKC%5#)f-jf&3Ke#jx_a!|d*{VSyJLMsy zo%p_h4S~nvpyo$XTG}g#!7h4Br8U~(56~1Fzr9a+7E^ff+LK@a6qL!S53#BfHLpq` z-582o+poqB6{w$=(bCRWVSOZhy|}lBl!K9>bT(y4?F?Ur(9L`INp6u(4Lw|%?<)x{ zeXo6f)NlFnQ+0Bf&Wep~Y|{@Kk(CSbw+QJN${cE?0sncPzZ|*hzx_3`pCa5ea3{aY zesHA36X9_Nn;?%5F}(5BmRzyaVOj>QTRTxFk*?;e_jw5U+k<3A5fV)ucVx0r5|cc{ z+_A5i9ArOmZD)xT>dbCZSNN?Ju?$OyhC_d$CbRwL=VjhssqVLLLm7_9 zAmlrRu5u7MNMGZc=~cX@z3&pcpw?;`jON46Zp)COz}bSM<-azi%qOB+@YQh^Oi7PGl*-fKlf~`n459_lWW$wKMv(ds=j{*p{wHiL4-&<~gceJGqQRZYKy5 ziM-@g^^R)cObCcuqnHDAHa_5U@PcSgteoo2%VmP7ZV)JxRZo3LRi}NOE7CA5>@fJ` 
z=%PP8EJFT}HSZh0O-Jbu!HDzA00*`>4`L>Si7IFcgTsO^$vSO^@4mvwR=L9*X`TQ0 zI$*RWWn!LiR{6Tq&f& z62zBoWEZ@0(X^4PjmWT~*m2t={J{gOS+9#lo`siF4c>%}WL-**VHfEoA&R~sK!kyg z4p+}2$`q5^81=()9jPzp*A!YFowv6hd3#;|^UDs~D70GTkUS)3%1DFA?RdPwWYv`bWm5z2ggV@oMm?-h+ zDp7@;g%nU^Soe8`PC3%w&f1Jygtqnu-!?ILR%B*o;YNmqSDqX;vc@f1qj$Ke37uBB z={Bg;H@&TwMU~iy&>v7Rgw_Tg_+}U|7(haCGSy7e@@RuCZJ|Fv=RihiXsCq;b`fTq zUy8fz-sXa$Wp`1e8Zky2GnRo#z4y~iNu{-r(*6>C_3mr_Pg9AdYp*s1!k0diwx>;g zd*>sr2&%TB?#8}qiKU&z#0zrnn*cZHe@QLuz>X{_(k}Snk3lbE z0y%E_kU#2^@5}85wQpKFPUG zf8=PhPl4%vtg&T&jPnxBy{OhdEy|-Qpz#i9B5cH51idIC7~XDs+mi{eCJbF>OQMzPmf2 zn7Huu;23hw;i(dSmmbg5#Q2a|7|s9-Dns6038u9WKF=H+LH+KI(SxWc?>)r%x^JDI z1T;5izUccbwb=phmPSS1ilZ1~P+_I%LvciVm0maQyp&{4Pdz-E-|_V?5;;!N*L;u8wYjN(9-WCHJ`+7 zyOHvk2J$#q8HB{!lY~rD_LIHyiw5V#TeMYCl58U*1hMZcmS?98ep(u5|HKeGSy5NQ zbCKOuZ>U!h?d>>2k=OT;r{`$tBBV9er{3ir=1Ec$i&aNWD`OouDcmi%k|qm+^a9`- zV>U>IJ&2m;ptXSpV~LZmNxzg<&I&(@e;ddf!=m2zg0DMrB6=hf zUCT7@)C>UFmaZYkK}VOFLNSFZ=4`VT_kp&00n1p!wf7TPMNL609x@5nSv-Hr;J)Q8Pn|$gMfyPt}A}+$6Hhe4tN8nU#!q&fZ5K9*m zB8vdf44Ph8ZZa?K-)NLo1^qDg(d7?gm^#Y`4?ch54=!+ermvx)8VNkb4+Vzb_8qd~_iz zKEOk=b8%haG6SeBWYtZ0i71%j!c@}WUFP66#x)I8*y$!9X%>=wkH^GtCmX9yZnU2i zheUKuF+Ge#PNlg}GuIZ>LN`Ev!s==z$%^p{3g3pPa)76H`+6)5SK0NK8az58`i607 z{(41_l;%ta9Q3j4G+rXs8g%`p&Wvgi0%kDwN37dSM?NsH8ar#>>^ie9*IYsB=38Zx}YP;e9)o)On@J1RkmF}UNXiQ6+($%wLBnlIf-2TG|+swPFZ0Gim4 zi8wU@4_&V(Je0)THU0rg*76|cc|kaZ10i=+J+x9w7ko-F{*}c_3aL!}859tpMObYi z_-D@&e|a)!tcd0_>#g@K!uCgV$QhIKw;or@n;rOq(madrCwMsMZMR6l{36K;5VSg> zMJiM^=yo%}{4ix+e;929jYq3*mJv)pfDO+9P`csQ6}@g|NfzlX${*kBmT@X@IJmBw zx*W_Yj7lL(Tpdjdc^GQnw~l$xzRSu++TnnnIg3x+sJ5Te#lz4bpJX%;BQMO|Bt{2i zG;qn-$2*=xG|qq*+3rdO1=hyI++Ye7Ofj~-{^7N5nPJLv$Ild&IF*J9I!_q#f(IWk zIWG+h$>@CxcUW%N%3>%1(X6v?(<91ky2V-#xbI!p&sDdfuY03&KaFscCBKun&-jpb$*jTgv1!vIkbmMNS5aEb^M~P_!Tns<$P9|SB;9?W zhtL4ga^#XeP98SR4xQhagU?Rim6Gb7`vhN|^t1NOQ06hcAnq9pZ6{eFe{57edVTF{ zBzzlym`p_u5vW-E>+@K;8xRNLU=7hq%AoD9*g3d3B#8nxS3kYlSotbPEOP+(>(HOh zAcFzY`_TR5CK{&q$<&Gf@v>`A7m|#GsL2A5TgKMtb}!nF%k$1mfv7VD>JKTbJV3ox 
zFM;tfFmmm~`eA7C^lE~ZSlzcN5lM#NeH)LPONo9PQM0=qOnK`{kC_#tR(wBU42t`t z3&IoA?z9!1f3#~`P26}6gyRV38H01^eMHic`4g{i8@cg=NR6!)nl{vRhh9@BQMU%5 z>dHjDr3TN^eRg;rq$p~y*4^$!SCae6dO&0PEsyL7OEa!L8-4%b5mAnW z#p=YpLOUd!nyEw1j|QLxqY5MA>zSf1vi03VTM&`$3cTe*&_Mzu>YkvTR<}o?^VgOL z)6@eD=lTTp;sDN0!z%l;p*Y;8!nhb+!f#x5Z;K>dg!9G?Q&7y&Mcv)AWPHHcDG&vN zmW-^RV+K)P;-_6Dq*DRS%y3Q3_{d`INOpC4)mxjW>ikaA>gD>xM+|IvPvd56@3lrp zS6DIP5IpyPU1Ft-tgudUU+N~=xp9E%^*i#*<0M>tivW#Jw_;3eA3ah{y{dn>QzC%s zMKDQmriH%B=ocgtz2E9w9}R`!Jetx&;YIT$j|)LB{sn^~-w(k~060~B9n1>eUZSn0 z=>s5wT+_i1ngEa~%0A7$-ryVhqM|__t%+F)Gw~2>oK2}jT7+}zCg*cbeXh*(shzAC z?C41>vh}a`B|Yp>#}5k~CDBUM4t%Id>hmfNAZN}-(-M&`AJf^?>19QzsYW4b4BJjp zi#AzeZWmA>#I9q#i;V4;Hmqn4xQ5GJ4v|?GrcjtvzwnxP-w(Q^8SomEVWd2PEpQfn zS#LNeWpdE?udu&K<`NaX4^kEd0OLLCWe|73K=(HTaZribyRH$8CaE@cJqJr^l99ep@!m&_9L?HV&9k7s{w(Lz)3cJcHN@ndQ49*=E?#{Mk-r$4Wqkx_a8l9!bQ?ZiNytss0YHgDO1$Dj{P>$u^1kQ2Z*KcET1f+w7 zpiv7npC^Hck?QMqoLUf6TG}LeZ?*V2GEyU0hdC7Ii+W8C+hr(hjcjN_0j=BJN9NqE zAMKBrY0jPFK$d7fx{|}3$i_snQ*u6FgIJ2*6Po=Ec8HaYSz?t~e#W3oD?et>JLJjL z`_h6w6|61us&u+r`Y*kxA%HJfL+qw?L**tCp&Ftem8RYfyx6e_oVDu65J^C_uY!Q_ zM>uZG^sPY&rovhIiz4nC7&0k6YGnBH1(cReAeCU$OCY@G8fu}SludZs-?(Vu-noIo zH|Z;Wf}l?*Bc}@=kAsvxjm#X=T;2vYRZ_u+0Yhn)wRF4a2yoBxSBvj*zVO>}8r!-j z%-j|D{83aClvu_iZzo12u)d}-%mPmsyalb#?=PB!mwZ)2-s~AeuseE?9*04VYd(~} zttt`}O*oMy^ha6Ykw%91_%*+oir}MKq0tL((2xw@&Sm&<1um*L`obl5=w92xDnEwk zb9pjdt;EDfic|CuA@G{Z^&f46pJeFJ(~Ams0YijiQB5(H7ggk!lL~N6%%}dp!rnWa z%KwiaKR7r>IWkK)R&*+(tdMLXna3(yNcJXs?^PlqtE?ic?2$4mMMn0>O!mm;`#kUW z`}6x<*Y}@qm#aTqb)R#;?$>xeAM3u(@qRq!Tx&^lC<%qSazf5ZH8iU1L9f{vTiQ?y z)G&QdL zWYvy$eyGgskPv4WJSf3l@b~)T2*RK&8Rh7%8%U{RCShCahG!JfX1{p&_@A86?&AS_ zjNHaiS5`@&b^wolcmc}|?+A7wxV{?x`n~pV`CJvx;$C29blA+yb7 zLv(4Mw`)aHS2)B?4lT^pFlqSA$f}Ih&RqHe>VNgmYww?rgMop(JgPKFIt)GbdW|g! 
z2uUI+S+?I&ED>>-$RHDbQYP(F-e|l(0ICgX&1)7NCLeH*JZ^0qItfZ4Oy#-R-wje% zdv9a2fBAy#V{4|9#>v#g1T!U@+%7Lqf@Gn_CE(=rxKt9^LwB27Ge}QWXJ5dhO=Kd; zDIZZE!w|L6Y{qITci)->VhVUh6PWfZg0pf~CDP|nm<$;sL@<4=!RniNqd!(5{wGH% zukK`E<7xw$G_?+gmT(*a#N^+*M?-wMVS4C6bn%-(%(8Vt00MwaT*lW6;>qQfn8oh0L6b`)Lk9!yjoT3)_~HoX>w zNjXETEm|8k)ryehdHWz-%F|Qy{@2t*sPv`Re|^e@{#~x)F0Mdby6%>*#8olrb){F>!gH0L&`X@%d*T9P~1-xF9mT~sm#`Z2|aQvj)FyL^3hg|Opf8P3d@TVT!M zh)CD(HNbF(FD$|_^!lMKCq13DyinM`8k{fY`Iw6b|hG-n6h(gO->oqINVZ4N-F)KJvdNv!xA8 zxR)DFjt{m#B0F_G#|r8=fkEt`p%rS<(+{57udCJ>r_YP>58MUTMGi^#L&!OJoD+o| ze?C+n+$=KTW8Nh%A1a2T*|;#|6O3>U?FplOzz>J$OAsP!fd|HSIwr;P)5NJfLX(E1 zsS|ZtQ*&e`-qNMhan5m+uj%lI0VC0GCRZENYpj3+ufC#^+?Bda+1}>)*h$W_EuNK6 zX%R%Wc2X#4qd4i3n{CIX5!7~GxF+!jOylc)CzgOeWZii57tQ(Mum#;idR;y()uMY- zUgQ!|Y><(l3cQg;!Z{?&R6PU>hGAi0QFKx^1*+-7B!;KzmMIDHJTX7Z;M5f0m5;u4 ze)Gk3m$o^pZY`wrmAmg=S4{2AV>uA|Ss?C>(&&H6pf!YJ;XRGaG93j9OUnfGG`iHP zPbu>+ciaU8;?2c@_ULXZ{FYV0Ah~UUcTupSbtOu+a&U{k9uHB+jz$SNRcl*6VU z;^ZblDJpY~uI2;(3=KzVYL^}=YXJ*lOkrnxP)M=SzW%jS45f0LLBiL&Jgot)U+(`v zZ?}`a20a9$yWvu`AO0MC(XjT6yTd;n6p!e@aAB4>AHzcA&a+mDGAv3CByT9J(jq)~ zySAl~_obM^)y9JotjjuPg$<)S)sZez%9CYw?E+y!)rZdHK3CaB@(vK$4q5fRHI-FP z<Ld>$Kg%gR!5;8j+w03`T(J%rd#j}_GC?P_Ks0ZB%~azmx(~kgIfLs`nKBpN zP^@x;SxM;ctFoQ{7`OS(vjnT`2xgp|I^fa|uL-0{1Y{|nD<0pDvT_MQ6dgJZ69nfC z0+e@_hlVZS2@}sl%q)qsnq|)?`8e?$z;;?9ir8zTcrGz9-9gNovJ2a;?$ZjapH_*wQ z#s1LA=;mc=}gkd$NFxE)ZFd(apoBVjCUBQox)i2Kz!kK^;ZXk#yg=8{XLIyDw1%fl) zwho}syKf8Syn3aGS?h9QU)xxA6iHFvM4V-<1QN7gI`c<#D2vrD z+;-btGhN}G4pnSD*7JbzL#p0|6-P~I-MtP1$lfOxwNObenyse`WHH!MkKm`5I~Ef; z6knIHbCGG7yPBDsBWGYY+OC){3t46+hO(qi5osV0YVQ4VgNcUS`Lw@eSnrap5K}9R}--xMXMrX z%s?OkM8#c^XQId)w<3;ex8Vk=!lYdnwJ5$2Q}w=)60gn zppupORGm+<$2g<-!Eq~Qfz+dDgSLNgRWT(8WOvaNXSkF(;|^;SYu!CJJN9J|oJ_=~ z&P<|?SqtNz+E54>M!&!3wLt)G;6-Le^rQ8aKT^u*t?q`=@R}I3NbEsTGUQcD0KDRj zuVPNWLeTRH$9lmer~1xiwkNU$}d1684Gw$(mhN%D~AuVdxM#BIu!sTOJtj5z$^UVM~F_z zy^CS8^K6A5^xmv{op+-SK-^TqZYa!>l-Z*Dz&&0p&K5t1L(*T?lNwa%MB|F|%5rpl 
zCp!;G2|6;bFWQ8z2EJv_*#qoq|9k>9heCwlg&Mq(o#fQH8JCGcf(g8LT%`Ju?*JGd zB6&rUn8jR`l&H&oQxV8DI(_9UjR;Lq8bL^Z(iJx62-qUm7Rg1Bkob1Ok@9UY+HBP= z9+4^X{99HTGE*A(OFCK;@c;hVjMH8CO7bh_X<1TsGkFh=_FmZ@1DwZ47ES9a@w^!R z670bID-Jp$YHc*GX14ROB3k==PuVjff{t)$1JqN{-)cy`lqV0r-u(+#5ZeqL3F`L^ zbzibmr>aiJvp0fP_cyx-p0^8nE#s$hRmow>_p64-jVgg(>sxr#{Aek3E>kMq2}CnF zAI>Zb4s8V@;{HmjJM}rRP9dacIrzZ&Y+#fXnAz78%MEV{+G~)Qi z9Ni+CKu#bbJc`4(Mw*lkNk_P5J11)bwM&R_3cnJO=}F=+w5E*21U_$%L48)Ac?k^Y zXA{ZsT;L_`U2&K~;AqNiZuhIjm(e0p({~vOJgB8Kn2x(;zpLBBR09p%s`twU>#%y^ znfGNEWf3+09c8i}qJt32LDrrI%UT27OnRxqAp8EMD5oAqq|H_6=XZwB0AysDtGD@0 zd=*uW)$LwOyC#vS3x&ucU8>g^e?`QbJJixp%a@#@1!Rl{xn1#HvR6n>#9^HFdL?Ay zyaYwupQ|aVKS3x4xZ9g^i3^JhtAvL2Kd*LY|OZPaaaw z)&v!bNcozUaPYi%n|o)re!*{YS0^Ph{Mpt($uF%Aj$an@sfE)-Nj9`KbhLa*#KwTh zz5V-Yrvp$cZnOZqDYdJYP54bZ8toNDtBAe5Au1n4wmi;i|y(*Jn0EgEQ2nw$*v z%4Sl>H>bSDm54>%7N59b5AU$mM*W=xbq5?LUVoExbFLxR3RyYndX#Dc-v(rTzH7Dk z)h&GG5R{tQAWDBI9n5kmWIMtf^u^t`8=PmO9I386pnhg)j*N+t($h;-A^?-wxv2?f zS&p*@*7-Vrd_cRzQ*Kc0nC%uN_}~Ysyl6)d`-Wd&x*B-<>S8$=B?({|E*NGsri9Xm zcF0!U9F`{_MA-nQx7`pRT7R(dg^$e8%;Y@I&5o&OTTv`2B#LO`Fy@TBhh3;oauYiu zXBYGz(enpDc+)a6^2qHciY%%I*VErAm=T8R4H8%iM8+m-1o3@$!UXujWQorXJLkD+=E8hk z9Mnqhjv1Xl`4DTccGb81>FZ#pxuSPQw0%ePxy>8F*suIa!q&b z!X{b(E!6<44CxPRuTDvyubwuK+4&*!qKAwk-&b9{^XZlJ#Y;zX=?)(!?$?;!$(wq6 zmJ0uwBNd`I6kJQyvtRk zg6_{-q_Xjj=zgflG=T&>GVP2S7%ac9Vo+`oSxu@;2Fg(l*PF&)QK{5{!24-HE&3ZI zU;6FlAoq>5cc| z9D-z1CdY1bu1ny#3^sss_SMsN(e<)YHmPaC1U=#Zu>G0xaOiuux1z$Ql<2RA5D*aX zPJCx*lS8~Aq?RVWm26W4xDC~jn;rt zb2urM+*e|qz17(#<^;a{JM*r=^UIyJhEYwX|MM{9tt&=mJ?RH0HaAz-K{~>fjj#ovIxt+@0S4X zX^LBMcsXY2ZeG-LR^eIHq$s(DwzbLUH^4fx2jRYN;J~2Ff$)7a zfC2cFW!+Eyl5Qavc5v-=xjOgdGld91``&LGDN(Z5lApaIv^1Hm<+}BetJY2YIeTV( zUO|C12!ADsIn^Myjzn{vy`}(i9Bppj9~OY*4)=#%rX?7}eWaX5>xU1vyzQ(q34Ji= zwO1*PH%SO8bw07~!4a(iMEfBjLuIl&0ZvXXM2~NWaVOetUVa}%z+;_dSky4JdP>_K z{^84xYl066^bO0*Lbn@%@{epX@Ah!|wT8Bw)(~D|E{;UwGTbwf@)JC0`YJXkYxc_;M#YfGCVfnMkM4XY7qME$sZZAfc*TYlQq+A_q z#3O)@{1`D$T~>f#n-L8owdbhwQ1HpAv(0je$bAI$MBleL7D7xx2Xm|LAb@&gUg?(a 
zKq32DCz6{spOY0K@|!j zZpz1_`v~Q9p-w;h$1@_!e(;>KDYZWB_p>9X9uV5AXB0`FH|1i#P*ik7MA~Jf|pwDoSJ6JDFT3i?pS7v#Q|d}tTdc8a+;Fv`0pvUul4Rm(Bd;7 zv%HBK%<3JnBwgAb%b>Jir;{4@MYAgX=`Wsl>=-|?`mr9}vT|pBtMOm69owDKYn#My z#@5%>3kM>$ETlET7i%Ak8r}}ftKIBO&vuIm@YSZ%e4t4~USUdN>U^u1(V(#VcBnfR zzk+s86ZA1n?oMb06E=ev=_9GbOvVV)xSEW2*Q?^4@<(FHy$2O9>trC0ERXY6l4!#~ z39{KXQ%WT%wlitEewvRF?sO(osoM-2H`Y_S8`QyTPn~<7@qH z>3~{tSwPE!{d}SD!-3nB`<>h`B7_~g4=gx_8-43KWXH6I0p zLVPID+D`j(+=ZfVndHpJ=Nl40*}*HJ@?5kmVPVb6$!Uh14QvIusErsY&GqmI5XUKL z^uvrIFDwGNb`4j@@{M@I^xqJ;CUb2Ys*_(NGli#FB}c2$_pq?_tO0y09u_kW;`9Y* zEQF6a!Sh?-DU}2*m6?E#{6DF^zKvn;wsVe@PeH}>o%QUN|i}VNLA|o31kNlOBU=xL}Cn9Zyf_qbqawbK!HYkb@!#WP3fJt}hNCe<7rzm01A;*cbabJkiN z8&&;%^@Mw%Q?ev6dEebBwtz*PPT_^Wi&WY4aZ>STIox{gR2@^oW_TPE(1h7gsrJH= z=QAI}VAYmLsYIAGIXm!Dg+W^L8Na8~wL`}u8tyxTv_YyFV##!zhilg-xeoaLOE`82 zJz^afzs2oNmGkjdug25SZ62+^22U`inoI%p(%S!8;zgS7(b3?7k5+E@9qQ3)X3G+j zi+{C6-)$2(fjYtNY2+vyT3ap!HupEw1%`FPM^HPNrD1VJ=ovdGQ52f}(FG%Br{(1= z-}3JEHD=SgA0%v_kkN4E-@Y!HbGc@*SNs;?r=3~3?d^v-;}aiwdpzdHuhshQJTsH} zFCm$y@w>GM>_EeDdx1^<$9wsh0!_2JN{a}=;4ERAz78t&Nv=hLK&Ztc8z+*OMI8f_ zQ+VHy{v0I5FAGA#|FV;ZzPfIYo~dyQQqx+O*kzWin#U}K60ne7(|l^!%xpTL1@TOvPUiS7gR?mBZYU-wNy z5R&C}rHfEpk-aejxF?Tsqff}8^&mu_SZ^)#c)LII#V1@QQ>O|%ymIB}N@A;qgW+*T z$x%a4=FJA0>G>N6d^w%w22rcQ1p_uK)yZ3bPJWvE?Oh$VD^36Dcgs+$-{ruMV-%Fb zs5%R0)@-YARFU>;cHx|AKs`8TW+7sK(TO^1aSer1}>5CyDcm)I{7#aN~l(V4-5B z!re>%Wiz|7a2P{<)lp|fIN`tln_F6QSv*7#_GWY+=w#DOlIxLVfq0c5_(?zdrwcxi z0T|*Zm#DQ`jPR!3@GybdP{quTnAVztqxSkxRwLtsgj;f;GPK~#m55JlTAZNRW_NJdtdSbGi_N>U~$fDeWi{$0y zvkp`1SkatCfe?uRZWJa*D@%$XsGy-|8NZ`P_(SI-tkS|toIqHAhEvI8>wftOFUE1E zgIeQyya+KP_RsbP!R*%WUhRVJ5%OU`vEsTb|Biqb z(hzN8d=<=30{-8LIBCh*5bec9#u1x@FwWU-m7gDR+pG*&_%a9Y|7w}kvJ0dktAnrV z9@Pb(o5%$N_{VVTMd-yodMaF=3-~TNm$IL$()^z@#P&E6VUUnhm$o%UY=eMh_V*&= z!u~*Z>)rJwIF&cAE%q``b)F4;kCI}(P-4yF2k3-zkI(fsKvdAIX|r=miv7vHW#2V? 
zS~SW(Fm3Eus>AyEU5`p37774Ewaf0>*QhHOG(Id%VoWs&q^ z^%kn2s3<>Sg~=~`O?iM`N6k+GO0;?L;>Sc2D1?4Pt!ZYy|94phSjKr(Gg$C^Y?DiR z<3?xOnBOP3#L-C1JfEIjm;g*5MxfHy3adM(^Fu}myC{US%c*BWc)W7H&%Jw272V8d zTW0oY0rc$D)8A7Cg0p@Xc+@#fhhYl-%B*xt5le?&fHDpPln2X9$mjc6KB0GjXnHVQ z`M_)+NH#Qk<7)k8TMsT40c&)JJx#z|K{qVYNmc3l z1J`2(cln)xLi*cBwby@Ye9D4=J1rrR2%CSS-o|M6?thG~Su_r>3lq6r@AM&|$pfNF z#+n`kW&PTQ6kH~`9#wKqy0MqA*WX}&3k(l<-O~E2AQr*7&p4jxljLuD0*CWo%-365 zTAUPQt86H*J%Yf1rU9K(QdH?Z|K>yMQk7UQ64BsPr`#OYpdHY!+gV`q;48slN~yq_ zz-RofhO+-Lra(T5)`<$)7)`x)hEN?N3gmg}8whi#gvb)UuCKlPgl$q%-`h9FW!_5v z(WgBY?6})|J%gKHpXZJKVCQc)%W5bRzPURAA>CJmyEH{tbf^*W1mcr2ylz$glXp;6 z8r7CAiLx21ojZAwU#DssPRqd$ktfhSBvez?Jm|jN1+>a9CkSq8?m#i_>5bp5_80di zu)9(4;&ID=3DWgdJ}dNLq7stj=os4s^0e=`P?ASVk>a83|IT^{O4&svd0q9kc~sqBi7krL4|F z5~2g!rQTq`3VfBYYl(J{gaNVbApw!A^a$ZQx#Md9sm-b zeM3DSGOH@ep?*LGMM*9Lq&tiXkYjhC6{`=WsJpP5^D!Js;wTRPi$6k6JHlpx&xZJd zDhLXwF!z$X1&z?k`n|@Yr#pjA8K=8>EzTJ>5PPH2 z0p}7c@l+B$Y3}8_6g+G+*(&=_R`lqj<)&c5(jQB#V(2vb-E@c`dCweg-T)>GY9#KA zWD4)6bkXrqh~nz72Gc&f|XGrFKP}AqsZ6nNn$R}icOq_gN8uQd9}HTT(a<%(p-MT1$J@O1npzIg)$JZlb_msa@m-Xc6Z=W0K(+K=EW}ZcOSIL({`{EF z<<%DlC7+7ccOJA3sN-H1W2ABUHt3V1UlB$0@vjBifd0vc#(LykSKAaE+7v+sAlfm% z9MT{tM2H@e^v3?kjpJXK)T~!GYocx!-X_X60b_qD;&P)l9ddUy8Oz_s7{XCBJXtiN z!3PC6djX(p73%_^hYZTX+e6+rR+t|LBNTZKtxCt>r5Kl4)7^P=ztja|CvxcteF*dD zlb$-I78FJ2+bcqvDlO9OV+6H;&(cIw6Q&Tbx7zI&%$ux8(fKd_f5qM8s(3z| zkzj|~#876Ztal^>?*2WqsXx4~1q6%ztw+7)yq|$xkVE2K2*Sl*AQ>7{01VigW516$ zM;0&P=x=#LiK_EmOzj(BE{fF>e-b}iHEPO|PTh^?v&KfLZs_bxRZ~)HAGrVHYeUlU z0pI6jd%PDS|^uNdKRhx37A`u*~2c+9>ffh2iqc+s}T?Kl*n1pO^OFMIa_GD&r2T zX@>OQ**=TN0cCKTB^Mxq^vEhe0LgVaQmzt?%V%kfu;S=C-G4rccvzY?F*G*f=3UmZ_8GBp1%7utY^2!;fJdf`hv@FodN0XQn0Oz-=UaZHLM6C^9P zP0meRHxY4x#9ye&qySmM*IVGfmnQ?I$45uu|lK!a#}>z@!{ro;Y?Bz z67nE?Sby=B^%{Rnz&n)SzYSuf>=*U4zDRS??R_DQkB?JaSAI2Bn#{`Y@AOc-Kd(qy zEJPt*<5qf3#Opv$Qg~Ti@~N)5h!-W)N$L~;uHQ-MyDPK-zCDzNR?YVBUd4ny*hCAr z@2!c(1iV63KTh9oFE@C?qcOVox~J5*_jRa$?=PX0(GBA-_izX1UI!gKtY7M??*Eeq 
zntC3t=w$mAk2Yk!8FbbGBvJ)BK-%^Ls2@J|4^=}Naq~~qzY(t%x-(&y(8ig&dAhdh%9?CUSAd}lc?#rmX635T^PkMlTbaj_YFke;*d z^G5lSwIlfzF9ERbZe#0n(}Ihi z?ye=i71RRKMr_ctMB#7m1c}i*UG)DtvR^gNsr&P8o#}?WUCnyWr`fuP`~J%r{5jbIbFtR1$KKX@DXq-;&q=Tb0dP z_+0$i&$hi+ZdZQ%di~5+(M4)3#%N9E5#RW9LyF7Fjnzlr16@{+GGqNGeKR*Rzi+Lt|hc-&wuDcVp$fdr?m?YYI}@xI=rRrB7x^E>xU5QG>6Vapwa z4DJGvXI&yQuZ)b0wZ>mjVGumRE2xL6fzdpn+X$LNoj^#!`~ep0SYH_8$OG8RV}CT!Fp2uCWF03r*3{h%MVLF zBE8HMIV!o(TmJqBt>&(O4^QC;oGCRI73Q3ST8zjnu+38CZgW7-dw(5v*^jO0kQV+yzZDLIZe`5_)P-#W z6QSATQ>Q2>ayvSdhFB|xM*r;C{_%${khk|ou|gHbgZ-syld)z(#SBkIab0JuT!}*R z2-!am6@e6FLu4<8Xyh?@jdD=^X#2s<$#5})cjB)w`JwgBZboWp%_dfR!K8M2QV6x65R5)hyf1RVGwj|x&+zUQ|h zm*(V9*JJtTRo~a6+E2I=_mZrv0MM-zj)-r&zviUuj}XngVl7l0zm&1CG$8)##<9Fj zOTSTe?aIi_LLY~Q?(i)7dnl%ue}wR~-&3qwr)(C^%3qd~J-t+NNN6i@Go zI6FBxm6q73(L_WUqo@l5PLn%(|UD|Ka(=WBn*p%1!p7<-dmeb=jfV zzx@=+{h}sQuBT-4PhB7-#Gpfv7;WEs8mel{w^SHzT=4SpGU6Jm?(eyjtPmUVih#&l z?4(rWot9QbviR7Hj`qWc4?E4u3g};;!$t>9vxo?KHly59qoP%w>%JB--6h43h9v$= z*UQ;xw@aK?9T~XuxPIwPz1)4KJ9Tn_%ur(z!?EKJbD zG`Y2p9Yy3b=+i*tSM{^~+Ag)XHty+&e)7vKntpb$ zLS5_nGFIFME~G$0L`1{GJ9aC)D4fwUtWYa$;c@)jIZMe@6e<7#Khz;bTJ#a_9BPjw z;w0}TUG_4oD%wslSQP6T-8l{<5lXI1YTn zWOJ@)H%@o?y&%2t&q~8gD&sd44!PRIXVKqNUUv7*=*91SitFpV=b36?AYQYH9_twG zFqtGp#%LBt4Qt<2qQ6azl)~_kNfp$F8s;$#L81CA*Lpo@{iA)cv%k%W&8zz_xhek+)br~32_;G^=e_u44N#Ah8 zgcfBjA260D?BAnP`(xDQf|IetdN7mqmwnpu8F+$)HL+H+V6$l!yS0LcoI~mzEQ8Mp%eQw(yZH7*<53X*b=bxP1=p zT?y_Ti!_Qhz+G`BuPQW9525nsXg)sq%Tk;3y9c*~Dw@}ve&9E-c^pQxiJu68SzNHA zux)vlE_r?tdwuB15z<5|MxYa>rS3}zHkG+4f8`{U@==^akH5sgF=4u#`hL0p; z9c`*cUoVT2y=9uQ@hi<5{6v=cQ)hyh9j8Oq?vG4{>*)4 Q1pK)re@CuF#w6(f0EMW3>i_@% literal 0 HcmV?d00001 diff --git a/docs/topic_guides/evaluation/clusters/graph_metrics.md b/docs/topic_guides/evaluation/clusters/graph_metrics.md index 2bf641fdde..6d49eb79f9 100644 --- a/docs/topic_guides/evaluation/clusters/graph_metrics.md +++ b/docs/topic_guides/evaluation/clusters/graph_metrics.md @@ -41,6 +41,28 @@ However, erroneous links (false positives) could also be the 
reason for _high_ n It is important to consider [cluster size](#cluster-size) when looking at node degree. By definition, larger clusters contain more nodes to form links between, allowing nodes within them to attain higher degrees compared to those in smaller clusters. Consequently, low node degree within larger clusters can carry greater significance. +Bear in mind, that the centrality of a single node in a cluster isn't necessarily representative of the overall connectedness of a cluster. This is where [cluster centralisation](#cluster-centralisation) can help. + +### Node Centrality + +##### Definition + +Node centrality is the **proportion of all possible edges connected to a node**. It can also be interpreted as a normalised node degree, or the proportion of other nodes in the cluster that the node is linked to. Centrality ranges from 0 to 1. A centrality of 1 means a node is connected to all other nodes in a cluster. + +##### Example + +In the cluster below node B is connected to all other nodes (giving a centrality of 1), whereas node A is connected to 1 out of 4 nodes (giving a centrality of 0.25). + +![](../../../img/clusters/basic_graph_centralisataion.drawio.png){:width="80%"} + +##### Application in Data Linkage + +High node centrality is generally considered good as it means the node is directly connected to many of the other nodes in a cluster. Low node centrality (particularly in relation to the rest of the nodes in the cluster) can be indicative of a false link (false positive). + +Unlike node degree, centrality takes the cluster size into account and, being normalised, is more appropriate for comparing nodes across clusters. + +Node centrality can be useful as the node with the highest centrality in a cluster could be chosen to represent a cluster (sometimes known as a "golden record"). This is not appropriate in all cases, but the most connected node within a cluster will likely have much in common with other nodes. 
+ Bear in mind, that the degree of a single node in a cluster isn't necessarily representative of the overall connectedness of a cluster. This is where [cluster centralisation](#cluster-centralisation) can help.
diff --git a/docs/topic_guides/evaluation/clusters/how_to_compute_metrics.ipynb b/docs/topic_guides/evaluation/clusters/how_to_compute_metrics.ipynb index 08e61d440a..0772003946 100644 --- a/docs/topic_guides/evaluation/clusters/how_to_compute_metrics.ipynb +++ b/docs/topic_guides/evaluation/clusters/how_to_compute_metrics.ipynb @@ -65,6 +65,7 @@ "The metrics computed by `compute_graph_metrics()` include all those mentioned in the [Graph metrics](./graph_metrics.md) chapter, namely:\n", "\n", "* Node degree\n", + "* Node centrality\n", "* 'Is bridge'\n", "* Cluster size\n", "* Cluster density\n", diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py index 370a5a5629..7e2371d28d 100644 --- a/splink/internals/graph_metrics.py +++ b/splink/internals/graph_metrics.py @@ -25,7 +25,7 @@ def _truncated_edges_sql( return sql_info -def _node_degree_sql( +def _node_degree_centralisation_sql( df_predict: SplinkDataFrame, df_clustered: SplinkDataFrame, composite_uid_edges_l: str, @@ -34,7 +34,8 @@ def _node_degree_sql( threshold_match_probability: float, ) -> List[Dict[str, str]]: """ - Generates sql for computing node degree per node, at a given edge threshold. + Generates sql for computing node degree and node centralisation (i.e. + normalised node degree) per node, at a given edge threshold. This is includes nodes with no edges, as identified via the clusters table. 
@@ -77,17 +78,26 @@ def _node_degree_sql( # join clusters table to capture edge-less nodes # want all clusters included so left join sql = f""" + WITH all_nodes AS ( SELECT c.{composite_uid_clusters} AS composite_unique_id, c.cluster_id AS cluster_id, - COUNT(*) FILTER (WHERE neighbour IS NOT NULL) AS node_degree + COUNT(*) FILTER (WHERE neighbour IS NOT NULL) AS node_degree, + COUNT(*) OVER(PARTITION BY c.cluster_id) AS cluster_size FROM {df_clustered.physical_name} c LEFT JOIN {all_nodes_table_name} n ON c.{composite_uid_clusters} = n.node - GROUP BY composite_unique_id, cluster_id + GROUP BY composite_unique_id, cluster_id) + + SELECT + composite_unique_id, + cluster_id, + node_degree, + node_degree / (cluster_size - 1) AS node_centralisation + FROM all_nodes """ sql_info = {"sql": sql, "output_table_name": "__splink__graph_metrics_nodes"} sqls.append(sql_info) diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py index 715695988c..55ee1a0ce1 100644 --- a/splink/internals/linker_components/clustering.py +++ b/splink/internals/linker_components/clustering.py @@ -8,7 +8,7 @@ from splink.internals.edge_metrics import compute_edge_metrics from splink.internals.graph_metrics import ( GraphMetricsResults, - _node_degree_sql, + _node_degree_centralisation_sql, _size_density_centralisation_sql, ) from splink.internals.misc import ( @@ -349,17 +349,18 @@ def _compute_metrics_nodes( Node metrics produced: * node_degree (absolute number of neighbouring nodes) + * node_centralisation (proportion of neighbours wrt maximum possible number) Output table has a single row per input node, along with the cluster id (as - assigned in `linker.cluster_pairwise_at_threshold()`) and the metric - node_degree: - - |-------------------------------------------------| - | composite_unique_id | cluster_id | node_degree | - |---------------------|-------------|-------------| - | s1-__-10001 | s1-__-10001 | 6 | - | s1-__-10002 | 
s1-__-10001 | 4 | - | s1-__-10003 | s1-__-10003 | 2 | + assigned in `linker.cluster_pairwise_at_threshold()`) and the metrics + node_degree and node_centralisation: + + |-----------------------------------------------------------------------| + | composite_unique_id | cluster_id | node_degree | node_centralisation | + |---------------------|-------------|-------------|---------------------| + | s1-__-10001 | s1-__-10001 | 6 | 0.9 | + | s1-__-10002 | s1-__-10001 | 4 | 0.6 | + | s1-__-10003 | s1-__-10003 | 2 | 0.3 | ... """ uid_cols = ( @@ -371,7 +372,7 @@ def _compute_metrics_nodes( composite_uid_clusters = _composite_unique_id_from_nodes_sql(uid_cols) pipeline = CTEPipeline() - sqls = _node_degree_sql( + sqls = _node_degree_centralisation_sql( df_predict, df_clustered, composite_uid_edges_l, From 0a85b196aaf893badcba47db3f8ba2b9ef21c873 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 15:23:21 +0000 Subject: [PATCH 02/11] remove whitespace for lint --- splink/internals/graph_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py index 7e2371d28d..0c42df3309 100644 --- a/splink/internals/graph_metrics.py +++ b/splink/internals/graph_metrics.py @@ -92,7 +92,7 @@ def _node_degree_centralisation_sql( c.{composite_uid_clusters} = n.node GROUP BY composite_unique_id, cluster_id) - SELECT + SELECT composite_unique_id, cluster_id, node_degree, From c43a0e7014e60cb57a1b03ae0c322b7bd6c97920 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 15:54:12 +0000 Subject: [PATCH 03/11] switch with statement to sql pipeline --- splink/internals/graph_metrics.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py index 0c42df3309..d5efcd06c4 100644 --- a/splink/internals/graph_metrics.py +++ b/splink/internals/graph_metrics.py @@ -78,7 +78,6 @@ def 
_node_degree_centralisation_sql( # join clusters table to capture edge-less nodes # want all clusters included so left join sql = f""" - WITH all_nodes AS ( SELECT c.{composite_uid_clusters} AS composite_unique_id, c.cluster_id AS cluster_id, @@ -90,17 +89,24 @@ def _node_degree_centralisation_sql( {all_nodes_table_name} n ON c.{composite_uid_clusters} = n.node - GROUP BY composite_unique_id, cluster_id) + GROUP BY composite_unique_id, cluster_id + """ + node_degree_table_name = "__splink__graph_metrics_node_degree" + sql_info = {"sql": sql, "output_table_name": node_degree_table_name} + sqls.append(sql_info) + # calculate node centrality + sql = f""" SELECT composite_unique_id, cluster_id, node_degree, - node_degree / (cluster_size - 1) AS node_centralisation - FROM all_nodes + node_degree / (cluster_size - 1) AS node_centrality + FROM {node_degree_table_name} """ sql_info = {"sql": sql, "output_table_name": "__splink__graph_metrics_nodes"} sqls.append(sql_info) + return sqls From bac07897f85300c2ab79ada78e714886c8a62010 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 16:20:38 +0000 Subject: [PATCH 04/11] cover divide by zero edge case --- splink/internals/graph_metrics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py index d5efcd06c4..6f534781f8 100644 --- a/splink/internals/graph_metrics.py +++ b/splink/internals/graph_metrics.py @@ -81,7 +81,7 @@ def _node_degree_centralisation_sql( SELECT c.{composite_uid_clusters} AS composite_unique_id, c.cluster_id AS cluster_id, - COUNT(*) FILTER (WHERE neighbour IS NOT NULL) AS node_degree, + COUNT(*) FILTER (WHERE n.neighbour IS NOT NULL) AS node_degree, COUNT(*) OVER(PARTITION BY c.cluster_id) AS cluster_size FROM {df_clustered.physical_name} c @@ -101,7 +101,10 @@ def _node_degree_centralisation_sql( composite_unique_id, cluster_id, node_degree, - node_degree / (cluster_size - 1) AS node_centrality + CASE 
+ WHEN cluster_size > 1 THEN node_degree / (cluster_size - 1) + ELSE 0 + END AS node_centrality FROM {node_degree_table_name} """ sql_info = {"sql": sql, "output_table_name": "__splink__graph_metrics_nodes"} From 6dcae92b1443681f9135ccab4f644be97bfd0779 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 16:27:16 +0000 Subject: [PATCH 05/11] trailing whitespace --- splink/internals/graph_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py index 6f534781f8..d1c6dd7809 100644 --- a/splink/internals/graph_metrics.py +++ b/splink/internals/graph_metrics.py @@ -101,7 +101,7 @@ def _node_degree_centralisation_sql( composite_unique_id, cluster_id, node_degree, - CASE + CASE WHEN cluster_size > 1 THEN node_degree / (cluster_size - 1) ELSE 0 END AS node_centrality From 6997f067f60c43f1fe144ae8fb6312ce6332919e Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 16:48:06 +0000 Subject: [PATCH 06/11] amend tests to include centrality --- tests/test_graph_metrics.py | 57 ++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index f18cec630a..499c8b5d2d 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -190,42 +190,42 @@ def test_metrics(dialect, test_helpers): + [{"cluster_id": 5, "unique_id": i} for i in range(24, 24 + 1)] ) - expected_node_degrees = [ + expected_node_metrics = [ # cluster 1 # max degree 3 # centralisation = (1 + 2 + 1)/(3 * 2) - (1, 3), - (2, 2), - (3, 1), - (4, 2), + (1, 3, 0.75), + (2, 2, 0.5), + (3, 1, 0.25), + (4, 2, 0.5), # cluster 2 # centralisation = (2 + 1 + 2 + 1 + 2)/(5 * 4) - (5, 3), - (6, 1), - (7, 2), - (8, 1), - (9, 2), - (10, 1), + (5, 3, 0.5), + (6, 1, 1.0/6), + (7, 2, 2.0/6), + (8, 1, 1.0/6), + (9, 2, 2.0/6), + (10, 1, 1.0/6), # cluster 3 # centralisation = NULL - (11, 1), - (12, 1), + 
(11, 1, 1.0), + (12, 1, 1.0), # cluster 4 # centralisation = (3 + 2 + 1 + 3 + 1 + 4 + 3 + 4 + 3 + 4)/(10*9) - (13, 6), - (14, 3), - (15, 4), - (16, 5), - (17, 3), - (18, 5), - (19, 2), - (20, 3), - (21, 2), - (22, 3), - (23, 2), + (13, 6, 6.0/11), + (14, 3, 3.0/11), + (15, 4, 4.0/11), + (16, 5, 5.0/11), + (17, 3, 3.0/11), + (18, 5, 5.0/11), + (19, 2, 2.0/11), + (20, 3, 3.0/11), + (21, 2, 2.0/11), + (22, 3, 3.0/11), + (23, 2, 2.0/11), # cluster 5 # centralisation = NULL - (24, 0), + (24, 0, 0.0), ] # pass in dummy frame to linker @@ -280,13 +280,18 @@ def test_metrics(dialect, test_helpers): df_nm = cm.nodes.as_pandas_dataframe() - for unique_id, expected_node_degree in expected_node_degrees: + for unique_id, expected_node_degree, expected_node_centrality in expected_node_metrics: relevant_row = df_nm[df_nm["composite_unique_id"] == unique_id] calculated_node_degree = relevant_row["node_degree"].iloc[0] assert calculated_node_degree == expected_node_degree, ( f"Expected node degree {expected_node_degree} for node {unique_id}, " f"but found node degree {calculated_node_degree}" ) + calculated_node_centrality = relevant_row["node_centrality"].iloc[0] + assert calculated_node_centrality == expected_node_centrality, ( + f"Expected node degree {expected_node_centrality} for node {unique_id}, " + f"but found node degree {calculated_node_centrality}" + ) def make_edge_row( From 9ca59f5c23e9ab5a9d4b4d54b8a398c00ec722e2 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Wed, 12 Feb 2025 16:57:53 +0000 Subject: [PATCH 07/11] correct test values --- tests/test_graph_metrics.py | 52 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index 499c8b5d2d..7aa2174fba 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -194,35 +194,35 @@ def test_metrics(dialect, test_helpers): # cluster 1 # max degree 3 # centralisation = (1 + 2 + 1)/(3 * 2) - (1, 3, 
0.75), - (2, 2, 0.5), - (3, 1, 0.25), - (4, 2, 0.5), + (1, 3, 1.0), + (2, 2, 2.0/3), + (3, 1, 1.0/3), + (4, 2, 2.0/3), # cluster 2 # centralisation = (2 + 1 + 2 + 1 + 2)/(5 * 4) - (5, 3, 0.5), - (6, 1, 1.0/6), - (7, 2, 2.0/6), - (8, 1, 1.0/6), - (9, 2, 2.0/6), - (10, 1, 1.0/6), + (5, 3, 0.6), + (6, 1, 0.2), + (7, 2, 0.4), + (8, 1, 0.2), + (9, 2, 0.4), + (10, 1, 0.2), # cluster 3 # centralisation = NULL (11, 1, 1.0), (12, 1, 1.0), # cluster 4 # centralisation = (3 + 2 + 1 + 3 + 1 + 4 + 3 + 4 + 3 + 4)/(10*9) - (13, 6, 6.0/11), - (14, 3, 3.0/11), - (15, 4, 4.0/11), - (16, 5, 5.0/11), - (17, 3, 3.0/11), - (18, 5, 5.0/11), - (19, 2, 2.0/11), - (20, 3, 3.0/11), - (21, 2, 2.0/11), - (22, 3, 3.0/11), - (23, 2, 2.0/11), + (13, 6, 0.6), + (14, 3, 0.3), + (15, 4, 0.4), + (16, 5, 0.5), + (17, 3, 0.3), + (18, 5, 0.5), + (19, 2, 0.2), + (20, 3, 0.3), + (21, 2, 0.2), + (22, 3, 0.3), + (23, 2, 0.2), # cluster 5 # centralisation = NULL (24, 0, 0.0), @@ -280,16 +280,16 @@ def test_metrics(dialect, test_helpers): df_nm = cm.nodes.as_pandas_dataframe() - for unique_id, expected_node_degree, expected_node_centrality in expected_node_metrics: + for unique_id, expected_degree, expected_centrality in expected_node_metrics: relevant_row = df_nm[df_nm["composite_unique_id"] == unique_id] calculated_node_degree = relevant_row["node_degree"].iloc[0] - assert calculated_node_degree == expected_node_degree, ( - f"Expected node degree {expected_node_degree} for node {unique_id}, " + assert calculated_node_degree == expected_degree, ( + f"Expected node degree {expected_degree} for node {unique_id}, " f"but found node degree {calculated_node_degree}" ) calculated_node_centrality = relevant_row["node_centrality"].iloc[0] - assert calculated_node_centrality == expected_node_centrality, ( - f"Expected node degree {expected_node_centrality} for node {unique_id}, " + assert calculated_node_centrality == expected_centrality, ( + f"Expected node degree {expected_centrality} for node {unique_id}, " f"but 
found node degree {calculated_node_centrality}" ) From 156a9a18b61b4b6f82fe3adfc39502057642f186 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Thu, 13 Feb 2025 09:51:55 +0000 Subject: [PATCH 08/11] convert centrality to float --- splink/internals/graph_metrics.py | 2 +- tests/test_graph_metrics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py index d1c6dd7809..121def4ad4 100644 --- a/splink/internals/graph_metrics.py +++ b/splink/internals/graph_metrics.py @@ -102,7 +102,7 @@ def _node_degree_centralisation_sql( cluster_id, node_degree, CASE - WHEN cluster_size > 1 THEN node_degree / (cluster_size - 1) + WHEN cluster_size > 1 THEN (1.0 * node_degree) / (cluster_size - 1) ELSE 0 END AS node_centrality FROM {node_degree_table_name} diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index 7aa2174fba..f45eaf239a 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -289,8 +289,8 @@ def test_metrics(dialect, test_helpers): ) calculated_node_centrality = relevant_row["node_centrality"].iloc[0] assert calculated_node_centrality == expected_centrality, ( - f"Expected node degree {expected_centrality} for node {unique_id}, " - f"but found node degree {calculated_node_centrality}" + f"Expected node centrality {expected_centrality} for node {unique_id}, " + f"but found node centrality {calculated_node_centrality}" ) From da9419b1528ab6afe513e3d03e8b896b81a37a1e Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Thu, 13 Feb 2025 10:02:48 +0000 Subject: [PATCH 09/11] format fix --- tests/test_graph_metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index f45eaf239a..cea48cd666 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -195,9 +195,9 @@ def test_metrics(dialect, test_helpers): # max degree 3 # centralisation = (1 + 
2 + 1)/(3 * 2) (1, 3, 1.0), - (2, 2, 2.0/3), - (3, 1, 1.0/3), - (4, 2, 2.0/3), + (2, 2, 2.0 / 3), + (3, 1, 1.0 / 3), + (4, 2, 2.0 / 3), # cluster 2 # centralisation = (2 + 1 + 2 + 1 + 2)/(5 * 4) (5, 3, 0.6), @@ -288,7 +288,7 @@ def test_metrics(dialect, test_helpers): f"but found node degree {calculated_node_degree}" ) calculated_node_centrality = relevant_row["node_centrality"].iloc[0] - assert calculated_node_centrality == expected_centrality, ( + assert round(calculated_node_centrality, 3) == round(expected_centrality, 3), ( f"Expected node centrality {expected_centrality} for node {unique_id}, " f"but found node centrality {calculated_node_centrality}" ) From 4f028d3aa06313775070eb69edd2a0f4fa108a12 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Thu, 13 Feb 2025 14:28:13 +0000 Subject: [PATCH 10/11] pr review feedback --- docs/topic_guides/evaluation/clusters/graph_metrics.md | 2 +- tests/test_graph_metrics.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/topic_guides/evaluation/clusters/graph_metrics.md b/docs/topic_guides/evaluation/clusters/graph_metrics.md index 6d49eb79f9..20dff541f4 100644 --- a/docs/topic_guides/evaluation/clusters/graph_metrics.md +++ b/docs/topic_guides/evaluation/clusters/graph_metrics.md @@ -53,7 +53,7 @@ Node centrality is the **proportion of all possible edges connected to a node**. In the cluster below node B is connected to all nodes (giving a centrality of 1), whereas node A is connected to 1 out of 4 nodes (giving a centrality of 0.25). 
-![](../../../img/clusters/basic_graph_centralisataion.drawio.png){:width="80%"} +![](../../../img/clusters/basic_graph_records.drawio.png){:width="80%"} ##### Application in Data Linkage diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index cea48cd666..5f99ea02b3 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -288,7 +288,9 @@ def test_metrics(dialect, test_helpers): f"but found node degree {calculated_node_degree}" ) calculated_node_centrality = relevant_row["node_centrality"].iloc[0] - assert round(calculated_node_centrality, 3) == round(expected_centrality, 3), ( + assert float(calculated_node_centrality) == approx( + expected_centrality + ), ( f"Expected node centrality {expected_centrality} for node {unique_id}, " f"but found node centrality {calculated_node_centrality}" ) From cfc08007205f093737a38c07145059925c249bb6 Mon Sep 17 00:00:00 2001 From: Ross Kennedy Date: Thu, 13 Feb 2025 14:33:09 +0000 Subject: [PATCH 11/11] reformat --- tests/test_graph_metrics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_graph_metrics.py b/tests/test_graph_metrics.py index 5f99ea02b3..b0f1ad8b33 100644 --- a/tests/test_graph_metrics.py +++ b/tests/test_graph_metrics.py @@ -288,9 +288,7 @@ def test_metrics(dialect, test_helpers): f"but found node degree {calculated_node_degree}" ) calculated_node_centrality = relevant_row["node_centrality"].iloc[0] - assert float(calculated_node_centrality) == approx( - expected_centrality - ), ( + assert float(calculated_node_centrality) == approx(expected_centrality), ( f"Expected node centrality {expected_centrality} for node {unique_id}, " f"but found node centrality {calculated_node_centrality}" )