From 7df00fbf6d002e679a7ce9ad7a8efc2b1b0c4573 Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Thu, 22 Aug 2024 21:49:50 -0600 Subject: [PATCH] SeqTM --- MANIFEST.in | 1 + dialectid/__init__.py | 2 +- dialectid/data/emojis.json.gz | Bin 0 -> 21602 bytes dialectid/tests/test_text_repr.py | 16 ++- dialectid/text_repr.py | 193 +++++++++++++++++++++++++++++- 5 files changed, 206 insertions(+), 6 deletions(-) create mode 100644 MANIFEST.in create mode 100644 dialectid/data/emojis.json.gz diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5bc24d5 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include dialectid/data/emojis.json.gz \ No newline at end of file diff --git a/dialectid/__init__.py b/dialectid/__init__.py index 796f9c9..5053347 100644 --- a/dialectid/__init__.py +++ b/dialectid/__init__.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -__version__ = '0.0.4' +__version__ = '0.0.5' from dialectid.text_repr import BoW from dialectid.model import DialectId \ No newline at end of file diff --git a/dialectid/data/emojis.json.gz b/dialectid/data/emojis.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..6306c05f3df77673c6a2d6e4855a9d65a97b9b17 GIT binary patch literal 21602 zcmcG$2Ut@}_b+@95fv2y=}kaEy7b$M=2D|GoEnzI&hSHrcah%__5I{nliMG3fMZr~TF^f8d*0+gMsS z@K`$7JpZFPsUrq;IXq)~{gQ*5ljoC%54iN!xoQgWYUB!LI;%8w%=Fl$p#w86%8({QX1!X+OLTa>vW28=o*QM@ZV}<>0dL4_Jaj-Vz(7 z{U@`GAK-`hMSir4GA8ggg(tKZsqy*yd^!c01k{O!yp{JcQP_s9r5SH;RPE6+YJrY~ z+(XgRI{t^R+7uJph0cJsT><+e|28W6*@^F6#@?f3daUgv?$`?$q*r)w5U$EEasKjR^CHk)Q+~c_?)vHP&-%+*b}>?>74!5iiQLz&R7}ju^+Wcx(7m%AIhr zb@Za|dKLM4ANe6|#OV7&ZQHkPTCC|FQBZ7fA2dnbY25N%M-GB^^u{gEU_{UW3dcHJiK6My`8 zWkAIFDGk}>CJw*o;n%K+lT)|+7tcgU;lCH)?`Wcy7T{%ezA@D`i|wrB+>=Sg3bZ)C z#WG%K3*E<*L6rIEcYgSUS>t))c@lun3C|PsJHsbtox3p~Sk}s&r}8^4DQ+RquV5K; z@lT$@h}&hX{^%?ZeG%Md*ZVN?R4a!1n?!;?-A4*R73EY3Q%AJmbazRhBX;c0&F;FI zRgP{r!Crd7Q)Dx3xU>B}(qS~X+B>GjoPBoNexD(Wi1_;(w#k|H&%(aazM&N!%shqS zVg-t|GAsp>5xSp(Y{vw4q`VfQ?wesfh{n$HY%8#OGDmjFi*q!x^t|KWR@^o($*6QM z@BhF%+Q(%>J_!3>k?G1=F+!%j7QnHqkmu4xC7{rFz4^G^h=QC;E68?i!&_|N7V$c3vTB}npX7X9lQWkZwu3`ldE`RS# zyf9S+T|x=6qk6HbsiRiy_!LR0$vX4qKSVZy41mwONLp{!bum2I<5{<_Q=Odlif-yS zW$bY+=&t{VsG>fom21_aGm%{ z?YBKL*4ijF-?%Ler@`+5fJi(64MtxN(eD8}_Y`l^vSZ5%kxxK>xu!e5OC7);F7DMpUy zhOoL`5sVi^O)04J-+(TzSoH=+=Bfy2)DDZb#|ySa30btL45U$Kc3UStR*$!PCu9-M z4(jwxhxp7uCEQ&$JuQa!$aT@K1<}lJh(R->F1@RN?qj!*wL0?*h(;+G&%WfeI0FHX zZnL&I-#t6Cy-jnv5qujt$>lV2PoDL)Jf48#-Xqq$5ECn9!bo=58v5X|QI{@g?yr4y zV2Eu04uWmYwJxaIgHUmDTH*{Ta4>7%E-xzATo2_<{OIy<(Z~gM)S!O0Kw{2y#J^hO z%K~}zVgWGFNJ^W&T8iE`e_HAy?R5yU%}O8C`Pdl@xgpA-Y- zynAbkpT~ih@QQVr`}0)G`C7yW9pLx{IEF)>)!si(S?m>ex(E}$*o$h-+-#M&bYa`F z#@6Z^QTaf%?Lub|CjDXEd7C;$o3)o#H^05Hl~4^O63BS3d3BX_v@@aQlLSq;nNH%^ zeclbpAqAt%@#!QYQeKR4xQlBCK~Z7rg%m~=&8?cwg-g{&UuZ8?yT@#A-{(>bFzVmt zw5i4Dj(H#ps&{LRUDZ5|)f^R(>o!Jh)#0V+%0hBNpBvq@gh;EchGL&rg5tnZ&C*>IjK6zFj7RYH{LG4 z9ruj!y3(814-=t5qedgzT?x5+N;zL13kmA?FE*2> zk+*R!-DbN&)1za}&OPccT+{#AoQ@xe)-$!=q=<$l&k|;LDUGQ3T-V@^yLQ8wrxCFT z=Bq~1K~XNdTz4m@<0dZAc_S883+DHuMUcOlS9&cHGOagcoy;~LF%4&)MCdNN$L+p7 zgY;c(uZ?xxjR|VXvA{>#N4V_9sQHQ{=y6;TWEp()@pb}-he=U7spq@4T!;ix3 z=|ZOk^wbA4_}A{%-MIa`0rWX|(Syz!DBq7?=S-*V$taJ9t!GZhrS8&NP6SgD@LB~n zb;L766Cj`33I)IKwPfl_3N>vZ#f_Kk+4r;*Y~piOg%q3Kv|HvFcL=5H>%XjaRJwQ- zlE=sFjPEJxHrmQkls=DXVJB%UTk9Q~Ji5?FKYs*~6~gbE}Np+ZD>Y}e@&1(u;~Zteak{gSynRiShW zYd3J~qhi7s=$Iv71yuh!p*v>h?sW0_saRJ%tt&Ow><7lj)x+40PK z^^ZX}Q$*Bt(&L#Y8=wMn!!4-5#G6E-5TI0|ZIJyhe5A+U4HB-={`e=0E_K+K24oEQ zbLQ38TwGs8HFeOYT`?NFe06p0)W?$raU^F+<0GsJ@7>WlD}7z>+hCM0Ahs@)?n4yW z@1f3qx%07*vDV|Mbp9K1DJ0hYF&vhvQb1gw1_NMgH~wxIFf9-DkID&SmbDqTc{&Z< zt8dDn6u)%502=yp7f&4L?&McI^R2D)es&f+0b^XQ>Hr8fX?GZ%k%LOrtIQT-!C#+Fr2DXu-6RQ+f0ZKfXoGE22L@hS44>z^}byMc}6 zvx@_Y8CHQ80CO*%?&b4wT6U@KQS+WP$_HBdb03&EY4b3K{n#FoGw$sGHR)&i1G|nM zU}xq53VIRfe?3y$A_?e8s^ikJ+1=K1gzRnL*zErORI%#TBvp^Pfg_eVzIk@+!}w@N z^%|E}49*&EsDE4mru|7HqQ;_#BUkIOVO;Lng-g`H?i$1u6^!g-*wqA6@`mmv<5&ST zvLRFZ_tQ0z39n6ytB24S@}9dnIn7J2Qhkul65!0c{*_wpm{xo|^FClHi@gXK!<%?! z6tKK3c79-apuo)gp5*bd<$YxD2ON=GstN&%P3G0}mmXoXRA&I?$~ zlf2Rc?Mpqno`XpHsC@3pW+{RNxmVVqUUDWh@TW1GXMV9XQqOoc*G6QnvAQWEHB$49 zm@J#8F#POhS!6AP^yuE~?0nxy(r2lS%@vE~u~CM-XVTwnUw0G?Tot`kIhci+)Jm-l z(hS+-krtY-+PRm`8|gmm=(uK~xGXeZyj#M&O|!YVGIw>mY;Ja5iw{@&80Q`WnyKju zj-BN4h(LGMiRz~n_#*DuyH=oUIv3`FGC^w07r@MPE>5P9GwCjBM(2jTVks$AaS zJ)Q-k)Y^`{hAd1`?d^XSNX;UC3B-vVV|`YEpJub?AF0v_ngVoIc8t#2QdoWjgqI7hg*A%wfN_~d!4uv$BYrwN&#Kn(ViW-W3|}4y$$j2 zx~ZBmdKIc69qjzTy3kP}((j%@p&=)E9?zGe#&1vWvDGv^~)Tg=PUBRTJ(V)7c_{l8z zw*F=z?Du_s+6A(<4^>=Ia(>fD$m47l>pPWnK7J_Y1iG zV8niG{Pc+L=<2N;74dsAX4U7MmnrNy=w|unMRreYS51hx_-;v}dmdnn8*P*W1XR=I zp5W-X`B4&6I=3*P23}B zlh_|~}o|!v3W+{YQKJVH8U3Y+;y%1)g(Hf9?v>U4pU&uu`543u|$dO*oS=e}J z^r#PLr-AN6BKtaUy+$EB*2e76-%ab<(fAwqkv8 zNfF(?v6{|(bM5e>^fBMpPp0&_($j^&to$NGr=~sIretWUqom{%aQM4Ox0s07Jx~i5 zYNX==qi#a7Qjm>y3ytnt7<~46Z`j&?-1Ya+Gt)=Ets&HHFUMOO_ z{EZ~b&GN53*<-M=;-$R2tZ40EneTT1L()>2qaRN-5Z-g>~!KC@{8#^;^<)iT9Ni&7C zADT|?JLrIukg?r7Zc-D=IX-FH*Ex&pjue&Vr@>cK#$TMpf$1@+j(vA74Nq_HQ&K;- z`G2D?M*H}sfg^Lb^HcMzgBydA2 zPYFbCw>5EOzxP=J>7L!Zw8>VnGZ^UYc?z0M|DwWN0G}bP`HbN!7}<^kv2iEow$#kG z?L%6^Fw&ar-#{7!v|{$8nOIqSBSFH*o&kW)Upv3%6&E7y+sAR>F>LCURNVl56$~SV z0?u!#3a{Vis;*x<<2uK2O%*rs>whKzeeIRZh62uiKLy%1t=+`M7QaGGGj)*5c_OLh z#@my#=QRyF_%#9ZE7niOXeDDJQ-0gBF*mNZIh+RbQO*qQ6EMs6CN||F(p2Kh`ISaMN%l#YnrzQLc=uhNX zHxtup7}JKPh~Btbo|VR1sd&}1ottA$?K!(joo&_7f}9(vsTs&F{1s#J)N$e9lZ8bH z@hu2^TNtac=Y0X-$;~+D1QcOvF_E~okJ{?tFhFl_2nhRtS4#KfPRHSO`PBZ&^(ePF z-;q;$O8;qPXU1*-?1e*t*%|)8qeWW=&ZRQB&^ z<`}S@oszT+tm}pX|FuEwtH)!$rjyhBq#iExx3mmg#E%4#Xd7@YKSM2w%urQf!;iNa?!Hbq4UkCs~qY}b{p?bdM;Uo&jfC(T# zpcVl7-1=|`5CRw=4)25tq-(tZFXB=s+5ZOpenYqE@i)X~yp(a83HdKzv$V8eI{q#< zqZaOb!`awB=a<{_*FUvq-L9s-E)- zBqcDKpS0146#&S8Tzy+TD%_RcvaE5RCJerPaSSdegT?~@2PHQ-Z6kTJ`XGc50ys|& z^_g&n{-*NA8z9m?hc{(hasz24buc{`JoY$OfRWh&sSK4TqHK(eJBelV23@QGs_nBOdwLuH!l5Pm<&TtWf zby0to#`}74jW;oyPb2fPoxVO8u;VDnRZe)e?xXQ@;MWc7nck1m_gg=9gC}|m9;%Z+ zEm5VNAN(QeR3L2{-Z!YJmATh1t`TtP^IW@C__#4^!c>F)!Vs-wGpV7+!wz#gFP@?9 zJ=F^n{^=C(S(|=Xlw5xxRb7uBiGE@W>v!!%h~56uu`UK>|K+|^#kv+(`H|xcF)C&N zL>@vlSVJf;FhIfs>&uB90C^5=QFS(EUe|bynBNkSeftO~BTyD;R5NT%r2#Xu&1;=lWavb|D6t?cxi z2y!Un0PkIIuG@@OJ!R4=M$c! z`ByIzE#Ve4Lutm1@f_hL%46Ssb%x9OUzBI}2{M#&FZ8eud&!n(o6`Pj%<1~y;x3bl z#c60J@!s=D>)0%uhL!}>a&3gxb>;EipQT?)GBE!1q5E)Qc&EI4b9b)0=cAF~ z=Fj$UwsIWgMN7|sKV!rsdi(F0hYq`Bcv_hWx2QS_3yy?M_&W3ul-yf8{ zvoNyfwVA>(BvD@LTvM^gFF3_!H)`{JFxAN4$}|=5cEeDnY;OOLvqbvtsTrC-wSAgW;jeF17~1tNyj286?V|Mu)rzT)76Q|eygj*i)sx}IfjcKzl# ze_!aq<8R+jFBP%eE7`%{OgRfl8t6$8=pNG3>R{#O;k=?Y)F+Ia-KbeNn$308rVsGd zW@VZH@ns|<{QrfoRReg!g@A4IZUkTEUjPAuSyTDtp4vBYXhdHh1%~V3)BeAc=+VnT zTB@Akt;4k(za`QCp^h#KMt_~-N_=<5rzqUG1a-9IRj#EcbPqfN+4yLUXwLLa*M1+P z-J|3Tijtu+vjYvge$nHxWe_qSbtdwd7SffH%?3j4%65X!yvPdz#u-r9X|Xuvp7c#79jou z#emhHQ#vS{F9xmA^m#OAGGkUO13tuLQ8SjCiF6=e$x>jnw=^A=#sU zQ`4=cv!ndR6(i}wSg<-7t9gQ~TCeWkAIOz9h-~^$Y|BWk#{4$6IfnR?8-`52G ziauqZyR~c-AXbgqHNUxf1GvXVz>D~)QY~rDwz`&>)YL(rDN~IqDnRr7`Uz1$W{Ohp zn>&#%VwR-5%IcHTSt`|ZLTjI=p2O5a$@U{BNR@K9lFPqDaH`oJd?k%Wy{h>(Vo{Up zm_wI0zceQ`JT}y@m_x%(uWv$&Zy`O9CLcC+f%_(+JDcTCG>)K&i71Fl)vl4!in?Zy zFz_%C!0#U|vhnJ|bJrDBtcZHoJ7ORx&QK*#@uU}VO>3p7ORYM01*~d-)%sPiYGR-8 zYt`h%J!sX$szg6OzQ30SU}06Ey9xjc-=n-bm0tYnI1ti~&cdx^Y#y&<*x^u>_IunS zNA$Gv@?hWKO0cHs0r;U7lrR4qUQBAO9cME@q8Jx-fy_KuIsuvamtbB5nR(f`cFUvA zH^J54-v=FMG(en=3%dRxIX|Phk~m#!)k!AgCxM#NdT#btRvyaE|G&u0dr!5PAy*Vu z522qRn4iHa$n8X zJR=qoLrR|cx>iZ8zaj#5jyzoFOM=&ld%ZbIj}+C?^wVWiTJEMuvxjFDMrEm-y<*9A zbw2z=0F4@Xg_vazr`}H^i!t$hE3>;dZRzjDj9X@LeG$5&mtyZRm8}!=JX%dxB1Edj zeAvLCR3k@L)sNz)RoU687f)Z7c;(32L&eI#w%Kl9)ve+p zIE9lOssja54g~zuHl_I) z9(Bn(qO*{r`z=*e)0yz?^l@Z(FN>WVU{m$MC>$uTQYCa@lYc73A(73y0DBSriM@EOprn^sH8kQtKw4>}gY%2p*|AZk zp%?ze3#z+|J^AmT+j`TgGUJ{a=YLi{DLzobmyu-1-FRE;@ zW(O8a+b!?`FPpzF-gpQ9Ydz@6xqUH!v^qkoIdrtq5($V&LR9T+2QVYBvjL#Y4?uqiPVdNr*SDFFwyU^d{1GMSpyD+oN9FgDPCm);>v5(0E%921OY zaG8G@Fi6O!Baj)@Wqe@0kA0-Un?$xdyoGH}G!oA-ZC+>en)Q9e1RU{u&U{Aof zjvw~s%fw!Rl|WUGB0;lQN{79%-Nj(^@eRwFD0oxkz!B6yk+kK~c0H0BA+_?oku6`K z<(>IeEOQHJ$fOf+fV<##wcO;LsStp)s5w!3_re#MhCV>gU~#+49CX;b#eq;#YSQ_lm***|0Qg^NiJT2OXB8mC5@(@`*;u$Gk&siSspI_cLq?&NDf? z(KCcBTA!#W+W2aMZnQI-i~iE(hx*#f8A6ZMMSaDlR}^ilYSDAo@c-DgHPmx6`-H2k zYRY!dUvbu(J%6#LcYL&$^HmbD?n5;|MVd}G#>irE(OW2gtL~ESmXXEE^S>JXRqL!f zT}Q@Zd--?y%u>Hvc$}q`o%TJ70bOXqf|ed3vGg`{?YIYYF{4A4N z`eX9ont}eaZvAI=V}mgiv?UY<&twhR^41tsRw?L1mJv^v0#SS5d#7eQ&!F@ARg)F9 z%`|yOuSB;PzZ8}=#~T_WAH5pL7tOaRl(W*M8!W${s%@k@bTQUI(W!rkymgaX`00|q z+`PfFZd_^LX|%4SaMLtc`!4;x%~N>~IB$9e5m`0FuPb>76sO7a9=@80AGr6hHfvu% zKZu(r_mCDe0y_GTFr~2hR9-WMu1Re&?gE_?k`8*c#?eN>QmK7gH}3&UK`prR4=1Sa z5oj4w*C7h4M`{0KC2~Y*f~kT>;Ha9_U|SBv+drI?teOQTXQ)Wvr?-2Dgw}98{^!ZC zcQ4B#UBl04k8NJOwe|zx=360h;L$4`hsE;JA0PVZ_N{@Cgj&4}f`s(lF{`H`#aK;k z4xaAZlVVT{I?vaul7lsw+6_~H0qIkH{2#l9kz$YozYcc_srGW0 zjDeU%GK8grN9M{~O)UzUl*BETYFb%lkAesCBL;24T6-U<2XFjNQ@F|B8lCmyt9mBT z1=$OsygLdOhr50nKpD%QXQ($Bn0cV`SPj`?tI$OzwpFGi3Fa7c0Nskj)>nk@6_8F(;<8YFThu*bQ&< z6xizSk+XXB`bKL3Li!eRJFnY~Q+ zTa4G48>zzP8n7$ z7Ikt;3P#1!EfPF2lg#KmqaN2dW_$NwL14D^*{)D^8*R^|6Y*1##n3o|u==g@wN-cq zB0EQ?kedD>7N@rGKQYQ8<;?d##a9w+cq?JecdUJptZfo)5+;-znTS2nNII*nWT9a1jVb;|@}+g4k5#^Ri>*NOyVURR6p z+1&8*JvhHpNh0H(LIb_$aAI)#v}qG;LzKVaZSab0sr#p9+3y?7hvvqnu*e!UyVK$& zy)4(oQetlXFiVK*Tc(m5SRmLS*dUzs4>-RO!uKkE^W+n@sqX<|jC|X&U3Aw|@a<=O z5+!84PI_LU-&3AtyrZT3{X2=6tAYjtiI!>bZ5bIYff1C!3PnY&@z7%VwE~bR{NxPo6tNKCgs4QAPWy3a^UR z{FDDyfWY;x7QN&MYvm^fxpc5utj|2(Mf|LK7W12Nc)|Dwubv(FUgPrI zVdBGWvMjAo5AV<^_{u6-&X2*st-Qjc!%&j2Is%*XgBD{)M|2@=!6fg0qS&V3AL?Yi zUoOi!e<=7sw#gezwMiV#cv(i?>+QkmU6PG6;6v>=?-#4#ZjXfH&*C|nJlsPe?Wj{` zC2=o$8T7TUW56ppEE2>YOsAd%j|4{~u(>>WWiH$e|A6l*pybukhL9ZyzMnwSv5xPk z&{h31v@PzZv_i`PGrd5ORQIDmmVuYE@gxt53F#$16V1>C`AK$*;U@`zgQ>I0;)USR zmxU77od{c&>wnNwRpFa5Y&NuRHXwVj68mAZNrk4EJL^*OKYhj6oDL4@ISseGjZ7F9 z-b)O~J1;&Qd5G;=#P_&0i!Di;?*U>YGDNbxKD%@$8574eB9?OhC3hSFPh2xE6`vo8 z1MaXvwTLV{u@N`z_Bi?s3E#t0sEAF;Cp`?C-2yo)C%-Yo`Q4?nyT>$-iTgOlXI>$Y zJ^pInSK5Sip^);k$<-ng|?oU+Lv$=ZBvDPrJ2*6KP8K4 zS}>uj4)vfVg?-FZY+!cL3Apq4dMK6gozel4!>Uy8Ha?<*x^C$e_twrQ%{%%2#C}^6 zO_VTs_g1>#=q=xa!j$ZS-1#_HyMyI8!co43O4Aud>}bU5a;Ht4&?ukPXMAiTIak{R z(OTmPEN@&XSN)*xj?qOV(WJZFfxS-QWh}RU9l?>_{Z|pjn0kcoLB(ZIt^YO_e~q}! z<|(!%iL}io%I`3w>JNV`+e-1}I^H%6(b~T6j)4z`V8`hAU^6oayvrZ8I^zz_JEFm1 zjXPXzys<}2wNzN5Nh>)IZhziS9V{tHGJt4489=dvxWzm0C-Y?%hX5 z&Cw%PEA2I>>h9T9rYXhF7F*ZfhGdedJ4X&8cvC$9e?yt01jKH@-YD2$2n% zHW=Tdx009PI8~|ga`dj6Yws&@mNi=<< zw{!55eAXo+Twc*Y4<~kuw|{zrAR$ceNSdM4PO?_Q&EULB27K#?HuLlMlUM=x@@U=m}e)?iDa{GG5aFWPJp-)s~o>JuyL|s)kE5aSdhsV zw$e{l>&E;_%1#OOQs6f6<8U_qCV|kmEB-9kC6&6_zz4k6+D)QPeD5P+_IFd@3V!~mahT*G(Fej=Z#gWHBcXUv5B?V8C#;WW;vFJUMpEH%2U=N1SwiXX z4-d-{{UOc=9x69 z2yR##h2~?TuxIFqC0H9PW}BdOItdNbw9pi{3%3*9dg5H-pe*5BvVI0G74YLq%GMQ~ zEtSbV%V9c%kJIvSr^VL^&*dEGhi`8e&Un00%7G*<@ zCkHr>EFthC0EvCo-1)To)n-oY{s z<|>V~ybg9Ls)Y=lbDv;-6f_TGDfj4gmVa>9Rh{ts!}E_(N)XQcNT~2TL2omTwA(wZ zrJNO*b)hM;^^mPY%C$e7yZgp!=jIU=sVJ9`@BoDjLt71Apt@sl|`yerQpD%H%iE zG^-z*_DS-?bLYX|;vrnTb&gJE@xg^3Rdk=6#Z0CJFSI>fEgz4rXqkR%u3Jt!XYsX1 zI5+jw;)(P3D<{+weAZc7IlMe-il;ExM@h50_0pzj%hw0zQTtx*b2(x7O0QCdrmii= zk<@8^^g8sT7X{n)O|4KTHu#;;cJnBdVRN*{T~SszUg$PQ{#0ZG-|jhE!|aN8h}4#; z^(;H!VWf#^`b>|qCRLYN!rJL4?Q+0Z4{0J1p)Ka*gPb2%qk$W;R|(6K%bFJAc51eZU{Mv}(Qh8bzH*dHyL~ZZ+id38A{BkMiaOc?5kFgu;b3`};GbnO<*8cfZp0;7T z0G+{_V~bAn=4)d3%0x`=M}s2$=W6F;@nhw$h<rq4V}trpK~Q@3xlxHF-| zOd90ISi~h$8dHtd=qsra0#9J^^81oQ<)#NtlYg*@m?w@T+SxnKwChxp2s1h?nc7&2 zZDqvJuVcV=7?D9|7cjO4%Q`cbIIBK2NvX6>U^B@Y@yQh7W&V9)jua{Cc>_y^*&1K; zuK$Vkai6ciU$6^{WsGPwl0^lr{V=B~6mF8Td%1ZS@U)TsjhFgTz;Vd#OdKv_)W*o$ zG)S~c9!`j_A&hBD^@saNc%N84I8&j;Hm;H6Bk?j+;0&tBgwf=SbiIAY-R1D=NLV^o zxZzF2-Rs0gti29V`wVJh&b@t6G7K-;>Mv0Ln2-!dxqhIW$*snp2uOUHV15u}^$E;I zAXMkRY`*^F6TO%-+f4XXSlsH83%+eDZB~UJ#|qs=rv!TENZqcOEbfn3|RS?iG8 zH5OAcUiP}9Ci4f@=?4YS&^Q4QUi7?f8kI(S?PcqA& zuuEWDmK_Py`(w=J4Clr2NsraD3ad#7a*Sj_h1A*og$w&=uq}s{3)#Bg8(+HP;x)D6 z(KfLgR)yMPkwn@#Cn{DXz6t=f>N)Mch|ULCPRQo*RiYNsn6jnzFe zs(Hj()>7l>k}^e+IM7VSQhxDH9D80A`fz}XaU_GzC4nC{D<)a!;PMFVvfVN_IiAs>CVP|{_n+@T7W#bDP(?_DN!*X`GJEJd$Ewc;*hUZ>X}5l>`x6jhwH z4eUno+zYF^Ew${kRe?W*%8>AGEEXR5j&{{z)S#>MnfKGu;<>C#5Ns^l+*lLOux;r! zg1@}Wwe-ewgmQq2cag68T3C`Fr4vKhl*V36qNj}kn3C8$)s`b{5FGiscTt>RxNkOb zt+eZ1cFJ(6lJHgxw0T`~8QY+4RDoRfDBDP2bZDU5k~3Oq3fjL+r8`Es*>J7q_Jp0x zGC02%{?cT{q?0F_L~6jPS^fMBXqc6mqp{5R>Rl57#oQIty~yj%u6UKaZ8BQs%U-nvHsrc%Gfn1e zGYW~lCkI8FHAyw0iD+$Ldopq#7wV*Ilplc^2E1~ALvgyMlK1io!O2=5*|D$%A6Y+I zk`ps-c<$Aj_Y&DSx?&`CYx&}jJU|Q;I`-XT4`6;^oWsG{UyZNTSK9hHOk9w7_1xUE z{;ZYxQNvk6bOXKjdYDb-gqKO!{!1w@yawd;1^v4soyYI^`}HL*x7R4;9w^rqoW61r zruF44p^dv}m}@p0y_H+FJ9O1kvciJ6z*4L3#b)#9QO~6(TC|6|c$c8|}r=uZ6|@ zOsB?*2kdH^F+f_-pEnq2$s$g;QFIskz$O%GiB`^yIojdDz6=xc)C=IjCPL}E_9PNF zo$-D$*^&p%p7$T~$)>uVc{)z(d<*hGh_G(#)q&c}<5TIQYwk zRy*I_`u&m)-6!&_jnr+MqV#9JzMQ-wZb(WWw>w>N7vl*T6b%pD(2C7DS!%|fn&;q2 zDq}e13auig_^^FM9r^6jt6s6+jGrrfwQYAixEsw_==|0gEio3vNa<(3Z~-y!Af{-d zWLX%BS}aUYw>xj%SmTyl^hvMuxS7Is#g61uz(US9QS+*AX_@Sp3eQJlK@Khj24 zG@M$Wb0be^ZV6r4r|BN#928--br4>@u(DIiC1dYRQx#lcP>+h0KXgfC8?n{;_BO$? zj9XnSg+&ZoIU>7-HV9bgIMG%9j0i**aHIXc-MnqID5J?GKUC7PGr0nh5XXb_8S zqT`RWRJ57FKDI_p*m+0`?XebB#ZRC5y~ z9-S6NQQ3ZG8V@$DpCippm-MRo8kWXVw?4>5@`cA_E6Cl9&TE@H*G0JK5$x@Yw~YmB zZ^^4Ja&9&Xh&~|GnTmFzWSrOxG6yUmthP6JEn}Ys*qFk`R zLu(&|lHHcjkX;hRbVZg+blELSwg%9BVj)~?!;^p)C8w-UtwQ0a(n^ft>H6zl>VhkS z3eJf>!x5hN9Ap~wCv*AmonAGPIV($sE$%mneW%n&sC^D1G_u5dt_*D2Mxr0r8n~8( z(%4(6J|HBxS!|?SQyHBLt`Wnf<^-ms#-bQLDF;in`5@&{)M-AWDAe{yixrsc=05}@T~fLGD-0`zxiZdz}pv| zK#wBXY!6O`!pjAxXU{C3aK*gw{uUeBe*(sr1pfxGo_LSJf#t;R$RwBvl?S|vzO@bU z>YTr)vy4TRar;%hk@PCAp?m$ztBrHhFARDR(HrN?>h+(2O?2Q}1A0P-wK6l^YJZ-C2 zJBSI=X53@=T*9hbd|}!t?W-oLeaWBXipk6~ygd zh6zM(rz#=6w^E3!JT5pi^6ZDOT`o*;Z#>#LuZb5b!M0pLWe4G>xBT?m=9(BbtTDE^ z4GY9w97X?5VvhRjXmjxZ1a#!zhZ?kJ+K%8;o){vzOx7E<-KNpjia~N7P0BpaY*ZnT zVN$O1R$LG2qv!WysAiI!I6pDxBEO^h^KxYbEM1am_llJAm-Xl~ns-=ZldE-KYbG|g z(x)7~T`v*otg{jAacpEK9XXtH*=9qyEJd|CgwmTg*$(Fohb7#4l3ND08G{(#=tjwR zuvE!H%>zS7u1sBIQ0GJKJLLwq19pIfmQi_$oA>2%D}>xy#VS$GM70B3Sr0fZ2Wj}D zhA&A8h=ClTn5tv}bf*65Hg&Msw!3gxA%Z?snjZ9S^&br*sbWlB_Y$!CtWM&qft%1< zme7|=;6I17@363K44`t~n8zrc zhj~5sl+baKITF`#Us(EKO#LZA#3Wz*9Q9Z6@vZcSz%IpRBp3M5dSScY+&XOrOvCJ@ zwz>xfA7VCNu@iT?nsjBdqr{u5C?E!7e8)^f${BL%h|EIQM{)loH1d3vV|xQKWW0AT zHw4XqKM^!Rnu|KUX8wxZJKEN5*!5cFIhsshQQOy4#hUxFs8!wBiy3e0J*7%~CX~xM zZ6gj8SXDKlLdHfSC@e?SvWXUzyy;Z~wGG&7SV4g>oe`>#x&b|^;Z7yV{rZ*+vsT!B z?hyHxq5Zx70_rSVua!z{nqb{jb?mM+qhbJs3mH~0eM7w!tjl zaB>q^&B-2)U$X0VbC9_GOSEIk!`q>toz;Rs=>9ic2a-xn)j6 z3nzBm$E!l$SEJipoO~)GCA=n`8Ve3bnK{&RYxmllnpzhmA2B{}kRE&OkW98L_};d`y&;K}40c)F zrAv51Z1^?$85yit-KFnDUFW2&cqj2wWc}8|E>YYTkXT(M%7$^lqCJAf_8qQZ>xYMl zM*@maPGR)r?uKJef`DA<2NCD62-guLa{cJpOH4`uJVStSTb*-2dzR_Z z1eKOmyrAC)dG@77`#nYht?o7Zgv;e;>^0?<+1qMPR>odNpA=(jBaq{nn%-jKEhqLJ zxz-lqJHplW&3A;kjoWvGqb<)j(Wm%9{ZF3pXOy;OM54n4iAyJpobY(7D5t6~mlN3s z4{vr7R>sA|reYuI8-;rLTeyntF=b3oQ(^eHd6luvCDn9?lznUVon%nCuC*k0O%Kvt125&Nl8q zx#9|4#<(XQW&Rt*SwE;7j8HdruYw-T*2Tp{58Jp_9%Ta>Ls>`hhq<~hg zH?IBlcWhmPc<@+ByNokC)t$z;!hwAT^w9n15{HP@V8CD~)b+w7K5kCu06-+xln!DO z1+o45=)u8?CD$tJG>5>I)-!9Nx>!eRv7dcg(CRONtFYefin zYEnGIY?^019jYL03M$|(QPpgq+DQ+i0aNpMA5C@!9@p#87eLG&}dViRnPJ~R8x6M_K z=d?W&xbG7an{cvHCA^Br-j|Vz7}Pi^^C?tO@S&t$EAtRf+e7-51ZFX}ir87vAkf!P zbHJXRB*YPl8n8=;fY^Ylip?xQ%4?Db70&yPTyEn-K-yaWU#kJ~f%!$}#6~-ywR0M> z&uvzc0;g@xj|C_7(`O0rPX{I-bk54zAelSq{TXLKA$+__`f}3Fm(Txu%YlLn5`>_n z1<6+bKyCwOD`31AE?YXF%D_dVXq(4KNG%SzKq0 z3VS!{d`q$8?gGDb@pCU-Ocu_azDH%w_M5Gr&2|_3V+Do^&>_IE6MuNe$3@w$wb)p& z7ZRHOAdzG{;d5tQOpY}d^J`O`HoYrxj;ty;5U7$sj zz&X~phjuQAwGbD%2n|Rl3x6bbJCyBr;HK(5V6I4?USWTkWlrof)HoZG{Fe&+Yjn^PVc zgHpgpJ)88tj3bdg)~Em0DPP=QclY@7yPM|yllT9B-~WHR$^Tgu{+lMpl~!MW`Q`7+ n|E1fuZVy_&Z}0Q>vA5!at6zWlxBvgg|BQ9_-<+Rl!NLFlX^oar literal 0 HcmV?d00001 diff --git a/dialectid/tests/test_text_repr.py b/dialectid/tests/test_text_repr.py index 6592bb4..7860217 100644 --- a/dialectid/tests/test_text_repr.py +++ b/dialectid/tests/test_text_repr.py @@ -22,7 +22,7 @@ # https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/ -from dialectid.text_repr import BoW +from dialectid.text_repr import BoW, SeqTM import numpy as np @@ -44,4 +44,16 @@ def test_subwords(): bow = BoW(lang='es', voc_size_exponent=13, subwords=True) bow.transform(['Hola']) - \ No newline at end of file + + +def test_SeqTM(): + """Test SeqTM class""" + + seq = SeqTM(lang='es', subwords=True, voc_size_exponent=13) + assert seq.language == 'es' + assert seq.voc_size_exponent == 13 + _ = [['q:~dia', 'q:s~', 'duro', 'q:s~']] + assert seq.compute_tokens('~dias~duros~') == _ + assert seq.compute_tokens('~🤷~') == [['🤷']] + assert seq.compute_tokens('~🙇🏿~') == [['🙇']] + assert seq.tokenize('buenos dias 🙇🏿')[-1] == '🙇' \ No newline at end of file diff --git a/dialectid/text_repr.py b/dialectid/text_repr.py index e58fb88..558c481 100644 --- a/dialectid/text_repr.py +++ b/dialectid/text_repr.py @@ -20,10 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from collections import OrderedDict +from os.path import join, dirname from EvoMSA import BoW as EvoMSABoW -from EvoMSA.utils import b4msa_params from b4msa.textmodel import TextModel from microtc.weighting import TFIDF +from microtc import emoticons +from microtc.utils import tweet_iterator from dialectid.utils import load_bow @@ -48,6 +51,7 @@ def __init__(self, pretrain: bool=True, assert loc is None loc = 'qgrams' self.loc = loc + self.subwords = subwords if estimator_kwargs is None: estimator_kwargs = {'dual': True, 'class_weight': 'balanced'} super().__init__(pretrain=pretrain, @@ -63,6 +67,15 @@ def loc(self): def loc(self, value): self._loc = value + @property + def subwords(self): + """Whether to use subwords""" + return self._subwords + + @subwords.setter + def subwords(self, value): + self._subwords = value + @property def bow(self): """BoW""" @@ -72,7 +85,7 @@ def bow(self): data = load_bow(lang=self.lang, d=self.voc_size_exponent, func=self.voc_selection, - loc=self._loc) + loc=self.loc) params = data['params'] counter = data['counter'] params.update(self.b4msa_kwargs) @@ -82,4 +95,178 @@ def bow(self): tfidf.word2id, tfidf.wordWeight = tfidf.counter2weight(counter) bow.model = tfidf self._bow = bow - return bow \ No newline at end of file + return bow + +class SeqTM(TextModel): + """TextModel where the utterance is segmented in a sequence.""" + + def __init__(self, language='es', + voc_size_exponent: int=17, + voc_selection: str='most_common_by_type', + loc: str=None, + subwords: bool=True, + **kwargs): + if subwords: + assert loc is None + loc = 'qgrams' + self._map = {} + data = load_bow(lang=language, + d=voc_size_exponent, + func=voc_selection, + loc=loc) + params = data['params'] + counter = data['counter'] + params.update(kwargs) + super().__init__(**params) + self.language = language + self.voc_size_exponent = voc_size_exponent + self.voc_selection = voc_selection + self.loc = loc + self.subwords = subwords + self.__vocabulary(counter) + + def __vocabulary(self, counter): + """Vocabulary""" + + tfidf = TFIDF() + tfidf.N = counter.update_calls + tfidf.word2id, tfidf.wordWeight = tfidf.counter2weight(counter) + self.model = tfidf + tokens = self.tokens + for value in tfidf.word2id: + key = value + if value[:2] == 'q:': + key = value[2:] + self._map[key] = value + tokens[key] = value + _ = join(dirname(__file__), 'data', 'emojis.json.gz') + emojis = next(tweet_iterator(_)) + for k, v in emojis.items(): + self._map[k] = v + tokens[k] = v + + @property + def language(self): + """Language of the pre-trained text representations""" + + return self._language + + @language.setter + def language(self, value): + self._language = value + + @property + def voc_selection(self): + """Method used to select the vocabulary""" + + return self._voc_selection + + @voc_selection.setter + def voc_selection(self, value): + self._voc_selection = value + + @property + def voc_size_exponent(self): + """Vocabulary size :math:`2^v`; where :math:`v` is :py:attr:`voc_size_exponent` """ + return self._voc_size_exponent + + @voc_size_exponent.setter + def voc_size_exponent(self, value): + self._voc_size_exponent = value + + @property + def loc(self): + """Location/Country""" + + return self._loc + + @loc.setter + def loc(self, value): + self._loc = value + + @property + def subwords(self): + """Whether to use subwords""" + + return self._subwords + + @subwords.setter + def subwords(self, value): + self._subwords = value + + @property + def tokens(self): + """Tokens""" + + try: + return self._tokens + except AttributeError: + self._tokens = OrderedDict() + return self._tokens + + @property + def data_structure(self): + """Datastructure""" + + try: + return self._data_structure + except AttributeError: + _ = emoticons.create_data_structure + self._data_structure = _(self.tokens) + return self._data_structure + + def compute_tokens(self, text): + """ + Labels in a text + + :param text: + :type text: str + :returns: The labels in the text + :rtype: set + """ + + get = self._map.get + lst = self.find_token(text) + _ = [text[a:b] for a, b in lst] + return [[get(x, x) for x in _]] + + def find_token(self, text): + """Obtain the position of each label in the text + + :param text: text + :type text: str + :return: list of pairs, init and end of the word + :rtype: list + """ + + blocks = list() + init = i = end = 0 + head = self.data_structure + current = head + text_length = len(text) + while i < text_length: + char = text[i] + try: + current = current[char] + i += 1 + if "__end__" in current: + end = i + except KeyError: + current = head + if end > init: + blocks.append([init, end]) + if (end - init) > 2 and text[end - 1] == '~': + init = i = end = end - 1 + else: + init = i = end + elif i > init: + if (i - init) > 2 and text[i - 1] == '~': + init = end = i = i - 1 + else: + init = end = i + else: + init += 1 + i = end = init + if end > init: + blocks.append([init, end]) + return blocks