Skip to content

Commit

Permalink
Performance
Browse files Browse the repository at this point in the history
  • Loading branch information
mgraffg committed Jun 11, 2024
1 parent ddd28e6 commit af1afd5
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 171 deletions.
14 changes: 4 additions & 10 deletions quarto/data/de-recall.csv
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
,Recall,Country,Training Size
0,0.6396484375,de,4096
1,0.6064453125,at,4096
2,0.65673828125,ch,4096
3,0.658447265625,de,8192
4,0.621826171875,at,8192
5,0.660400390625,ch,8192
6,0.77490234375,de,16384
7,0.621826171875,at,16384
8,0.660400390625,ch,16384
,Recall,Country,System
0,0.9208984375,de,EvoMSA
1,0.621826171875,at,EvoMSA
2,0.64990234375,ch,EvoMSA
133 changes: 54 additions & 79 deletions quarto/data/fr-recall.csv
Original file line number Diff line number Diff line change
@@ -1,79 +1,54 @@
,Recall,Country,Training Size
0,0.640625,ca,4096
1,0.6316225165562914,cl,4096
2,0.67333984375,ml,4096
3,0.6127735596248325,dj,4096
4,0.72412109375,cm,4096
5,0.745849609375,bf,4096
6,0.600830078125,be,4096
7,0.5625,ch,4096
8,0.69677734375,gn,4096
9,0.6865234375,ga,4096
10,0.6591796875,mc,4096
11,0.70263671875,bj,4096
12,0.760009765625,ht,4096
13,0.504638671875,lu,4096
14,0.71484375,td,4096
15,0.5236442516268981,pf,4096
16,0.66357421875,fr,4096
17,0.67822265625,cg,4096
18,0.7306434023991276,rw,4096
19,0.7060546875,tg,4096
20,0.695068359375,sn,4096
21,0.5434402332361516,nc,4096
22,0.691162109375,ne,4096
23,0.3102310231023102,cf,4096
24,0.7004608294930875,km,4096
25,0.79736328125,cd,4096
26,0.657958984375,ca,8192
27,0.6316225165562914,cl,8192
28,0.691162109375,ml,8192
29,0.6239392585975883,dj,8192
30,0.750732421875,cm,8192
31,0.762939453125,bf,8192
32,0.593994140625,be,8192
33,0.5556640625,ch,8192
34,0.713623046875,gn,8192
35,0.70849609375,ga,8192
36,0.6728515625,mc,8192
37,0.7158203125,bj,8192
38,0.78125,ht,8192
39,0.5146484375,lu,8192
40,0.72900390625,td,8192
41,0.5336225596529284,pf,8192
42,0.677978515625,fr,8192
43,0.695068359375,cg,8192
44,0.7310069065794257,rw,8192
45,0.725830078125,tg,8192
46,0.7138671875,sn,8192
47,0.5521865889212828,nc,8192
48,0.71142578125,ne,8192
49,0.3606789250353607,cf,8192
50,0.679147465437788,km,8192
51,0.8134765625,cd,8192
52,0.67333984375,ca,16384
53,0.6316225165562914,cl,16384
54,0.710205078125,ml,16384
55,0.6239392585975883,dj,16384
56,0.77197265625,cm,16384
57,0.78564453125,bf,16384
58,0.60205078125,be,16384
59,0.560302734375,ch,16384
60,0.738525390625,gn,16384
61,0.722412109375,ga,16384
62,0.68017578125,mc,16384
63,0.74609375,bj,16384
64,0.80712890625,ht,16384
65,0.53857421875,lu,16384
66,0.752685546875,td,16384
67,0.5336225596529284,pf,16384
68,0.70068359375,fr,16384
69,0.7119140625,cg,16384
70,0.7310069065794257,rw,16384
71,0.744140625,tg,16384
72,0.727783203125,sn,16384
73,0.5521865889212828,nc,16384
74,0.7353515625,ne,16384
75,0.3866100895803866,cf,16384
76,0.679147465437788,km,16384
77,0.837890625,cd,16384
,Recall,Country,System
0,0.693359375,ca,EvoMSA
1,0.6456953642384106,cl,EvoMSA
2,0.735595703125,ml,EvoMSA
3,0.6145600714604734,dj,EvoMSA
4,0.783935546875,cm,EvoMSA
5,0.794921875,bf,EvoMSA
6,0.62744140625,be,EvoMSA
7,0.57568359375,ch,EvoMSA
8,0.762451171875,gn,EvoMSA
9,0.726806640625,ga,EvoMSA
10,0.670166015625,mc,EvoMSA
11,0.761962890625,bj,EvoMSA
12,0.802001953125,ht,EvoMSA
13,0.547607421875,lu,EvoMSA
14,0.7578125,td,EvoMSA
15,0.5587852494577007,pf,EvoMSA
16,0.714111328125,fr,EvoMSA
17,0.718994140625,cg,EvoMSA
18,0.7320974191203199,rw,EvoMSA
19,0.761474609375,tg,EvoMSA
20,0.750732421875,sn,EvoMSA
21,0.558600583090379,nc,EvoMSA
22,0.751953125,ne,EvoMSA
23,0.38189533239038187,cf,EvoMSA
24,0.6653225806451613,km,EvoMSA
25,0.834716796875,cd,EvoMSA
26,0.693603515625,ca,Uniform
27,0.6448675496688742,cl,Uniform
28,0.73681640625,ml,Uniform
29,0.6141134435015632,dj,Uniform
30,0.78466796875,cm,Uniform
31,0.79931640625,bf,Uniform
32,0.62451171875,be,Uniform
33,0.578369140625,ch,Uniform
34,0.763671875,gn,Uniform
35,0.73291015625,ga,Uniform
36,0.67431640625,mc,Uniform
37,0.76513671875,bj,Uniform
38,0.806884765625,ht,Uniform
39,0.556884765625,lu,Uniform
40,0.759033203125,td,Uniform
41,0.5553145336225597,pf,Uniform
42,0.71435546875,fr,Uniform
43,0.720703125,cg,Uniform
44,0.7313704107597238,rw,Uniform
45,0.76416015625,tg,Uniform
46,0.74951171875,sn,Uniform
47,0.565597667638484,nc,Uniform
48,0.750732421875,ne,Uniform
49,0.3837812352663838,cf,Uniform
50,0.6676267281105991,km,Uniform
51,0.838623046875,cd,Uniform
52,0.7099609375,fr,Country
10 changes: 3 additions & 7 deletions quarto/data/nl-recall.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
,Recall,Country,Training Size
0,0.728515625,nl,4096
1,0.7099609375,be,4096
2,0.760009765625,nl,8192
3,0.733154296875,be,8192
4,0.77587890625,nl,16384
5,0.753662109375,be,16384
,Recall,Country,System
0,0.8935546875,nl,EvoMSA
1,0.756591796875,be,EvoMSA
28 changes: 12 additions & 16 deletions quarto/data/pt-recall.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
,Recall,Country,Training Size
0,0.66064453125,pt,4096
1,0.751220703125,br,4096
2,0.73583984375,mz,4096
3,0.7847165160230074,cv,4096
4,0.77099609375,ao,4096
5,0.66162109375,pt,8192
6,0.762939453125,br,8192
7,0.7568359375,mz,8192
8,0.7978635990139687,cv,8192
9,0.794677734375,ao,8192
10,0.6669921875,pt,16384
11,0.778076171875,br,16384
12,0.770263671875,mz,16384
13,0.7978635990139687,cv,16384
14,0.79833984375,ao,16384
,Recall,Country,System
0,0.66845703125,pt,EvoMSA
1,0.80078125,br,EvoMSA
2,0.781005859375,mz,EvoMSA
3,0.7682826622843056,cv,EvoMSA
4,0.819091796875,ao,EvoMSA
5,0.668212890625,pt,Uniform
6,0.804443359375,br,Uniform
7,0.78125,mz,Uniform
8,0.7654067378800329,cv,Uniform
9,0.824462890625,ao,Uniform
10,0.798583984375,br,Country
23 changes: 10 additions & 13 deletions quarto/data/ru-recall.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
,Recall,Country,Training Size
0,0.619140625,ru,4096
1,0.5751953125,kg,4096
2,0.49267578125,kz,4096
3,0.522705078125,by,4096
4,0.615478515625,ru,8192
5,0.58349609375,kg,8192
6,0.522216796875,kz,8192
7,0.53857421875,by,8192
8,0.628173828125,ru,16384
9,0.599609375,kg,16384
10,0.53955078125,kz,16384
11,0.550537109375,by,16384
,Recall,Country,System
0,0.62890625,ru,EvoMSA
1,0.609375,kg,EvoMSA
2,0.551513671875,kz,EvoMSA
3,0.552734375,by,EvoMSA
4,0.626708984375,ru,Uniform
5,0.615478515625,kg,Uniform
6,0.5556640625,kz,Uniform
7,0.555908203125,by,Uniform
8,0.626220703125,ru,Country
10 changes: 3 additions & 7 deletions quarto/data/tr-recall.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
,Recall,Country,Training Size
0,0.5584415584415584,cy,4096
1,0.758544921875,tr,4096
2,0.5584415584415584,cy,8192
3,0.913818359375,tr,8192
4,0.5584415584415584,cy,16384
5,0.980224609375,tr,16384
,Recall,Country,System
0,0.5476190476190477,cy,EvoMSA
1,0.99755859375,tr,EvoMSA
18 changes: 5 additions & 13 deletions quarto/data/zh-recall.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,5 @@
,Recall,Country,Training Size
0,0.964599609375,cn,4096
1,0.968017578125,tw,4096
2,0.627197265625,hk,4096
3,0.7228521038044847,sg,4096
4,0.9677734375,cn,8192
5,0.96923828125,tw,8192
6,0.66064453125,hk,8192
7,0.7195767195767195,sg,8192
8,0.970703125,cn,16384
9,0.969970703125,tw,16384
10,0.660400390625,hk,16384
11,0.7195767195767195,sg,16384
,Recall,Country,System
0,0.96923828125,cn,EvoMSA
1,0.97705078125,tw,EvoMSA
2,0.659423828125,hk,EvoMSA
3,0.7170571932476695,sg,EvoMSA
58 changes: 32 additions & 26 deletions quarto/dialectid.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -242,11 +242,11 @@ fig.show()
#| title: German (de)
import pandas as pd
df = pd.read_csv('data/de-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

Expand Down Expand Up @@ -281,11 +281,11 @@ fig.show()
#| title: French (fr)
import pandas as pd
df = pd.read_csv('data/fr-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

Expand All @@ -294,11 +294,11 @@ fig.show()
#| title: Dutch (nl)
import pandas as pd
df = pd.read_csv('data/nl-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

Expand All @@ -307,11 +307,11 @@ fig.show()
#| title: Portuguese (pt)
import pandas as pd
df = pd.read_csv('data/pt-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

Expand All @@ -320,11 +320,11 @@ fig.show()
#| title: Russian (ru)
import pandas as pd
df = pd.read_csv('data/ru-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

Expand All @@ -333,11 +333,11 @@ fig.show()
#| title: Turkish (tr)
import pandas as pd
df = pd.read_csv('data/tr-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

Expand All @@ -346,34 +346,40 @@ fig.show()
#| title: Chinese (zh)
import pandas as pd
df = pd.read_csv('data/zh-recall.csv', index_col=0)
df2 = df.sort_values(by=['Training Size', 'Recall'])
fig = px.bar(df2.astype({'Training Size': str}),
df2 = df.sort_values(by=['Recall', 'System'])
fig = px.bar(df2.astype({'System': str}),
x='Country', y='Recall',
barmode='overlay',
color='Training Size')
color='System')
fig.show()
```

## Column

The recall of the different countries can be seen in the figures on the left. The table below presents the macro-recall for the Arabic, English, and Spanish languages.
The figures on the left show the recall for the different countries using three different vocabularies. [EvoMSA](http://evomsa.readthedocs.io) corresponds to the vocabulary estimated in our previous development; Uniform (e.g., `BoW(lang='es')`) is obtained by taking a uniform sample from all the regions; and Country (e.g., `BoW(lang='es', loc='mx')`) is the vocabulary of a particular location. In all cases, the vocabulary is estimated with $2^{22}$ Tweets. Of course, there is not enough information in every case; when the data are insufficient, the corresponding vocabulary cannot be estimated.

The table below presents the macro-recall for the different languages and models. Since the Country model is not available for all countries, its missing values are filled with the corresponding Uniform recall so that the macro-recall is computed over all the countries.

```{python}
#| echo: false
#| caption: Performance in terms of macro-recall
#| label: tab-macro-recall
from IPython.display import Markdown
import pandas as pd
perf = {}
for lang in ['ar', 'en', 'es']:
for lang in ['ar', 'de', 'en',
'es', 'fr', 'nl',
'pt', 'ru', 'tr', 'zh']:
df = pd.read_csv(f'data/{lang}-recall.csv', index_col=0)
df.set_index(['Country', 'System'], inplace=True)
df.sort_index(level='Country', inplace=True)
df2 = df.unstack()
df2.columns = df2.columns.get_level_values(1)
mask = df2.Country.isna()
df2.loc[mask, 'Country'] = df2.Uniform.loc[mask]
if 'Country' in df2.columns:
mask = df2.Country.isna()
df2.loc[mask, 'Country'] = df2.Uniform.loc[mask]
perf[lang] = df2.mean(axis=0)
_ = pd.DataFrame(perf).reindex(['EvoMSA', 'Uniform', 'Country'])
_ = pd.DataFrame(perf).reindex(['EvoMSA', 'Uniform', 'Country']).T
Markdown(_.to_markdown())
```

0 comments on commit af1afd5

Please sign in to comment.