Skip to content

Commit

Permalink
Merge pull request #13 from INGEOTEC/develop
Browse files: browse the repository at this point in the history
Version - 0.0.4
  • Loading branch information
mgraffg authored Aug 2, 2024
2 parents cf81936 + 7d67c07 commit 268c598
Show file tree
Hide file tree
Showing 14 changed files with 266 additions and 75 deletions.
2 changes: 1 addition & 1 deletion dialectid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__version__ = '0.0.3'
__version__ = '0.0.4'

from dialectid.text_repr import BoW
from dialectid.model import DialectId
10 changes: 8 additions & 2 deletions dialectid/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class DialectId:
"""DialectId"""
lang: str='es'
voc_size_exponent: int=15
subwords: bool=True

@property
def bow(self):
Expand All @@ -43,8 +44,12 @@ def bow(self):
path = BOW[self.lang].split('.')
module = '.'.join(path[:-1])
text_repr = importlib.import_module(module)
kwargs = {}
if module != 'EvoMSA.text_repr':
kwargs = dict(subwords=self.subwords)
_ = getattr(text_repr, path[-1])(lang=self.lang,
voc_size_exponent=self.voc_size_exponent)
voc_size_exponent=self.voc_size_exponent,
**kwargs)
self._bow = _
return self._bow

Expand All @@ -55,7 +60,8 @@ def weights(self):
return self._weights
except AttributeError:
self._weights = load_dialectid(self.lang,
self.voc_size_exponent)
self.voc_size_exponent,
self.subwords)
return self._weights

@property
Expand Down
16 changes: 12 additions & 4 deletions dialectid/tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_DialectId():
from dialectid.model import DialectId
from dialectid import BoW

dialectid = DialectId(voc_size_exponent=15)
dialectid = DialectId(voc_size_exponent=15, subwords=False)
assert dialectid.lang == 'es' and dialectid.voc_size_exponent == 15
assert isinstance(dialectid.bow, BoW)

Expand All @@ -38,7 +38,7 @@ def test_DialectId_df():

from dialectid.model import DialectId

dialectid = DialectId(voc_size_exponent=15)
dialectid = DialectId(voc_size_exponent=15, subwords=False)
hy = dialectid.decision_function('comiendo tacos')
assert hy.shape == (1, 20)
assert hy.argmax(axis=1)[0] == 0
Expand All @@ -49,7 +49,7 @@ def test_countries():

from dialectid.model import DialectId

dialectid = DialectId(voc_size_exponent=15)
dialectid = DialectId(voc_size_exponent=15, subwords=False)
assert len(dialectid.countries) == 20
assert dialectid.countries[0] == 'mx'

Expand All @@ -59,10 +59,18 @@ def test_predict():

from dialectid.model import DialectId

dialectid = DialectId(voc_size_exponent=15)
dialectid = DialectId(voc_size_exponent=15, subwords=False)
countries = dialectid.predict('comiendo tacos')
assert countries[0] == 'mx'
countries = dialectid.predict(['comiendo tacos',
'tomando vino'])
assert countries.shape == (2, )


def test_DialectId_subwords():
    """Check the prediction of DialectId built without an explicit ``subwords``.

    The instance is created with only ``voc_size_exponent=15``, so it uses
    whatever default ``DialectId`` declares for ``subwords``; the first
    predicted country for a Mexican-Spanish phrase must be ``'mx'``.
    """

    from dialectid.model import DialectId

    model = DialectId(voc_size_exponent=15)
    predicted = model.predict('comiendo tacos')
    first_country = predicted[0]
    assert first_country == 'mx'
15 changes: 12 additions & 3 deletions dialectid/tests/test_text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,18 @@ def test_bow():
"""Test BoW"""
from b4msa.textmodel import TextModel

bow = BoW(lang='es')
bow = BoW(lang='es', voc_size_exponent=13)
assert isinstance(bow.bow, TextModel)
X = bow.transform(['Buenos dias'])
bow2 = BoW(lang='es', loc='mx')
bow2 = BoW(lang='es', loc='mx', voc_size_exponent=13)
X2 = bow2.transform(['Buenos dias'])
assert (X - X2).sum() != 0
assert (X - X2).sum() != 0


def test_subwords():
    """Smoke-test BoW with ``subwords=True``.

    Builds a Spanish BoW with a vocabulary exponent of 13 and transforms a
    single short text; the test passes as long as no exception is raised.
    """

    options = dict(lang='es', voc_size_exponent=13, subwords=True)
    text_model = BoW(**options)
    text_model.transform(['Hola'])

4 changes: 4 additions & 0 deletions dialectid/text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,14 @@ def __init__(self, pretrain: bool=True,
v1: bool=False,
estimator_kwargs: dict=None,
loc: str=None,
subwords: bool=False,
**kwargs):
assert pretrain
assert not v1
self._bow = None
if subwords:
assert loc is None
loc = 'qgrams'
self.loc = loc
if estimator_kwargs is None:
estimator_kwargs = {'dual': True, 'class_weight': 'balanced'}
Expand Down
7 changes: 5 additions & 2 deletions dialectid/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,16 @@ def load(filename):
return data


def load_dialectid(lang, dim):
def load_dialectid(lang, dim, subwords=False):
"""Load url"""

diroutput = join(dirname(__file__), 'models')
if not isdir(diroutput):
os.mkdir(diroutput)
filename = f'dialectid_{lang}_{dim}.json.gz'
if subwords:
filename = f'dialectid_subwords_{lang}_{dim}.json.gz'
else:
filename = f'dialectid_{lang}_{dim}.json.gz'
output = join(diroutput, filename)
if not isfile(output):
Download(f'{BASEURL}/{filename}', output)
Expand Down
22 changes: 21 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
[project]
name = 'dialectid'
description = "Set of algorithms to detect the dialect of a given text"
readme = "README.rst"
dependencies = [
'numpy',
'scikit-learn>=1.3.0',
Expand All @@ -8,9 +10,27 @@ dependencies = [
'EvoMSA'
]
dynamic = ['version']
classifiers = [
"Development Status :: 3 - Alpha",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]



[tool.setuptools.dynamic]
version = {attr = 'dialectid.__version__'}

[tool.setuptools]
packages = ['dialectid', 'dialectid.tests']
packages = ['dialectid', 'dialectid.tests']

[project.urls]
Homepage = "https://ingeotec.github.io/dialectid"
Repository = "https://github.com/INGEOTEC/dialectid"
Issues = "https://github.com/INGEOTEC/dialectid/issues"
23 changes: 22 additions & 1 deletion quarto/data/ar-recall.csv
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,25 @@
42,0.7392578125,sa,Country
43,0.712158203125,eg,Country
44,0.717529296875,kw,Country
45,0.60107421875,ae,Country
45,0.601318359375,ae,Country
46,0.78173828125,lb,q-grams
47,0.690673828125,sd,q-grams
48,0.80322265625,ma,q-grams
49,0.623291015625,jo,q-grams
50,0.740966796875,iq,q-grams
51,0.7700386954118298,mr,q-grams
52,0.77783203125,dz,q-grams
53,0.573974609375,qa,q-grams
54,0.6945740240608888,sy,q-grams
55,0.63916015625,bh,q-grams
56,0.7412109375,sa,q-grams
57,0.72412109375,om,q-grams
58,0.7378640776699029,dj,q-grams
59,0.7023172905525846,so,q-grams
60,0.67431640625,ly,q-grams
61,0.731201171875,ye,q-grams
62,0.6657223796033994,td,q-grams
63,0.712158203125,tn,q-grams
64,0.716064453125,eg,q-grams
65,0.725830078125,kw,q-grams
66,0.611328125,ae,q-grams
71 changes: 67 additions & 4 deletions quarto/data/en-recall.csv
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
13,0.7451171875,ph,EvoMSA
14,0.710205078125,zw,EvoMSA
15,0.6904296875,za,EvoMSA
16,0.719970703125,us,EvoMSA
16,0.7197265625,us,EvoMSA
17,0.69287109375,bb,EvoMSA
18,0.71484375,bs,EvoMSA
19,0.6865329512893983,sx,EvoMSA
Expand All @@ -24,7 +24,7 @@
22,0.653160453808752,mp,EvoMSA
23,0.69482421875,lr,EvoMSA
24,0.734130859375,ls,EvoMSA
25,0.48905109489051096,ck,EvoMSA
25,0.4890510948905109,ck,EvoMSA
26,0.5830078125,sg,EvoMSA
27,0.73046875,gh,EvoMSA
28,0.8357380688124306,to,EvoMSA
Expand Down Expand Up @@ -87,7 +87,7 @@
85,0.6628849270664505,mp,Uniform
86,0.69873046875,lr,Uniform
87,0.735595703125,ls,Uniform
88,0.49635036496350365,ck,Uniform
88,0.4963503649635036,ck,Uniform
89,0.593017578125,sg,Uniform
90,0.734375,gh,Uniform
91,0.8390677025527192,to,Uniform
Expand All @@ -100,7 +100,7 @@
98,0.6966019417475728,fk,Uniform
99,0.694580078125,sl,Uniform
100,0.6236702127659575,gu,Uniform
101,0.726318359375,ke,Uniform
101,0.7265625,ke,Uniform
102,0.64306640625,gy,Uniform
103,0.55078125,mu,Uniform
104,0.760986328125,pk,Uniform
Expand Down Expand Up @@ -139,3 +139,66 @@
137,0.736328125,ie,Country
138,0.690673828125,au,Country
139,0.718994140625,gb,Country
140,0.6582213029989659,fj,q-grams
141,0.680419921875,ca,q-grams
142,0.61572265625,sd,q-grams
143,0.692626953125,cm,q-grams
144,0.7518394648829432,im,q-grams
145,0.71533203125,sz,q-grams
146,0.6917783411807317,gd,q-grams
147,0.7994413407821229,gg,q-grams
148,0.705322265625,bm,q-grams
149,0.5904,ai,q-grams
150,0.692626953125,vc,q-grams
151,0.744873046875,ug,q-grams
152,0.697265625,ag,q-grams
153,0.75341796875,ph,q-grams
154,0.710693359375,zw,q-grams
155,0.699462890625,za,q-grams
156,0.73681640625,us,q-grams
157,0.688720703125,bb,q-grams
158,0.7236328125,bs,q-grams
159,0.6974212034383954,sx,q-grams
160,0.810791015625,in,q-grams
161,0.658447265625,bz,q-grams
162,0.6515397082658023,mp,q-grams
163,0.7099609375,lr,q-grams
164,0.73486328125,ls,q-grams
165,0.5072992700729927,ck,q-grams
166,0.59423828125,sg,q-grams
167,0.7294921875,gh,q-grams
168,0.8335183129855716,to,q-grams
169,0.74462890625,na,q-grams
170,0.622286541244573,pw,q-grams
171,0.69775390625,rw,q-grams
172,0.768310546875,ng,q-grams
173,0.74365234375,tt,q-grams
174,0.695068359375,nz,q-grams
175,0.7014563106796117,fk,q-grams
176,0.705810546875,sl,q-grams
177,0.6263297872340425,gu,q-grams
178,0.729736328125,ke,q-grams
179,0.64794921875,gy,q-grams
180,0.54248046875,mu,q-grams
181,0.763916015625,pk,q-grams
182,0.742431640625,ie,q-grams
183,0.7731421121251629,vu,q-grams
184,0.6614035087719298,dm,q-grams
185,0.6933913934426229,pg,q-grams
186,0.7579408543263965,kn,q-grams
187,0.5888671875,mt,q-grams
188,0.6652977412731006,sh,q-grams
189,0.6015037593984962,fm,q-grams
190,0.7109375,gi,q-grams
191,0.635986328125,ky,q-grams
192,0.6748046875,lc,q-grams
193,0.6333938294010889,vg,q-grams
194,0.6607958251793868,tc,q-grams
195,0.733154296875,gm,q-grams
196,0.698974609375,au,q-grams
197,0.72900390625,zm,q-grams
198,0.7353515625,mw,q-grams
199,0.7751091703056768,sb,q-grams
200,0.7275390625,gb,q-grams
201,0.679931640625,jm,q-grams
202,0.7031963470319634,vi,q-grams
20 changes: 20 additions & 0 deletions quarto/data/es-recall.csv
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,23 @@
51,0.6376953125,ec,Country
52,0.6572265625,co,Country
53,0.68603515625,py,Country
54,0.62451171875,cr,q-grams
55,0.68017578125,cl,q-grams
56,0.66455078125,ni,q-grams
57,0.661376953125,gt,q-grams
58,0.6611328125,pe,q-grams
59,0.7255859375,do,q-grams
60,0.715087890625,ar,q-grams
61,0.683837890625,hn,q-grams
62,0.687744140625,mx,q-grams
63,0.7373046875,cu,q-grams
64,0.578369140625,bo,q-grams
65,0.75390625,es,q-grams
66,0.708251953125,uy,q-grams
67,0.67236328125,sv,q-grams
68,0.693603515625,ve,q-grams
69,0.649658203125,pa,q-grams
70,0.6455078125,ec,q-grams
71,0.67431640625,co,q-grams
72,0.7109375,gq,q-grams
73,0.692626953125,py,q-grams
Loading

0 comments on commit 268c598

Please sign in to comment.