Skip to content

Commit

Permalink
SeqTM
Browse files (browse the repository at this point in the history)
  • Loading branch information
mgraffg committed Aug 28, 2024
1 parent acaec43 commit de656a0
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 4 deletions.
2 changes: 1 addition & 1 deletion dialectid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@

__version__ = '0.0.5'

from dialectid.text_repr import BoW
from dialectid.text_repr import BoW, SeqTM
from dialectid.model import DialectId
21 changes: 19 additions & 2 deletions dialectid/tests/test_text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def test_subwords():
def test_SeqTM():
"""Test SeqTM class"""

seq = SeqTM(language='es', subwords=True, voc_size_exponent=13)
seq = SeqTM(language='es', subwords=True,
sequence=False,
voc_selection='most_common_by_type',
voc_size_exponent=13)
assert seq.language == 'es'
assert seq.voc_size_exponent == 13
_ = [['dias', 'q:~dur', 'q:os~']]
Expand All @@ -62,7 +65,21 @@ def test_SeqTM():
def test_SeqTM_bug():
    """Regression test for SeqTM tokenization.

    Two inputs that differ only in their first word are tokenized with
    a model built with ``sequence=False``; the two results must agree
    from the second element onward.
    """

    # Build the tokenizer exactly once, with every option spelled out,
    # so the test does not depend on the constructor defaults.
    seq = SeqTM(language='es', subwords=True,
                sequence=False,
                voc_selection='most_common_by_type',
                voc_size_exponent=13)
    res1 = seq.tokenize('mira pinche a')
    res2 = seq.tokenize('a pinche a')
    # Only the leading element is allowed to differ between the inputs.
    assert res1[1:] == res2[1:]


def test_SeqTM_seq():
    """Check SeqTM tokenization when the ``sequence`` option is enabled.

    The two sentences differ only in their first word, so their
    tokenizations are required to match from the second element onward.
    """

    tokenizer = SeqTM(language='es', sequence=True,
                      voc_selection='most_common',
                      voc_size_exponent=13)
    sentences = ('mira pinche a', 'a pinche a')
    # Tokenize in the same order as listed above.
    first, second = [tokenizer.tokenize(text) for text in sentences]
    assert first[1:] == second[1:]
5 changes: 4 additions & 1 deletion dialectid/text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,13 @@ def __init__(self, language='es',
voc_selection: str='most_common_by_type',
loc: str=None,
subwords: bool=True,
sequence: bool=True,
lang=None,
**kwargs):
assert lang is None
if subwords:
if sequence and subwords:
loc = 'seq'
elif subwords:
assert loc is None
loc = 'qgrams'
self._map = {}
Expand Down

0 comments on commit de656a0

Please sign in to comment.