Skip to content

Commit

Permalink
Filter for Shan Unicode range
Browse files Browse the repository at this point in the history
  • Loading branch information
NoerNova committed Jun 6, 2023
1 parent 6326938 commit 0f008ff
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion shannlp/tokenize/m_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
"""
)

# Matches one character in U+1000..U+109F, the Unicode "Myanmar" block,
# which is where the core Shan letters, digits and tone marks are encoded.
_SHAN_UNICODE_RANGE = re.compile(r'[\u1000-\u109F]')

# Default word dictionary: a Trie constructed from whatever shan_all_corpus()
# returns. NOTE(review): this looks like a module-level statement, so the
# corpus would be loaded at import time -- confirm against the full file.
DEFAULT_WORD_DICT_TRIE = Trie(shan_all_corpus())


Expand All @@ -28,7 +32,7 @@ def maximal_matching(text: str) -> List[List[Optional[str]]]:
for j in range(i, n):
min_val = 1
substring = text[i: j + 1]
if substring in dictionary or substring.isspace() or substring.isdigit():
if substring in dictionary or not re.search(_SHAN_UNICODE_RANGE, substring):
if i > 0:
prev_col = [
text_parts[k][j - 1]
Expand Down

0 comments on commit 0f008ff

Please sign in to comment.