Skip to content

Commit

Permalink
corpus: update syllables and some names
Browse files Browse the repository at this point in the history
  • Loading branch information
NoerNova committed Jun 6, 2023
1 parent 0f008ff commit bbf2c0d
Show file tree
Hide file tree
Showing 6 changed files with 11,121 additions and 178 deletions.
9 changes: 3 additions & 6 deletions example.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
import time
from shannlp import Tokenizer, word_tokenize
from shannlp import word_tokenize

# to measure time
start = time.time()

# # Example usage
input_text = "ပူၼ်ႉမႃး မိူဝ်ႈဝၼ်းတီႈ 4/6/2023 ယူႇတီႈ ၸဝ်ႈၶူးၸၼ်ႇတႃႇဝႃႇရ (တူႉပီႈၸၼ်) ဢွၼ်ႁူဝ်ၼမ်းၼႃးသေ တႄႇပိုတ်ႇတိူၵ်ႈ သွၼ်လိၵ်ႈသင်ၶၸဝ်ႈ ၾၢႆႇမိူင်း(လိၵ်ႈတႆးၶိုၼ်) တီႈဝတ်ႉဝၢၼ်ႈသဵဝ်ႈ ဢိူင်ႇဝၢၼ်ႈၶုမ်ႉ ၸႄႈဝဵင်းမိူင်းၶၢၵ်ႇ ၸႄႈတွၼ်ႈၵဵင်းတုင် ၸိုင်ႈတႆးပွတ်းဢွၵ်ႇၶူင်း ပွၵ်ႈၵမ်းႁႅၵ်ႈ။"

input_text = "တိူၵ်ႈသွၼ်လိၵ်ႈသင်ၶၸဝ်ႈ တီႈဝဵင်းမိူင်းၶၢၵ်ႇ တႄႇပိုတ်ႇသွၼ်ႁဵၼ်းလိၵ်ႈ ပဵၼ်ပွၵ်ႈၵမ်းႁႅၵ်း မီးသင်ၶၸဝ်ႈ မႃးႁဵၼ်း 56 တူၼ်။"

tokenizer = Tokenizer()

print(word_tokenize(input_text))
print(word_tokenize(input_text, keep_whitespace=False))

end = time.time()
print(end - start)
12 changes: 12 additions & 0 deletions shannlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"shan_words",
"shan_character",
"shan_all_corpus",
"shan_syllables"
]

import os
Expand All @@ -27,6 +28,8 @@
_PERSON_MALE_NAMES = set()
_PERSON_MALE_NAMES_FILENAME = "person_names_male_shn.txt"

_SHAN_SYLLABLES = set()
_SHAN_SYLLABLES_FILENAME = "shan_syllables.txt"
_SHAN_WORDS = set()
_SHAN_WORDS_FILENAME = "words_shn.txt"
_SHAN_STOPWORDS = set()
Expand Down Expand Up @@ -96,6 +99,14 @@ def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
return _SHAN_PROVINCES


def shan_syllables() -> FrozenSet[str]:
    """Return the Shan syllable corpus.

    The result is kept in the module-level ``_SHAN_SYLLABLES`` cache. While
    that cache is empty (falsy), the corpus is (re)loaded with
    ``get_corpus(_SHAN_SYLLABLES_FILENAME)``; once it holds a non-empty
    value, that value is returned directly.
    """
    global _SHAN_SYLLABLES

    # Fast path: a previous call already populated the cache.
    if _SHAN_SYLLABLES:
        return _SHAN_SYLLABLES

    _SHAN_SYLLABLES = get_corpus(_SHAN_SYLLABLES_FILENAME)
    return _SHAN_SYLLABLES


def shan_words() -> FrozenSet[str]:
global _SHAN_WORDS
if not _SHAN_WORDS:
Expand Down Expand Up @@ -148,6 +159,7 @@ def shan_all_corpus() -> FrozenSet[str]:
_PERSON_FEMALE_NAMES_FILENAME,
_PERSON_MALE_NAMES_FILENAME,
_SHAN_CHARACTER_FILENAME,
_SHAN_SYLLABLES_FILENAME
]
)

Expand Down
1 change: 1 addition & 0 deletions shannlp/corpus/person_names_male_shn.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@
ၸၢႆးႁၢၼ်လႅင်း
ၸၢႆးႁၢၼ်လႅင်းမိူင်း
ၸၢႆးႁၢၼ်သႅင်
ၸၢႆးႁၢၼ်ၾႃႉ
ၸၢႆးဢွင်ႇမိူင်း
ၸၢႆးဢွၼ်ႇၶိူဝ်း
ၸၢႆးဢွၼ်ႇမိူင်း
Expand Down
File renamed without changes.
Loading

0 comments on commit bbf2c0d

Please sign in to comment.