-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6326938
Showing
26 changed files
with
21,653 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
.idea/ | ||
.vscode/ | ||
venv/ | ||
|
||
.DS_Store | ||
**/__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# ShanNLP: Shan Natural Language Processing | ||
**experimental project inspired by [PythaiNLP](https://github.com/PyThaiNLP/pythainlp)** | ||
|
||
## Current State | ||
- [ ] corpus dict words: 19,904 words (60% covered; more need to be collected) | ||
|
||
## Word Tokenization method | ||
- [x] maximal_matching | ||
- [x] pythainlp (newmm) | ||
|
||
## TODO | ||
- [ ] mine more Shan words and poems | ||
- [ ] experiment more method to tokenize | ||
- [ ] word tokenize | ||
- [ ] sentence tokenize | ||
- [ ] subword_tokenize | ||
- [ ] tokenize with deep learning | ||
- [ ] spelling check | ||
- [ ] pos tagging | ||
- [ ] translation | ||
- [ ] word_vector | ||
|
||
## USAGE | ||
```bash | ||
# this project uses the pythainlp dependency | ||
# - Trie data structure | ||
# - newmm (experimental) | ||
|
||
pip install -r requirements.txt | ||
# or pip install pythainlp | ||
``` | ||
|
||
```python | ||
from shannlp import word_tokenize | ||
|
||
# start measure execute time | ||
# start = time.time() | ||
|
||
# # Example usage | ||
input_text = "တိူၵ်ႈသွၼ်လိၵ်ႈသင်ၶၸဝ်ႈ တီႈဝဵင်းမိူင်းၶၢၵ်ႇ တႄႇပိုတ်ႇသွၼ်ႁဵၼ်းလိၵ်ႈ ပဵၼ်ပွၵ်ႈၵမ်းႁႅၵ်း မီးသင်ၶၸဝ်ႈ မႃးႁဵၼ်း 56 တူၼ်။" | ||
|
||
# default tokenizer engine="mm" (maximal_matching) | ||
print(word_tokenize(input_text)) | ||
|
||
# end measure execute time | ||
# end = time.time() | ||
# print(end - start) | ||
|
||
# output | ||
# ['တိူၵ်ႈ', 'သွၼ်လိၵ်ႈ', 'သင်ၶ', 'ၸဝ်ႈ', ' ', 'တီႈ', 'ဝဵင်း', 'မိူင်းၶၢၵ်ႇ', ' ', 'တႄႇ', 'ပိုတ်ႇ', 'သွၼ်', 'ႁဵၼ်းလိၵ်ႈ', ' ', 'ပဵၼ်', 'ပွၵ်ႈ', 'ၵမ်း', 'ႁႅၵ်း', ' ', 'မီး', 'သင်ၶ', 'ၸဝ်ႈ', ' ', 'မႃး', 'ႁဵၼ်း', ' ', '56', ' ', 'တူၼ်', '။'] | ||
# 0.7220799922943115 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
"""Example: tokenize a Shan sentence with the default engine and time it."""
import time
from shannlp import Tokenizer, word_tokenize

# Start the wall-clock timer before any work happens.
start = time.time()

# Sample Shan sentence (news excerpt).
input_text = "တိူၵ်ႈသွၼ်လိၵ်ႈသင်ၶၸဝ်ႈ တီႈဝဵင်းမိူင်းၶၢၵ်ႇ တႄႇပိုတ်ႇသွၼ်ႁဵၼ်းလိၵ်ႈ ပဵၼ်ပွၵ်ႈၵမ်းႁႅၵ်း မီးသင်ၶၸဝ်ႈ မႃးႁဵၼ်း 56 တူၼ်။"

# NOTE(review): this instance is created but never used below —
# word_tokenize is called with its default engine instead. Confirm intent.
tokenizer = Tokenizer()

# Tokenize with the default engine and show the resulting word list.
print(word_tokenize(input_text))

# Report elapsed wall-clock time in seconds.
end = time.time()
print(end - start)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pythainlp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from shannlp.tokenize import ( | ||
Tokenizer, | ||
word_tokenize | ||
) | ||
|
||
# Shan script character inventory (Myanmar Unicode block plus Shan
# extensions).  Vowels, tones and punctuation are spelled with \u escapes
# because several of these code points are combining marks.
shan_consonants = "ၵၷၶꧠငၸၹသၺတၻထၼꧣပၽၾပၿႀမယရ႟လꩮဝႁဢ"
shan_vowels = "\u1083\u1062\u1084\u1085\u1031\u1035\u102d\u102e\u102f\u1030\u1086\u1082\u103a\u103d\u103b\u103c"
shan_tone = "\u1087\u1088\u1038\u1089\u108a"
shan_punctuations = "\u104a\u104b\ua9e6"

# Aggregate sets, built by plain string concatenation.
shan_letters = shan_consonants + shan_vowels + shan_tone + shan_punctuations
shan_digits = "႐႑႒႓႔႕႖႗႘႙"

shan_characters = shan_letters + shan_digits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
# Names exported via ``from shannlp.corpus import *``.
# ``shan_stopwords`` is a public getter like the others and belongs here.
__all__ = [
    "countries",
    "get_corpus",
    "provinces",
    "shan_female_names",
    "shan_male_names",
    "shan_words",
    "shan_stopwords",
    "shan_character",
    "shan_all_corpus",
]
|
||
import os | ||
from typing import FrozenSet, List, Union | ||
from shannlp.tools import get_shannlp_path | ||
|
||
# Directory (inside the installed package) that holds the corpus text files.
_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_shannlp_path(), _CORPUS_DIRNAME)

# Each corpus has a lazily-populated module-level cache (starts empty, filled
# on the first call of its getter function) and the file it is read from.
_COUNTRIES = set()
_COUNTRIES_FILENAME = "countries_shn.txt"

_SHAN_PROVINCES = set()
_SHAN_PROVINCES_FILENAME = "shan_state_provinces.txt"

_PERSON_FEMALE_NAMES = set()
_PERSON_FEMALE_NAMES_FILENAME = "person_names_female_shn.txt"
_PERSON_MALE_NAMES = set()
_PERSON_MALE_NAMES_FILENAME = "person_names_male_shn.txt"

_SHAN_WORDS = set()
_SHAN_WORDS_FILENAME = "words_shn.txt"
_SHAN_STOPWORDS = set()
_SHAN_STOPWORDS_FILENAME = "stopwords_shn.txt"

_SHAN_CHARACTER = set()
_SHAN_CHARACTER_FILENAME = "shan_character.txt"

# Cache for the union of every corpus (see shan_all_corpus()).
_SHAN_ALL_C = set()
|
||
|
||
def corpus_path() -> str:
    """Return the absolute path of the bundled corpus directory."""
    return _CORPUS_PATH
|
||
|
||
def path_shannlp_corpus(filename: str) -> str:
    """Return the absolute path of *filename* inside the corpus directory."""
    base = corpus_path()
    return os.path.join(base, filename)
|
||
|
||
def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
    """Load one corpus file and return its entries.

    :param filename: name of a file inside the corpus directory
    :param as_is: when True, return the raw lines (order and blank lines
        preserved) as a list; otherwise strip each line, drop empty ones,
        and return a frozenset
    :return: list of raw lines, or frozenset of cleaned entries
    """
    path = path_shannlp_corpus(filename)
    # utf-8-sig transparently drops a leading BOM if the file has one.
    with open(path, "r", encoding="utf-8-sig") as fh:
        lines = fh.read().splitlines()

    if as_is:
        return lines

    # Removed a dead `lines = []` initialization that was immediately
    # overwritten by the read above.
    stripped = (line.strip() for line in lines)
    return frozenset(filter(None, stripped))
|
||
|
||
def get_m_corpus(
    filenames: List[str], as_is: bool = False
) -> Union[frozenset, List[str]]:
    """Load several corpus files and merge their entries.

    :param filenames: corpus file names to read, in order
    :param as_is: when True, return the raw lines of every file as a single
        list; otherwise strip lines, drop empty ones, and return a frozenset
    :return: merged list of raw lines, or frozenset of cleaned entries
    """
    merged: List[str] = []
    for name in filenames:
        with open(path_shannlp_corpus(name), "r", encoding="utf-8-sig") as fh:
            content = fh.read().splitlines()
        if as_is:
            merged.extend(content)
        else:
            cleaned = (line.strip() for line in content)
            merged.extend(entry for entry in cleaned if entry)

    return merged if as_is else frozenset(merged)
|
||
|
||
def countries() -> FrozenSet[str]:
    """Return country names in Shan, loaded lazily and cached."""
    global _COUNTRIES
    if _COUNTRIES:
        return _COUNTRIES
    _COUNTRIES = get_corpus(_COUNTRIES_FILENAME)
    return _COUNTRIES
|
||
|
||
def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
    """Return Shan State province names, loaded lazily and cached.

    NOTE(review): ``details`` is currently unused — the same frozenset is
    returned either way; confirm whether a detailed variant was intended.
    """
    global _SHAN_PROVINCES
    if _SHAN_PROVINCES:
        return _SHAN_PROVINCES
    _SHAN_PROVINCES = get_corpus(_SHAN_PROVINCES_FILENAME)
    return _SHAN_PROVINCES
|
||
|
||
def shan_words() -> FrozenSet[str]:
    """Return the main Shan word list, loaded lazily and cached."""
    global _SHAN_WORDS
    if _SHAN_WORDS:
        return _SHAN_WORDS
    _SHAN_WORDS = get_corpus(_SHAN_WORDS_FILENAME)
    return _SHAN_WORDS
|
||
|
||
def shan_stopwords() -> FrozenSet[str]:
    """Return Shan stopwords, loaded lazily and cached."""
    global _SHAN_STOPWORDS
    if _SHAN_STOPWORDS:
        return _SHAN_STOPWORDS
    _SHAN_STOPWORDS = get_corpus(_SHAN_STOPWORDS_FILENAME)
    return _SHAN_STOPWORDS
|
||
|
||
def shan_female_names() -> FrozenSet[str]:
    """Return Shan female person names, loaded lazily and cached."""
    global _PERSON_FEMALE_NAMES
    if _PERSON_FEMALE_NAMES:
        return _PERSON_FEMALE_NAMES
    _PERSON_FEMALE_NAMES = get_corpus(_PERSON_FEMALE_NAMES_FILENAME)
    return _PERSON_FEMALE_NAMES
|
||
|
||
def shan_male_names() -> FrozenSet[str]:
    """Return Shan male person names, loaded lazily and cached."""
    global _PERSON_MALE_NAMES
    if _PERSON_MALE_NAMES:
        return _PERSON_MALE_NAMES
    _PERSON_MALE_NAMES = get_corpus(_PERSON_MALE_NAMES_FILENAME)
    return _PERSON_MALE_NAMES
|
||
|
||
def shan_character() -> FrozenSet[str]:
    """Return the Shan character set corpus, loaded lazily and cached."""
    global _SHAN_CHARACTER
    if _SHAN_CHARACTER:
        return _SHAN_CHARACTER
    _SHAN_CHARACTER = get_corpus(_SHAN_CHARACTER_FILENAME)
    return _SHAN_CHARACTER
|
||
|
||
def shan_all_corpus() -> FrozenSet[str]:
    """Return the union of every bundled corpus, loaded lazily and cached."""
    global _SHAN_ALL_C
    if _SHAN_ALL_C:
        return _SHAN_ALL_C
    # Merge every corpus file in one pass via get_m_corpus.
    _SHAN_ALL_C = get_m_corpus(
        [
            _COUNTRIES_FILENAME,
            _SHAN_PROVINCES_FILENAME,
            _SHAN_WORDS_FILENAME,
            _SHAN_STOPWORDS_FILENAME,
            _PERSON_FEMALE_NAMES_FILENAME,
            _PERSON_MALE_NAMES_FILENAME,
            _SHAN_CHARACTER_FILENAME,
        ]
    )
    return _SHAN_ALL_C
Oops, something went wrong.