Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
NoerNova committed Jun 6, 2023
0 parents commit 6326938
Show file tree
Hide file tree
Showing 26 changed files with 21,653 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.idea/
.vscode/
venv/

.DS_Store
**/__pycache__
52 changes: 52 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# ShanNLP: Shan Natural Language Processing
**experimental project inspired by [PythaiNLP](https://github.com/PyThaiNLP/pythainlp)**

## Current State
- [ ] corpus dictionary words: 19,904 words (about 60% covered; more need to be collected)

## Word Tokenization method
- [x] maximal_matching
- [x] pythainlp (newmm)

## TODO
- [ ] mining more shan words, poem
- [ ] experiment more method to tokenize
- [ ] word tokenize
  - [ ] sentence tokenize
- [ ] subword_tokenize
- [ ] tokenize with deep learning
- [ ] spelling check
- [ ] pos tagging
- [ ] translation
- [ ] word_vector

## USAGE
```python
# this project uses the pythainlp dependency
# - Trie data structure
# - newmm (experimental)

pip install -r requirement.txt
# or pip install pythainlp
```

```python
from shannlp import word_tokenize

# start measure execute time
# start = time.time()

# # Example usage
input_text = "တိူၵ်ႈသွၼ်လိၵ်ႈသင်ၶၸဝ်ႈ တီႈဝဵင်းမိူင်းၶၢၵ်ႇ တႄႇပိုတ်ႇသွၼ်ႁဵၼ်းလိၵ်ႈ ပဵၼ်ပွၵ်ႈၵမ်းႁႅၵ်း မီးသင်ၶၸဝ်ႈ မႃးႁဵၼ်း 56 တူၼ်။"

# default tokenizer engine="mm" (maximal_matching)
print(word_tokenize(input_text))

# end measure execute time
# end = time.time()
# print(end - start)

# output
# ['တိူၵ်ႈ', 'သွၼ်လိၵ်ႈ', 'သင်ၶ', 'ၸဝ်ႈ', ' ', 'တီႈ', 'ဝဵင်း', 'မိူင်းၶၢၵ်ႇ', ' ', 'တႄႇ', 'ပိုတ်ႇ', 'သွၼ်', 'ႁဵၼ်းလိၵ်ႈ', ' ', 'ပဵၼ်', 'ပွၵ်ႈ', 'ၵမ်း', 'ႁႅၵ်း', ' ', 'မီး', 'သင်ၶ', 'ၸဝ်ႈ', ' ', 'မႃး', 'ႁဵၼ်း', ' ', '56', ' ', 'တူၼ်', '။']
# 0.7220799922943115
```
16 changes: 16 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import time
from shannlp import Tokenizer, word_tokenize

# Measure wall-clock time of the tokenization below.
# perf_counter() is the recommended clock for interval timing:
# it is monotonic and has higher resolution than time.time().
start = time.perf_counter()

# Example usage: a Shan-language sentence to tokenize.
input_text = "တိူၵ်ႈသွၼ်လိၵ်ႈသင်ၶၸဝ်ႈ တီႈဝဵင်းမိူင်းၶၢၵ်ႇ တႄႇပိုတ်ႇသွၼ်ႁဵၼ်းလိၵ်ႈ ပဵၼ်ပွၵ်ႈၵမ်းႁႅၵ်း မီးသင်ၶၸဝ်ႈ မႃးႁဵၼ်း 56 တူၼ်။"

# NOTE(review): this Tokenizer instance is constructed but never used below —
# presumably meant to demonstrate a custom-dictionary tokenizer; confirm intent.
tokenizer = Tokenizer()

# word_tokenize uses the default engine (maximal matching).
print(word_tokenize(input_text))

end = time.perf_counter()
print(end - start)
1 change: 1 addition & 0 deletions requirement.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pythainlp
14 changes: 14 additions & 0 deletions shannlp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from shannlp.tokenize import (
    Tokenizer,
    word_tokenize
)

# Character classes of the Shan script used throughout the package.
# NOTE(review): "ပ" appears twice in the consonant string — possibly a typo;
# confirm against the intended Shan consonant inventory.
shan_consonants = "ၵၷၶꧠငၸၹသၺတၻထၼꧣပၽၾပၿႀမယရ႟လꩮဝႁဢ"
# Vowel signs and medials, written as explicit code-point escapes.
shan_vowels = "\u1083\u1062\u1084\u1085\u1031\u1035\u102d\u102e\u102f\u1030\u1086\u1082\u103a\u103d\u103b\u103c"
# Tone marks.
shan_tone = "\u1087\u1088\u1038\u1089\u108a"
# Sentence punctuation marks.
shan_punctuations = "\u104a\u104b\ua9e6"

# Full letter inventory: simple concatenation preserves the ordering
# of the individual character classes above.
shan_letters = shan_consonants + shan_vowels + shan_tone + shan_punctuations
# Shan digits zero through nine (U+1090–U+1099).
shan_digits = "႐႑႒႓႔႕႖႗႘႙"

# Every character the package treats as part of the Shan script.
shan_characters = shan_letters + shan_digits
154 changes: 154 additions & 0 deletions shannlp/corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
__all__ = [
    "countries",
    "get_corpus",
    "provinces",
    "shan_female_names",
    "shan_male_names",
    "shan_stopwords",
    "shan_words",
    "shan_character",
    "shan_all_corpus",
]

import os
from typing import FrozenSet, List, Union
from shannlp.tools import get_shannlp_path

# Directory (relative to the package root) that holds the corpus text files.
_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_shannlp_path(), _CORPUS_DIRNAME)

# Lazily-populated caches: each starts empty and is filled on first access
# by the corresponding accessor function below.
_COUNTRIES = set()
_COUNTRIES_FILENAME = "countries_shn.txt"

_SHAN_PROVINCES = set()
_SHAN_PROVINCES_FILENAME = "shan_state_provinces.txt"

_PERSON_FEMALE_NAMES = set()
_PERSON_FEMALE_NAMES_FILENAME = "person_names_female_shn.txt"
_PERSON_MALE_NAMES = set()
_PERSON_MALE_NAMES_FILENAME = "person_names_male_shn.txt"

_SHAN_WORDS = set()
_SHAN_WORDS_FILENAME = "words_shn.txt"
_SHAN_STOPWORDS = set()
_SHAN_STOPWORDS_FILENAME = "stopwords_shn.txt"

_SHAN_CHARACTER = set()
_SHAN_CHARACTER_FILENAME = "shan_character.txt"

# Cache for the union of all corpora (see shan_all_corpus()).
_SHAN_ALL_C = set()


def corpus_path() -> str:
    """Return the absolute path of the bundled corpus directory."""
    return _CORPUS_PATH


def path_shannlp_corpus(filename: str) -> str:
    """Return the full path of *filename* inside the corpus directory."""
    return os.path.join(corpus_path(), filename)


def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
    """Load a corpus file and return its lines.

    :param filename: name of a file inside the corpus directory.
    :param as_is: when True, return the raw lines as a list;
        otherwise strip each line, drop empties, and return a frozenset.
    :return: list of raw lines, or frozenset of cleaned entries.
    """
    corpus_file = path_shannlp_corpus(filename)
    # utf-8-sig transparently strips a leading BOM if present.
    with open(corpus_file, "r", encoding="utf-8-sig") as fh:
        raw_lines = fh.read().splitlines()

    if as_is:
        return raw_lines

    stripped = (line.strip() for line in raw_lines)
    return frozenset(line for line in stripped if line)


def get_m_corpus(
    filenames: List[str], as_is: bool = False
) -> Union[frozenset, List[str]]:
    """Load several corpus files and merge their lines.

    :param filenames: names of files inside the corpus directory.
    :param as_is: when True, return every raw line as one list;
        otherwise strip lines, drop empties, and return a frozenset.
    :return: merged list of raw lines, or frozenset of cleaned entries.
    """
    collected = []
    for name in filenames:
        # utf-8-sig transparently strips a leading BOM if present.
        with open(path_shannlp_corpus(name), "r", encoding="utf-8-sig") as fh:
            content = fh.read().splitlines()

        if as_is:
            collected.extend(content)
        else:
            cleaned = (line.strip() for line in content)
            collected.extend(line for line in cleaned if line)

    return collected if as_is else frozenset(collected)


def countries() -> FrozenSet[str]:
    """Return country names in Shan, loading the corpus on first use."""
    global _COUNTRIES
    if _COUNTRIES:
        return _COUNTRIES
    _COUNTRIES = get_corpus(_COUNTRIES_FILENAME)
    return _COUNTRIES


def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
    """Return Shan State province names, loading the corpus on first use.

    :param details: accepted but currently ignored — the same frozenset is
        returned either way. NOTE(review): presumably intended to mirror
        PyThaiNLP's ``provinces(details=...)`` and return detailed records
        when True — confirm and implement or remove.
    :return: frozenset of province-name strings.
    """
    global _SHAN_PROVINCES
    if not _SHAN_PROVINCES:
        _SHAN_PROVINCES = get_corpus(_SHAN_PROVINCES_FILENAME)

    return _SHAN_PROVINCES


def shan_words() -> FrozenSet[str]:
    """Return the Shan word list, loading the corpus on first use."""
    global _SHAN_WORDS
    if _SHAN_WORDS:
        return _SHAN_WORDS
    _SHAN_WORDS = get_corpus(_SHAN_WORDS_FILENAME)
    return _SHAN_WORDS


def shan_stopwords() -> FrozenSet[str]:
    """Return Shan stopwords, loading the corpus on first use."""
    global _SHAN_STOPWORDS
    if _SHAN_STOPWORDS:
        return _SHAN_STOPWORDS
    _SHAN_STOPWORDS = get_corpus(_SHAN_STOPWORDS_FILENAME)
    return _SHAN_STOPWORDS


def shan_female_names() -> FrozenSet[str]:
    """Return female person names in Shan, loading the corpus on first use."""
    global _PERSON_FEMALE_NAMES
    if _PERSON_FEMALE_NAMES:
        return _PERSON_FEMALE_NAMES
    _PERSON_FEMALE_NAMES = get_corpus(_PERSON_FEMALE_NAMES_FILENAME)
    return _PERSON_FEMALE_NAMES


def shan_male_names() -> FrozenSet[str]:
    """Return male person names in Shan, loading the corpus on first use."""
    global _PERSON_MALE_NAMES
    if _PERSON_MALE_NAMES:
        return _PERSON_MALE_NAMES
    _PERSON_MALE_NAMES = get_corpus(_PERSON_MALE_NAMES_FILENAME)
    return _PERSON_MALE_NAMES


def shan_character() -> FrozenSet[str]:
    """Return the Shan character set, loading the corpus on first use."""
    global _SHAN_CHARACTER
    if _SHAN_CHARACTER:
        return _SHAN_CHARACTER
    _SHAN_CHARACTER = get_corpus(_SHAN_CHARACTER_FILENAME)
    return _SHAN_CHARACTER


def shan_all_corpus() -> FrozenSet[str]:
    """Return the union of every bundled corpus, loading files on first use."""
    global _SHAN_ALL_C
    if _SHAN_ALL_C:
        return _SHAN_ALL_C
    # Merge all corpus files into one frozenset of unique entries.
    source_files = [
        _COUNTRIES_FILENAME,
        _SHAN_PROVINCES_FILENAME,
        _SHAN_WORDS_FILENAME,
        _SHAN_STOPWORDS_FILENAME,
        _PERSON_FEMALE_NAMES_FILENAME,
        _PERSON_MALE_NAMES_FILENAME,
        _SHAN_CHARACTER_FILENAME,
    ]
    _SHAN_ALL_C = get_m_corpus(source_files)
    return _SHAN_ALL_C
Loading

0 comments on commit 6326938

Please sign in to comment.