-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathuyghur.py
66 lines (51 loc) · 1.77 KB
/
uyghur.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
class Uyghur:
    """Character-level codec for Uyghur text written in Latin script.

    The vocabulary consists of the three special tokens ``<pad>``, ``<sos>``
    and ``<eos>`` (indices 0, 1 and 2, in that order) followed by every
    character of ``uyghur_latin`` in the order it appears in that string.
    """

    # Characters rewritten before encoding: the listed punctuation marks become
    # a space, and the ASCII apostrophe becomes the typographic apostrophe that
    # is part of the alphabet.
    _PUNCT_TABLE = str.maketrans(
        {"-": " ", ",": " ", ".": " ", "!": " ", "?": " ", "'": "’"}
    )
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        """Build the vocabulary list and the reverse character-to-index map."""
        self.uyghur_latin = "abcdefghijklmnopqrstuvwxyz éöü’"
        # The padding token is placed first so that its index is 0.
        self._vocab_list = [
            self.pad_char,
            self.sos_char,
            self.eos_char,
        ] + list(self.uyghur_latin)
        self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)}

    def encode(self, s):
        """Convert text into a list of vocabulary indices.

        The text is normalised first: ``- , . ! ?`` become spaces, ``'``
        becomes ``’``, the text is lower-cased, characters outside the
        alphabet are dropped, and runs of whitespace are collapsed to a single
        space with leading/trailing whitespace removed.

        Args:
            s: The input string.

        Returns:
            A list of integer indices, one per kept character. No ``<sos>`` or
            ``<eos>`` index is added.
        """
        s = s.translate(self._PUNCT_TABLE)
        s = self._WHITESPACE_RE.sub(" ", s).lower()
        kept = "".join(ch for ch in s if ch in self._vocab2idx)
        # Collapse and strip only after filtering: removing an out-of-vocabulary
        # token such as the "1" in "a 1 b" would otherwise leave two adjacent
        # spaces (or a leading/trailing one) in the encoded sequence.
        kept = self._WHITESPACE_RE.sub(" ", kept).strip()
        return [self.vocab_to_idx(ch) for ch in kept]

    def decode(self, seq):
        """Convert a sequence of vocabulary indices back into text.

        Decoding stops at the first ``<pad>`` or ``<eos>`` index; ``<sos>``
        indices are skipped. Runs of whitespace in the result are collapsed to
        a single space and the result is stripped.

        Args:
            seq: An iterable of vocabulary indices.

        Returns:
            The decoded string.

        Raises:
            IndexError: If an index is out of range for the vocabulary list.
        """
        # Read the special indices once instead of on every iteration.
        pad_idx, sos_idx, eos_idx = self.pad_idx, self.sos_idx, self.eos_idx
        vocabs = []
        for idx in seq:
            if idx == pad_idx or idx == eos_idx:
                break
            if idx == sos_idx:
                continue
            vocabs.append(self.idx_to_vocab(idx))
        return self._WHITESPACE_RE.sub(" ", "".join(vocabs)).strip()

    def vocab_to_idx(self, vocab):
        """Return the index of ``vocab``; raises ``KeyError`` if it is unknown."""
        return self._vocab2idx[vocab]

    def idx_to_vocab(self, idx):
        """Return the vocabulary entry stored at position ``idx``."""
        return self._vocab_list[idx]

    def vocab_list(self):
        """Return the vocabulary list (special tokens first, then the alphabet)."""
        return self._vocab_list

    @property
    def vocab_size(self):
        """Number of vocabulary entries, special tokens included."""
        return len(self._vocab_list)

    @property
    def pad_idx(self):
        """Index of the padding token."""
        return self.vocab_to_idx(self.pad_char)

    @property
    def sos_idx(self):
        """Index of the start-of-sequence token."""
        return self.vocab_to_idx(self.sos_char)

    @property
    def eos_idx(self):
        """Index of the end-of-sequence token."""
        return self.vocab_to_idx(self.eos_char)

    @property
    def pad_char(self):
        """The padding token string."""
        return "<pad>"

    @property
    def sos_char(self):
        """The start-of-sequence token string."""
        return "<sos>"

    @property
    def eos_char(self):
        """The end-of-sequence token string."""
        return "<eos>"
# Module-level Uyghur codec instance, created when this module is imported.
uyghur_latin = Uyghur()