-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpersian_token_filters.py
57 lines (38 loc) · 1.51 KB
/
persian_token_filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from hazm import Stemmer, Normalizer, stopwords_list
from search_engine.analyzer.token import Token
from search_engine.analyzer.token_filter import TokenFilter
class RemoveSpaceFilter(TokenFilter):
    """Token filter that strips every ASCII space character from a token's value."""

    def process(self, token):
        """Return a new Token with spaces removed; a None token passes through as None.

        NOTE(review): the original token.length is kept even though the value
        shrinks — presumably length refers to the source-text span; confirm.
        """
        if token is None:
            return None
        compacted = token.value.replace(' ', '')
        return Token(compacted, token.position, token.length)
class PersianStopFileFilter(TokenFilter):
    """Token filter that drops tokens listed in a user-supplied stopword file.

    The file is expected to contain one stopword per line.
    """

    def __init__(self, stop_file_addr):
        """Load the stopword file at *stop_file_addr* into an in-memory lookup.

        The file is read as UTF-8 explicitly: a Persian stopword file would be
        mangled (or raise UnicodeDecodeError) under a non-UTF-8 platform
        default encoding, which `open(..., 'r')` otherwise falls back to.
        """
        with open(stop_file_addr, 'r', encoding='utf-8') as f:
            # A set gives the same O(1) membership test as the previous
            # {word: 1} dict without the dummy values. rstrip('\n') matches
            # the old replace('\n', '') for readlines() output.
            self.stop_dict = {line.rstrip('\n') for line in f}

    def process(self, token):
        """Return the token unchanged, or None if it is a stopword or None."""
        if token is not None and token.value not in self.stop_dict:
            return token
        return None
class PersianStopHazmFilter(TokenFilter):
    """Token filter that drops tokens found in hazm's built-in Persian stopword list."""

    def __init__(self):
        # Membership map built once from hazm's list; the values are a
        # constant 1 — only key membership is ever consulted.
        self.stop_dict = dict.fromkeys(stopwords_list(), 1)

    def process(self, token):
        """Return the token unchanged, or None if it is a stopword or None."""
        if token is None:
            return None
        if token.value in self.stop_dict:
            return None
        return token
class PersianStemFilter(TokenFilter):
    """Token filter that replaces each token's value with its hazm stem."""

    def __init__(self):
        # One Stemmer instance is built up front and reused for every token.
        self.stemmer = Stemmer()

    def process(self, token):
        """Return a new Token carrying the stemmed value; None passes through as None.

        Position and length are copied from the input token unchanged.
        """
        if token is None:
            return None
        stemmed = self.stemmer.stem(token.value)
        return Token(stemmed, token.position, token.length)
class PersianNormalizeFilter(TokenFilter):
    """Token filter that runs each token's value through hazm's Normalizer."""

    def __init__(self):
        # A single Normalizer instance shared across all process() calls.
        self.normalizer = Normalizer()

    def process(self, token):
        """Return a new Token with the normalized value; None passes through as None.

        Position and length are copied from the input token unchanged.
        """
        if token is None:
            return None
        normalized = self.normalizer.normalize(token.value)
        return Token(normalized, token.position, token.length)