-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilities.py
109 lines (98 loc) · 3.79 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re, string
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def convert_to_lower(text):
    """Return a lowercased copy of *text*.

    Delegates to the ``lower()`` method of *text*.
    """
    lowered = text.lower()
    return lowered
def remove_emojis(text):
    """Strip mentions, links, ``<...>`` tags and emoji-like characters.

    Three passes are applied in order:

    1. ``@...`` mentions and ``http://`` / ``https://`` links (up to the next
       whitespace) are deleted.
    2. Anything shaped like ``<...>`` on a single line is deleted.
    3. Every run of characters from the ranges below is replaced by ONE space.

    NOTE(review): the range U+24C2..U+1F251 matches every code point in
    between, which includes non-emoji scripts (e.g. CJK ideographs), and
    U+10000..U+10FFFF matches every supplementary-plane character. Confirm
    this breadth is intended before using the function on non-Latin text.
    """
    # Pass 1: mentions and links.
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", text)
    # Pass 2: non-greedy, so each <...> span is removed separately.
    without_tags = re.sub(r"<.*?>", "", without_links)

    # Pass 3: members of one big character class, joined below.
    class_members = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        '\U00010000-\U0010ffff',
        "\u200d",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\u3030",
        "\ufe0f",
        "\u2069",
        "\u2066",
        # U+200C was commented out in the original list, so it is not matched.
        "\u2068",
        "\u2067",
    )
    emoji_run = re.compile("[" + "".join(class_members) + "]+", flags=re.UNICODE)
    return emoji_run.sub(r' ', without_tags)
def remove_numbers(text):
    """Return *text* with every run of digits replaced by a single space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(" ", text)
# Maps every character of ``string.punctuation`` to a space; built once.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
# Two or more consecutive space characters.
_SPACE_RUN = re.compile(r' {2,}')


def remove_punctuation(text):
    """Replace ASCII punctuation with spaces and tidy the resulting spacing.

    Each character in ``string.punctuation`` becomes a space (rather than
    being deleted) so that words joined by punctuation, e.g. ``"a-b"``, stay
    separate. Runs of spaces are then collapsed to one space and leading and
    trailing whitespace is stripped.

    The previous chained ``replace`` calls (4, 3, then 2 spaces) left double
    spaces behind for long runs such as 11 consecutive spaces; the regex
    collapses runs of any length.
    """
    spaced = text.translate(_PUNCT_TO_SPACE)
    return _SPACE_RUN.sub(' ', spaced).strip()
# Stop words dropped by remove_stopwords(): a fixed English list plus a few
# extra tokens (curly quotes, 'amp', 'new', 'covid', 'via', 'us').
# Stored as a frozenset, built once, so each membership test is O(1).
_STOP_WORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours',
    'ourselves', 'you', "you're", "you've", "you'll",
    "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she',
    "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs',
    'themselves', 'what', 'which', 'who',
    'whom', 'this', 'that', "that'll",
    'these', 'those', 'am', 'is',
    'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has',
    'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if',
    'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by',
    'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor',
    'not', 'only', 'own', 'same',
    'so', 'than', 'too', 'very',
    's', 't', 'can', 'will',
    'just', 'don', "don't", 'should',
    "should've", 'now', 'd', 'll', 'm', 'o',
    're', 've', 'y', 'ain',
    'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan',
    "shan't", 'shouldn', "shouldn't", 'wasn',
    "wasn't", 'weren', "weren't", 'won',
    "won't", 'wouldn', "wouldn't",
    # Extra tokens on top of the English list.
    '’', '“', 'amp', 'new', '”', 'covid', 'via', 'us',
])


def remove_stopwords(text):
    """Return *text* with stop-word tokens removed.

    *text* is split on whitespace; every token that is not in the stop-word
    set is kept, and the survivors are joined with single spaces. Matching is
    exact and case-sensitive, so callers should lowercase the text first.
    """
    kept = [token for token in text.split() if token not in _STOP_WORDS]
    return " ".join(kept)
def remove_extra_white_spaces(text):
    """Drop single-letter tokens that are surrounded by whitespace.

    Despite the name, ordinary runs of whitespace are left untouched. What is
    replaced (by one space) is a run of one or more single ASCII letters,
    each preceded by whitespace, together with the whitespace that follows
    the run. A single letter at the very start or end of *text* has no
    whitespace on one side and is therefore kept.

    The pattern matches a whole run of single-letter tokens at once. Matching
    one token at a time consumed the whitespace needed to recognise the next
    one, so ``"x a b c y"`` used to come out as ``"x b y"``.
    """
    single_char_run = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_run, repl=" ", string=text)
def preprocessText(text):
    """Run the full cleaning pipeline on *text* and return the result.

    The steps are applied in this order, each receiving the previous output:
    lowercase, strip mentions/links/tags/emojis, strip digits, strip
    punctuation, drop stop words, drop isolated single letters.
    """
    pipeline = (
        convert_to_lower,
        remove_emojis,
        remove_numbers,
        remove_punctuation,
        remove_stopwords,
        remove_extra_white_spaces,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned