# -*- coding: utf-8 -*-
"""Preprocessing_Data.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/10Kb5mKa0cJfdgwI1dr_st3gbpxGGUWkF
"""
import nltk
import pandas as pd
import numpy as np
# nlp imports
import re
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
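# The tokenizer, stopword list, and WordNet lemmatizer all rely on NLTK data
# packages. Fetching them up front is a convenience sketch that assumes
# network access; skip these calls if the corpora are already installed.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)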
# Code for text lowercasing
def lower_casing_text(text):
    # lower() converts every uppercase letter in the string to lowercase.
    Formatted_Text = text.lower()
    return Formatted_Text
# Remove email addresses from text
def remove_emailFormate(text):
    # Strip anything shaped like user@domain.tld, not just .com addresses.
    Formatted_Text = re.sub(r'\S+@\w+\.\w+', '', text)
    return Formatted_Text
# Replace curly apostrophes with straight ones so contraction lookup matches
def replace_comma(text):
    Formatted_Text = text.replace('’', "'")
    return Formatted_Text
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}
# The code for expanding contraction words
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand shortened words to their full form,
    e.g. "don't" to "do not".
    """
    # Tokenize the text on single spaces.
    list_Of_tokens = text.split(' ')
    # Look each token up in the contraction map; tokens that appear as keys
    # are replaced by their expansion, all other tokens are kept unchanged.
    list_Of_tokens = [contraction_mapping.get(Word, Word) for Word in list_Of_tokens]
    # Convert the list of tokens back to a single string.
    String_Of_tokens = ' '.join(list_Of_tokens)
    return String_Of_tokens
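# A quick sanity check (hypothetical sample text, not from the dataset):
#   expand_contractions("i can't go because it's late")
#   -> "i cannot go because it is late"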
# The code for removing bracketed content, e.g. citation markers like [1]
def removing_Num_between_brackets(text):
    # Drop anything enclosed in square brackets (non-greedy match).
    Formatted_Text = re.sub(r'\[.*?\]', '', text)
    return Formatted_Text
# The code for removing special characters
def removing_special_characters(text):
    """Remove all special characters except those kept in the regex class
    below, as they carry meaning in the text provided.
    """
    # Collapse every run of disallowed characters into a single space; the
    # kept punctuation (hyphen, parentheses, colon, comma, percent, period,
    # exclamation mark) is frequent in this particular dataset.
    Formatted_Text = re.sub(r"[^-a-zA-Z0-9():,%.!]+", ' ', text)
    return Formatted_Text
# The code for removing stopwords
stoplist = stopwords.words('english')
# Keep 'the' by removing it from the default NLTK stopword list.
stoplist.remove('the')
stoplist = set(stoplist)
def removing_stopwords(text):
    """Remove stopwords, which add little meaning to a sentence and can be
    dropped safely without compromising its meaning.
    """
    # Keep every token whose lowercase form is not in the stoplist.
    No_StopWords = [word for word in word_tokenize(text) if word.lower() not in stoplist]
    # Convert the list of remaining tokens back to a single string.
    words_string = ' '.join(No_StopWords)
    return words_string
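# Example (hypothetical input): removing_stopwords("this is the best day")
# keeps "the" because it was removed from the stoplist above,
# so the result is "the best day".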
# The code for spelling correction
def spelling_correction(text):
    """Correct misspelled English words using autocorrect's Speller."""
    # Build an English speller and apply it to the whole text.
    spell = Speller(lang='en')
    Corrected_text = spell(text)
    return Corrected_text
# The code for lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    """Convert words to their root forms."""
    # Tokenize, then lemmatize each token as a verb ('v') so that
    # inflected verb forms collapse to their base form.
    word_list = word_tokenize(text)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w, 'v') for w in word_list])
    return lemmatized_output
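# A minimal end-to-end sketch showing how these helpers might be chained.
# The sample sentence and the step order are illustrative assumptions, not
# part of the original notebook; reorder the steps to suit your dataset.
if __name__ == '__main__':
    sample = "I can’t wait, it’s [sic] AWESOME !!! Email me at john@example.com"
    processed = lower_casing_text(sample)
    processed = remove_emailFormate(processed)
    processed = replace_comma(processed)
    processed = expand_contractions(processed)
    processed = removing_Num_between_brackets(processed)
    processed = removing_special_characters(processed)
    processed = removing_stopwords(processed)
    processed = spelling_correction(processed)
    processed = lemmatization(processed)
    print(processed)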