-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpreprocessing.py
84 lines (75 loc) · 2.76 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
def text_preprocessing(
text,
remove_stopwords=False,
stem_words=False,
stopwords_addition=[],
stopwords_exclude=[],
HYPHEN_HANDLE=1
):
"""
convert string text to lower and split into words.
most punctuations are handled by replacing them with empty string.
some punctuations are handled differently based on their occurences in the data.
- replaced with ' '
few peculiar cases for better uniformity.
'non*' replaced with 'non *'
few acronyms identified
SCID Severe Combined ImmunoDeficiency
ADA Adenosine DeAminase
PNP Purine Nucleoside Phosphorylase
LFA-1 Lymphocyte Function Antigen-1
"""
text = text.lower().split()
if remove_stopwords:
stops = list(set(stopwords.words('english')) -
set(stopwords_exclude)) + stopwords_addition
text = [w for w in text if w not in stops]
text = " ".join(text)
text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=_]", " ", text)
text = re.sub(r"(that|what)(\'s)", r"\g<1> is", text)
text = re.sub(r"i\.e\.", "that is", text)
text = re.sub(r"(^non| non)", r"\g<1> ", text)
text = re.sub(r"(^anti| anti)", r"\g<1> ", text)
text = re.sub(r"\'s", " ", text)
text = re.sub(r"\'ve", " have ", text)
text = re.sub(r"can't", "cannot ", text)
text = re.sub(r"n't", " not ", text)
text = re.sub(r"i'm", "i am ", text)
text = re.sub(r"\'re", " are ", text)
text = re.sub(r"\'d", " would ", text)
text = re.sub(r"\'ll", " will ", text)
text = re.sub(r",", " ", text)
text = re.sub(r"\.", " ", text)
text = re.sub(r"!", " ! ", text)
text = re.sub(r"\/", " ", text)
text = re.sub(r"\^", " ^ ", text)
text = re.sub(r"\+", " + ", text)
if HYPHEN_HANDLE == 1:
text = re.sub(r"\-", "-", text)
elif HYPHEN_HANDLE == 2:
text = re.sub(r"\-", " - ", text)
text = re.sub(r"\=", " = ", text)
text = re.sub(r"'", " ", text)
re.sub(r"(\d+)(k)", r"\g<1>000", text)
text = re.sub(r":", " : ", text)
text = re.sub(r";", " ; ", text)
text = re.sub(r" e g ", " eg ", text)
text = re.sub(r" b g ", " bg ", text)
text = re.sub(r"e - mail", "email", text)
text = re.sub(r"\s{2,}", " ", text)
text = text.lower().split()
if remove_stopwords:
stops = list(set(stopwords.words('english')) -
set(stopwords_exclude)) + stopwords_addition
text = [w for w in text if w not in stops]
text = " ".join(text)
if stem_words:
text = text.split()
stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(word) for word in text]
text = " ".join(stemmed_words)
# Return a list of words
return(text)