summary.py
from typing import List
# import nltk
# nltk.download()
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import SnowballStemmer
import re
import numpy as np
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# TODO quoting problem: e.g.
# "This way, tax-i™ makes legal research many times more complete and efficient.’ Roadmap for tax-i™ tax-i™
# currently contains ..... is considered as one sentence in the algorithm
class Summarize:
    def __init__(self, text: str, lang: str = "english"):
        self.text = text
        self.lang = lang
    @staticmethod
    def sent_tokenizer(text: str) -> List:
        """Tokenize the text into a list of sentences, with some primary processing,
        e.g. ignoring citations.

        :param text: raw text
        :return: a list of sentences
        """
        # TODO possible solutions for abbreviations such as "Rev.", "Ave.", ...:
        # 1. add punkt_param abbreviations manually
        #    https://stackoverflow.com/questions/34805790/how-to-avoid-nltks-sentence-tokenizer-splitting-on-abbreviations
        # 2. use a regex, something like "upper-case letter + several lower-case letters + ."
        # For acronyms like i.e., F.B.I., ..., append a "<" sign so that sent_tokenize will not end sentences there
        acronyms_low = "([a-z][.][a-z][.](?:[a-z][.])?)"
        acronyms_up = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        text = re.sub(acronyms_low, "\\1<", text)
        text = re.sub(acronyms_up, "\\1<", text)
        # use the default pretrained model from nltk, which knows most prefixes and suffixes
        # but not "Rev." and so forth
        tokens = sent_tokenize(text)
        sentences = []
        for token in tokens:
            token = re.sub("<", " ", token)  # eliminate "<" sign(s) in sentences
            # deal with raw input, which may include "\n" and repeated spaces
            token = re.sub("\n", " ", token)
            token = re.sub(" +", " ", token)
            # get rid of ', ", ), ] ... at the beginning
            # TODO '...' he said.
            if token[0] in ['"', "'", ")", "]"]:
                token = token[1:]
            # get rid of citations at the beginning and/or end (1-99)
            # TODO shorter way: anything between "[]"
            if token[0] == "[" and token[2] == "]":
                token = token[3:]
            if token[0] == "[" and token[3] == "]":
                token = token[4:]
            if token[-2] == "]" and token[-4] == "[":
                token = token[:-4] + token[-1]
            if token[-2] == "]" and token[-5] == "[":
                token = token[:-5] + token[-1]
            if token[0] == " ":  # extra space at the beginning
                token = token[1:]
            token = token.strip()
            sentences.append(token)
        return sentences
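
    # Illustrative sketch (not part of the original file): roughly what
    # sent_tokenizer is expected to do, assuming the default punkt model is
    # installed; the exact splits depend on the nltk data version.
    #
    #   Summarize.sent_tokenizer("The F.B.I. opened a case. [3] It was closed.")
    #   # -> ["The F.B.I. opened a case.", "It was closed."]
    #
    # The temporary "<" marker keeps "F.B.I." from ending the first sentence,
    # and the leading "[3]" citation is stripped from the second one.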

    def preprocess(self, sentences: List) -> List:
        """Preprocess the tokenized sentences: 1) remove stopwords, 2) remove
        special characters, 3) remove leading and trailing spaces, 4)
        transform all words to lowercase.

        :param sentences: list of tokenized sentences
        :return: 2D list of words per sentence, e.g. [[w, w, w], [w, w, w]]
        """
        tokens = sentences
        # stopwords, e.g. i, he, she, they, it, 's, 've, 'd ...
        stop_words = stopwords.words(self.lang)
        preprocessed_sentences = []
        for index, s in enumerate(tokens):
            preprocessed_sentences.append([])
            words = word_tokenize(s)
            for word in words:
                word = re.sub("\n", " ", word)
                # remove special characters, not done in Smmry
                # e.g. fig1, $ @ ..., but not Covid-19
                # word = re.sub("[^A-Za-z]+", " ", word)
                word = re.sub(" +", " ", word)
                word = word.strip()
                word = word.lower()  # transform to lowercase
                if word and word not in stop_words:
                    preprocessed_sentences[index].append(word)
        return preprocessed_sentences

    def tag_pos(self, preprocessed_sentences: List, pos=None) -> List:
        """Reference: https://www.nltk.org/book/ch05.html
        Keep relevant words by tagging their parts of speech. Following Smmry, we
        keep nouns (NN), past participle verbs (VBN, considered passive voice), and
        adjectives (JJ).

        :param preprocessed_sentences: list of preprocessed sentences
        :param pos: parts of speech to keep
        :return: 2D list of tagged words per sentence
        """
        if pos is None:
            # based on Smmry: nouns (NN), past participle verbs (VBN, considered passive voice), and adjectives (JJ)
            pos = ["NN", "VBN", "JJ"]
        tagged_words_sentences = []
        for index, sentence in enumerate(preprocessed_sentences):
            tagged_words_sentences.append([])
            for word in sentence:
                # tag the word with its part of speech, e.g. [('macalester', 'NN')]
                word = pos_tag([word])
                if word[0][1] in pos:
                    tagged_words_sentences[index].append(word[0][0])
        return tagged_words_sentences
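
    # Illustrative sketch (not part of the original file): tag_pos tags one word
    # at a time, which typically gives results like
    #
    #   pos_tag(["taxation"])  # -> [("taxation", "NN")]   kept (noun)
    #   pos_tag(["quickly"])   # -> [("quickly", "RB")]    dropped (adverb)
    #
    # Single-word tagging has no sentence context, so the tags are only
    # approximate and depend on the nltk tagger version.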

    def stem(self, tagged_words_sentences: List) -> List:
        """Stem words.

        :param tagged_words_sentences: 2D list of tagged words per sentence
        :return: 2D list of stemmed and tagged words per sentence
        """
        stemmer = SnowballStemmer(self.lang)
        stemmed_sentences = []
        for index, sentence in enumerate(tagged_words_sentences):
            stemmed_sentences.append([])
            for word in sentence:
                word = stemmer.stem(word)
                stemmed_sentences[index].append(word)
        return stemmed_sentences

    @staticmethod
    def build_similarity_matrix(stemmed_tokens: List) -> np.ndarray:
        """Reference: https://monkeylearn.com/blog/what-is-tf-idf/
        Create TF-IDF vectors using sklearn.feature_extraction.text.TfidfVectorizer() and build a pairwise
        similarity matrix with the linear kernel using sklearn.metrics.pairwise.linear_kernel().

        :param stemmed_tokens: list of stemmed words per sentence
        :return: pairwise similarity matrix
        """
        token_strings = [" ".join(sentence) for sentence in stemmed_tokens]
        vectorizer = TfidfVectorizer(min_df=2, max_df=0.50)
        X = vectorizer.fit_transform(token_strings)
        cosine_similarities = linear_kernel(X, X)
        # zero out the self-similarities on the diagonal
        for index1 in range(len(cosine_similarities)):
            for index2 in range(len(cosine_similarities)):
                if index1 == index2:
                    cosine_similarities[index1][index2] = 0
        # normalize each non-zero row so that it sums to 1
        for index in range(len(cosine_similarities)):
            if cosine_similarities[index].sum() == 0:
                continue
            else:
                cosine_similarities[index] /= cosine_similarities[index].sum()
        return cosine_similarities
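
    # Illustrative sketch (not part of the original file): for three sentences
    # the result is a 3x3 matrix with a zeroed diagonal whose non-zero rows each
    # sum to 1, for example (made-up numbers):
    #
    #   [[0.0 , 0.7 , 0.3 ],
    #    [0.55, 0.0 , 0.45],
    #    [0.4 , 0.6 , 0.0 ]]
    #
    # Because of min_df=2, a term must appear in at least two sentences to be
    # counted, so very short inputs can make TfidfVectorizer raise an error.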

    @staticmethod
    def text_rank(
        similarity_matrix: np.ndarray, eps: float = 0.0001, p: float = 0.85
    ) -> np.ndarray:
        """Reference: https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/

        :param similarity_matrix: input similarity matrix
        :param eps: tolerated error between iterations
        :param p: damping (greedy) probability, i.e. 1 - p is the probability of jumping to a random sentence
        :return: np.array of unsorted sentence rank values
        """
        P = np.ones(len(similarity_matrix)) / len(similarity_matrix)
        while True:
            new_P = np.ones(len(similarity_matrix)) * (1 - p) / len(
                similarity_matrix
            ) + p * similarity_matrix.T.dot(P)
            delta = abs(new_P - P).sum()
            if delta <= eps:
                return new_P
            P = new_P
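
    # Illustrative note (not part of the original file): the update above is the
    # standard PageRank-style power iteration
    #
    #   P_new = (1 - p) / N + p * M^T . P
    #
    # where N is the number of sentences and M is the row-normalized similarity
    # matrix; it is repeated until the total change falls below eps.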

    def summarize(self, length: int = 5) -> str:
        """Pipeline:
        1) sent_tokenizer(),
        2) preprocess(),
        3) tag_pos(),
        4) stem(),
        5) build_similarity_matrix(),
        6) text_rank()

        :param length: number of sentences the user wants in the summary
        :return: summarized sentences
        """
        original_sentences = self.sent_tokenizer(self.text)
        preprocessed_tokens = self.preprocess(original_sentences)
        # tag before stemming, because stemming can turn verbs into nouns
        tagged_tokens = self.tag_pos(preprocessed_tokens)
        stemmed_tokens = self.stem(tagged_tokens)
        sentence_ranks = self.text_rank(self.build_similarity_matrix(stemmed_tokens))
        ranked_sentence_indexes = [
            item[0]
            for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])
        ]
        # keep the selected sentences in their original (chronological) order
        selected_sentences = sorted(ranked_sentence_indexes[:length])
        summary = itemgetter(*selected_sentences)(original_sentences)
        if isinstance(summary, str):
            # itemgetter returns a bare string when only one sentence is selected
            summary = (summary,)
        # a blank line between each output sentence
        out_summary = "\n\n".join(summary)
        return out_summary
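

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. It assumes the nltk
    # data packages "punkt", "stopwords", and "averaged_perceptron_tagger" have
    # already been downloaded (e.g. nltk.download("punkt")). The sample text is
    # made up and only needs enough repeated terms for min_df=2 to keep a
    # non-empty vocabulary.
    sample_text = (
        "Text summarization shortens long documents automatically. "
        "Extractive summarization selects existing sentences from the document. "
        "The selected sentences are scored by a graph-based ranking algorithm. "
        "The ranking algorithm scores each sentence by its similarity to the other sentences. "
        "Finally the highest scoring sentences are returned in their original order. "
        "A short document may not need summarization at all."
    )
    summarizer = Summarize(sample_text)
    print(summarizer.summarize(length=2))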