-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathacronym_extract_github_code.py
243 lines (212 loc) · 12.2 KB
/
acronym_extract_github_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# -*- coding: utf-8 -*-
## CDIPS 2015 Workshop Project
## Team: Acronyms
## Members: Hong Ding, Philipp Dumitrescu, Herman Leung
## July 11 - Aug 1, 2015
'''
This code takes a (large text) string and extracts acronyms along with their definitions,
in either of these forms:
- University of California, Berkeley (UCB)
- UCB (University of California, Berkeley)
There are four possible outputs - see inside function for details.
'''
import re
from collections import Counter
# An acronym is two or more capitals, either bare ("UCB") or with a period
# after every letter ("U.C.B.").
_ACRONYM_FORM = re.compile(r'[A-Z]{2,}|(?:[A-Z]\.){2,}')

# A word ending in one of these characters closes a previous sentence or
# parenthetical, so it cannot be the first word of a definition. The backslash
# is kept because the original character list contained it.
_DEFINITION_END_CHARS = '.!?")\\;:'

# How many extra (function) words a definition may hold beyond one word per
# acronym letter.
_EXTRA_WORDS = 4

# Number of words kept on each side of the acronym when surrounding='T'
# (so at most 2000 words in total).
_CONTEXT_RADIUS = 1000


def _tally(items):
    """Return ``[(item, count), ...]`` in first-seen order."""
    return list(Counter(items).items())


def _bare_acronym(candidate):
    """Return *candidate* without periods, or None if it is not an acronym."""
    if _ACRONYM_FORM.fullmatch(candidate):
        return candidate.replace('.', '')
    return None


def _initials_in_order(acronym, definition):
    """Tell whether the acronym's letters occur, in order, in *definition*."""
    remaining = iter(definition)
    # ``letter in remaining`` consumes the iterator up to the match, which
    # enforces the left-to-right ordering (case-sensitive).
    return all(letter in remaining for letter in acronym)


def _definition_before(words, i, acronym):
    """Find the long form written just before the parenthesised ``words[i]``.

    Returns the definition string, or None when nothing suitable precedes
    the acronym.
    """
    acro_len = len(acronym)
    if acro_len > i:
        # Not enough words in front of the acronym to spell it out.
        return None
    start = i - acro_len

    # Fast path: the words immediately before the acronym spell it exactly.
    preceding = [word.replace('"', '') for word in words[start:i]]
    if ''.join(word[:1] for word in preceding).upper() == acronym:
        return ' '.join(preceding)

    # Otherwise walk backwards over up to _EXTRA_WORDS more words, looking for
    # one that starts with the acronym's first letter.
    lowest = max(start - _EXTRA_WORDS, 0)
    for j in range(start, lowest - 1, -1):
        word = words[j].replace('"', '')
        if not word or word[-1] in _DEFINITION_END_CHARS:
            continue
        if word[0] != acronym[0]:
            continue
        if '(' not in ' '.join(words[j:i]):
            return ' '.join(words[j:i])
        # The span contains another parenthetical: start the definition at the
        # word following a closing parenthesis, if its first letter matches.
        for k in range(start, i):
            closer = words[k].replace('"', '')
            if closer.endswith(')') and words[k + 1][0] == acronym[0]:
                return ' '.join(words[k + 1:i])
    return None


def _definition_after(words, i, acronym):
    """Find the long form written in parentheses right after ``words[i]``.

    Returns the definition with parentheses and double quotes removed, or
    None when the next word does not open a matching parenthetical.
    """
    if i >= len(words) - 1:
        return None
    following = words[i + 1]
    if len(following) < 2 or following[0] != '(' or following[1] != acronym[0]:
        return None
    # The closing parenthesis must appear within this window of words.
    stop = min(i + len(acronym) + _EXTRA_WORDS, len(words))
    for j in range(i + 1, stop):
        if words[j].endswith(')'):
            return re.sub(r'[()"]', '', ' '.join(words[i + 1:j + 1]))
    return None


def extract_acronym(text, search='complex', surrounding='F'):
    """Extract acronyms (and, usually, their definitions) from *text*.

    Two written forms are recognised by every mode except ``'simple'``:

    - ``University of California, Berkeley (UCB)``
    - ``UCB (University of California, Berkeley)``

    In those modes an acronym must:

    1. consist of upper-case letters only, optionally with a period after
       every letter (periods are removed from the returned acronym);
    2. be at least two letters long;
    3. stand in parentheses after its definition, or be followed by its
       definition in parentheses;
    4. have a definition whose first letter is the same capital letter as
       the acronym's first letter, and in which all letters of the acronym
       occur in order (case-sensitive), e.g. ``bachelor of arts (BA)`` is
       not picked up;
    5. have a definition that does not itself contain the acronym.

    A definition written before the acronym has between ``len(acronym)``
    and ``len(acronym) + 4`` words. For a definition written after the
    acronym, the closing parenthesis must occur within the following
    ``len(acronym) + 3`` words.

    Args:
        text: The string to scan.
        search: Output mode.
            ``'complex'`` (default; any unrecognised value behaves the
            same) returns ``[[acronym, definition], ...]``.
            ``'reduced1'`` returns ``[(acronym, count), ...]``.
            ``'reduced2'`` returns ``[('ACRONYM - definition', count), ...]``.
            ``'simple'`` ignores definitions and counts every run of two or
            more capitals (bare or dotted) anywhere in the text, returning
            ``[(acronym, count), ...]``.
        surrounding: ``'T'`` to append the list of up to 2000 words around
            each acronym as a third element of every ``'complex'`` entry,
            ``'F'`` to omit it. Forced to ``'F'`` for the reduced modes.
            Any other value yields no entries in the non-simple modes.

    Returns:
        A list shaped as described under *search*; empty when nothing is
        found.
    """
    if search == 'simple':
        found = _ACRONYM_FORM.findall(text)
        return _tally(match.replace('.', '') for match in found)

    if search in ('reduced1', 'reduced2'):
        surrounding = 'F'

    results = []
    # Both recognised forms need an opening parenthesis, so skip the scan
    # entirely when there is none.
    if '(' not in text:
        return results

    # split() collapses whitespace runs and never yields empty strings, so
    # every word below can be indexed safely.
    words = text.split()
    for i, token in enumerate(words):
        if token.startswith('(') and token.endswith(')'):
            acronym = _bare_acronym(token[1:-1])
            find_definition = _definition_before
        else:
            acronym = _bare_acronym(token)
            find_definition = _definition_after
        if acronym is None:
            continue

        definition = find_definition(words, i, acronym)
        if not definition:
            continue
        if acronym in definition:
            continue
        if not _initials_in_order(acronym, definition):
            continue

        if surrounding == 'T':
            context = words[max(0, i - _CONTEXT_RADIUS):i + _CONTEXT_RADIUS]
            results.append([acronym, definition, context])
        elif surrounding == 'F':
            if search == 'reduced2':
                results.append(acronym + ' - ' + definition)
            else:
                results.append([acronym, definition])

    if search == 'reduced1':
        return _tally(entry[0] for entry in results)
    if search == 'reduced2':
        return _tally(results)
    return results