# _country_entities.py
import nltk as _nltk


class _EntityBase:
    def __init__(self, load_lstm):
        import sys

        if load_lstm:
            sys.path.append('/home/rbshaffer/sequence_tagging')

            from model.ner_model import NERModel
            from model.config import Config

            config = Config()

            # build model
            self.model = NERModel(config)
            self.model.build()
            self.model.restore_session(config.dir_model)
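            # note: the hard-coded path above appears to point at a local clone of a
            # BiLSTM-CRF sequence tagger (the model.ner_model / model.config layout
            # matches guillaumegenthial's sequence_tagging repo); adjust the path and
            # checkpoint location for your own setup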

    def get_chunks(self, parsed):
        # overridden by the country-specific subclasses below
        return []

    def do_entity_extraction(self, parsed):
        """ Somewhat complex function to actually do the entity extraction. """
        import networkx as nx
        import textwrap
        from numpy import mean

        chunks = self.get_chunks(parsed)

        def total_edge_count(count_obj, total_counter=0):
            """ Sub-function to calculate the total number of edges in a container. """
            if count_obj:
                # pop an arbitrary entity and pair it with every remaining entity;
                # each pair contributes min(count_1, count_2) potential edges
                ceiling = count_obj.pop(next(iter(count_obj)))
                total_counter += sum(min(ceiling, count_obj[c]) for c in count_obj)
                total_counter = total_edge_count(count_obj, total_counter)

            return total_counter
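
        # e.g. total_edge_count({'a': 2, 'b': 1, 'c': 1}) == 3:
        # min(2, 1) + min(2, 1) for the pairs involving 'a', plus min(1, 1) for ('b', 'c')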

        def observed_edge_count(raw_obj):
            """ Sub-function to calculate the observed number of edges in a container. """
            observed_counter = 0
            for chunk_obj in raw_obj:
                chunk_entities = {e: chunk_obj.count(e) for e in set(chunk_obj)}
                observed_counter += total_edge_count(chunk_entities)

            return observed_counter
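
        # e.g. observed_edge_count([['a', 'a', 'b'], ['a', 'c']]) == 2, one edge per chunk
        # (this helper is defined but not referenced elsewhere in this module)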

        # container to store all entities extracted, for matching use in-string
        # maybe consider shifting this inside the loop to only match in-chunk?
        # though note that the output generator currently depends on this
        all_entities = []

        # output container
        out = []

        # iterate over units of analysis, as defined in country-specific functions
        for chunk in chunks:
            entity_strings = []
            sentences = self.process_doc(chunk)

            for sent in sentences:
                entities = []
                tags = self.model.predict(sent)

                # group tokens into entities following the BIO scheme:
                # 'B-MISC' starts a new entity, 'I-MISC' continues the current one
                for i, t in enumerate(tags):
                    if t == 'B-MISC':
                        entities.append([sent[i]])
                    elif t == 'I-MISC' and len(entities) > 0:
                        # this condition shouldn't be necessary - need to figure out why this is happening
                        entities[-1].append(sent[i])

                new_entities = [' '.join(e) for e in entities]
                new_entities = ['\n'.join(textwrap.wrap(e.strip(), 20)) for e in new_entities]

                entity_strings += new_entities
                all_entities += new_entities

            out.append(entity_strings)

        # get the actual output
        entities_count = {e: all_entities.count(e) for e in set(all_entities)}
        out = [[e for e in row if e in entities_count] for row in out]

        # build weighted co-occurrence edges: within each chunk, every pair of
        # distinct entities gets weight min(count_1, count_2), summed across chunks
        edges = {}
        for chunk in out:
            if len(set(chunk)) > 1:
                entities = list(set(chunk))
                for i in range(len(entities)):
                    for j in range(i + 1, len(entities)):
                        e1 = entities[i]
                        e2 = entities[j]
                        if (e1, e2) in edges:
                            edges[(e1, e2)] += min(chunk.count(e1), chunk.count(e2))
                        elif (e2, e1) in edges:
                            edges[(e2, e1)] += min(chunk.count(e1), chunk.count(e2))
                        else:
                            edges[(e1, e2)] = min(chunk.count(e1), chunk.count(e2))

        edges = [k + (w,) for k, w in edges.items()]

        if entities_count:
            graph = nx.Graph()
            for u, v, w in edges:
                graph.add_edge(u, v, weight=w)

            # weighted degree sequence (networkx 2.x returns an iterable of
            # (node, degree) pairs rather than a dict)
            degree = [d for _, d in graph.degree(weight='weight')]
            if degree:
                average_degree = mean(degree)
            else:
                average_degree = 0

            # count_zeroes?
            try:
                clustering_coeff = nx.average_clustering(graph, weight='weight', count_zeros=True)
            except ZeroDivisionError:
                clustering_coeff = 0
        else:
            graph = None
            clustering_coeff = None
            average_degree = None

        total_nodes = len(set(all_entities))
        total_edges = sum(e[2] for e in edges)

        return {'graph': graph, 'edges': edges, 'total_nodes': total_nodes, 'clustering': clustering_coeff,
                'total_edges': total_edges, 'average_degree': average_degree}

    @staticmethod
    def process_doc(document):
        sentences = _nltk.sent_tokenize(document)
        sentences = [_nltk.word_tokenize(sent) for sent in sentences]

        return sentences
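
    # example: _EntityBase.process_doc('The Act applies.')
    #   -> [['The', 'Act', 'applies', '.']]
    # (assumes nltk's 'punkt' tokenizer data has been downloaded)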


class UnitedStatesAnnual(_EntityBase):
    def __init__(self, load_lstm=True):
        _EntityBase.__init__(self, load_lstm)

    def get_chunks(self, parsed):
        # parsed is assumed to be an iterable of rows where row[2] holds the
        # header text, row[4] the tag type, and row[5] the body text
        i = 0
        chunks = ['']

        for row in parsed:
            # start a new chunk at each section header
            if 'SECTION' in row[2] or 'SEC' in row[2]:
                i += 1
                chunks.append('')

            if row[4] != 'title':
                chunks[i] += ' ' + row[5]

        chunks = [c.strip() for c in chunks]

        return chunks


class UnitedStatesConsolidated(_EntityBase):
    def __init__(self, load_lstm=True):
        _EntityBase.__init__(self, load_lstm)

    def get_chunks(self, parsed):
        # parsed is assumed to map section identifiers to lists of text strings
        chunks = ['']
        for section in parsed:
            chunk = ' '.join(parsed[section])
            chunks.append(chunk)

        return chunks
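

if __name__ == '__main__':
    # Minimal smoke-test sketch (an illustrative assumption, not part of the
    # original pipeline): stub out the LSTM tagger so do_entity_extraction can
    # run without the sequence_tagging checkpoint. _StubModel and the sample
    # `parsed` input below are hypothetical; real runs load NERModel instead.
    class _StubModel:
        @staticmethod
        def predict(sent):
            # pretend the first two tokens form one MISC entity, rest are outside
            return ['B-MISC', 'I-MISC'] + ['O'] * (len(sent) - 2)

    extractor = UnitedStatesConsolidated(load_lstm=False)
    extractor.model = _StubModel()

    parsed = {'sec_1': ['The Department of State shall report annually.'],
              'sec_2': ['The Department of Defense shall coordinate.']}
    results = extractor.do_entity_extraction(parsed)
    print(results['total_nodes'], results['total_edges'], results['clustering'])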