-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathadd_ne_tag.py
74 lines (63 loc) · 2.55 KB
/
add_ne_tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Short module to run if you did not run disambiguate.py with NER enabled but
still want to use the NER tags. It takes every entry whose first word is
"he", "she", "it" or "they", replaces that pronoun with <NE> and merges the
resulting entries into the dictionary.
Note: The result is written back to disambiguations.yaml, so the original
contents of that file are overwritten.
"""
import yaml
# Read the disambiguation dictionary that the rest of this script rewrites.
#
# An explicit Loader is required: PyYAML >= 6 raises TypeError when
# yaml.load() is called without one (5.x only warned). yaml.Loader is the
# loader that older PyYAML versions used by default, so behavior is unchanged.
# The keys are indexed below as nested tuples, which presumably means the file
# carries python/tuple tags that yaml.safe_load would reject -- verify against
# the file that disambiguate.py writes.
#
# NOTE(security): yaml.Loader can construct arbitrary Python objects. Only run
# this script on a disambiguations.yaml that you generated yourself.
with open("disambiguations.yaml", "r") as stream:
    INP_DICT = yaml.load(stream, Loader=yaml.Loader)
def _convert_to_normalized(dictionary):
"""
Args:
- dictionary is a python dict whose values are (integer) numbers.
Returns:
- a python dict with the same keys but values which are normalized.
"""
sum_value = sum(dictionary.values())
for _key, _value in dictionary.items():
dictionary[_key] = float("{0:.3f}".format(_value/sum_value))
return dictionary
# Build the output dictionary: every original entry is kept, and for each
# entry whose first word is a personal pronoun an additional "<NE>" entry is
# created (or extended) with the pronoun replaced by the named-entity tag.
#
# The copy is shallow: the inner dicts are shared with INP_DICT.
OUT_DICT = INP_DICT.copy()
PRONOUNS = ('he', 'she', 'it', 'they')
for key, value in INP_DICT.items():
    # key is indexed as a tuple of (word, tag) pairs; key[0][0] is the first
    # word of the entry.
    pronoun = key[0][0]
    if pronoun not in PRONOUNS:
        continue

    # If the pronoun is 'it', let its values only count half, since most
    # occurrences will be names. The value is arbitrary.
    bias = 0.5 if pronoun == 'it' else 1.0

    # Same key with the first word swapped for the tag; the trailing comma
    # keeps the first (word, tag) pair as a single tuple element.
    new_key = (("<NE>", key[0][1]),) + tuple(key[1:])

    # Normalizes this entry's values in place (and returns the same dict).
    value = _convert_to_normalized(value)

    # Create the "<NE>" entry on first use, otherwise add to the existing
    # occurrence values.
    merged = OUT_DICT.setdefault(new_key, {})
    for subkey, subvalue in value.items():
        # Replace only the first occurrence: an unbounded replace would also
        # rewrite the pronoun's letters inside later words (e.g. 'he' in
        # 'the'). Presumably the sub-keys start with the pronoun, like the
        # key itself -- verify against the disambiguate.py output.
        ne_subkey = subkey.replace(pronoun, '<NE>', 1)
        merged[ne_subkey] = merged.get(ne_subkey, 0) + bias * subvalue
# Final pass, purely for form: rescale the values of every entry by their
# sum so merged entries are on the same scale as the untouched ones.
for key in OUT_DICT:
    OUT_DICT[key] = _convert_to_normalized(OUT_DICT[key])
# Write the merged dictionary back over the input file, with an explicit
# document start marker and a preferred line width of 79 columns.
with open("disambiguations.yaml", "w") as stream:
    yaml.dump(OUT_DICT, stream=stream, width=79, explicit_start=True)