-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_selection.py
91 lines (71 loc) · 2.54 KB
/
data_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
# Ordered (source, replacement) pairs applied by repl_sgml_wih_utf8().
# NOTE(review): the entity spellings follow the ISO 8879 entity sets
# (ISOlat1/ISOlat2) -- confirm they match the encoding of the input data.
_SGML_TO_UTF8 = (
    # SGML character entities -> Romanian letters.
    ("&abreve;", "\u0103"),  # a with breve
    ("&acirc;", "\u00e2"),   # a with circumflex
    ("&icirc;", "\u00ee"),   # i with circumflex
    ("&scedil;", "\u0219"),  # s with comma below
    ("&tcedil;", "\u021b"),  # t with comma below
    ("&Abreve;", "\u0102"),
    ("&Acirc;", "\u00c2"),
    ("&Icirc;", "\u00ce"),
    ("&Scedil;", "\u0218"),
    ("&Tcedil;", "\u021a"),
    # Cedilla letters -> comma-below letters.
    ("\u015f", "\u0219"),    # s with cedilla -> s with comma below
    ("\u0163", "\u021b"),    # t with cedilla -> t with comma below
    ("\u015e", "\u0218"),    # S with cedilla -> S with comma below
    ("\u0162", "\u021a"),    # T with cedilla -> T with comma below
)


def repl_sgml_wih_utf8(word: str) -> str:
    """Return `word` with SGML entities and cedilla letters normalized.

    Two kinds of substitutions are made:

    * the SGML entities for the Romanian diacritics (``&abreve;``,
      ``&acirc;``, ``&icirc;``, ``&scedil;``, ``&tcedil;`` and their
      capitalized forms) become the corresponding UTF-8 letters;
    * s/t with cedilla (U+015F, U+0163, U+015E, U+0162) become
      s/t with comma below (U+0219, U+021B, U+0218, U+021A).

    Text that contains none of these sequences is returned unchanged.
    """

    for source, replacement in _SGML_TO_UTF8:
        word = word.replace(source, replacement)
    # end for

    return word
def pl_noun_selection(*, min_freq: int = 5000) -> list[tuple[str, int]]:
    """Select the frequent, plural-only nouns from 'tbl.wordform.ro'.

    Each non-comment line of 'tbl.wordform.ro' is split into three
    whitespace-separated fields: the word form, a field that is ignored
    here, and the tag (`msd`). Word forms are passed through
    repl_sgml_wih_utf8() before use. 'corola-vocabulary.txt' supplies
    one `word frequency` pair per line.

    A word is selected when all of the following hold:

    * every tag recorded for it starts with 'N' (it is only a noun);
    * every tag recorded for it has 'p' at index 3 (it is only plural);
    * it is listed in the CoRoLa vocabulary with a frequency of at
      least `min_freq`.

    Both files are opened relative to the current working directory.

    Args:
        min_freq: Minimum CoRoLa frequency a word needs to be kept.

    Returns:
        A list of `(word, frequency)` tuples, in the order the words
        first appear in 'tbl.wordform.ro'. The list is neither sorted
        nor truncated; ranking is left to the caller.

    Raises:
        OSError: If one of the two input files cannot be opened.
        ValueError: If a non-blank line does not have the expected
            number of fields, or a frequency is not an integer.
    """

    tbl_file = os.path.join('..', 'Rodna', 'data',
                            'resources', 'tbl.wordform.ro')
    # word -> set of all tags seen for it
    tbl = {}
    # word -> set of the first characters of those tags
    tbl_pos = {}

    with open(tbl_file, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Blank lines cannot be unpacked into three fields,
            # so they are skipped together with '#' comments.
            if not line or line.startswith('#'):
                continue
            # end if

            word, _, msd = line.split()
            word = repl_sgml_wih_utf8(word)
            tbl.setdefault(word, set()).add(msd)
            tbl_pos.setdefault(word, set()).add(msd[0])
        # end for
    # end with

    corola_freq_file = os.path.join(
        '..', 'ro-wordpiece-tokenizer', 'corola-vocabulary.txt')
    corola_freq = {}

    with open(corola_freq_file, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            if not line:
                continue
            # end if

            word, freq = line.split()
            corola_freq[word] = int(freq)
        # end for
    # end with

    corola_pl_nouns = []

    for word, pos_tags in tbl_pos.items():
        # Keep words that are nouns, only, and that appear in CoRoLa
        if pos_tags != {'N'} or word not in corola_freq:
            continue
        # end if

        # The word qualifies only if none of its tags is non-plural
        only_plural = all(len(m) >= 4 and m[3] == 'p' for m in tbl[word])
        freq = corola_freq[word]

        if only_plural and freq >= min_freq:
            corola_pl_nouns.append((word, freq))
        # end if
    # end for

    return corola_pl_nouns
if __name__ == '__main__':
    # Print the selected nouns, most frequent first,
    # one tab-separated `word frequency` pair per line.
    ranked = pl_noun_selection()
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for noun, count in ranked:
        print(f'{noun}\t{count}')
    # end for