This repository was archived by the owner on May 28, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_iob1_to_iob2.py
60 lines (52 loc) · 2.15 KB
/
convert_iob1_to_iob2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import sys
import argparse
from utils_ner import read_from_path, \
iob2, \
iob_iobes, \
iobes_iob, \
update_tag_scheme
# Command-line interface: which files to convert, how to decode them, and
# whether language short forms in the output file names should be rewritten.
parser = argparse.ArgumentParser(description='Convert conll iob1 dataset to iob2 dataset.')
parser.add_argument("--files",
                    nargs='*',
                    default=["./ner_data/en/eng.train", "./ner_data/en/eng.testa", "./ner_data/en/eng.testb"],
                    help="Address of the files. Value-type: list(str)")
parser.add_argument("--encoding",
                    default="utf-8",
                    type=str,
                    help="The encoding method that will be used to read the texts. Value-type: (str)")
# The previous help text described an unrelated "seed" option; this file is
# the two-column table consumed by get_lang_dict() and used by --rename.
parser.add_argument("--lang_dict_address",
                    default="./lang_dict.txt",
                    type=str,
                    help="Address of the language dictionary file; each line holds a language short form "
                         "and its replacement, separated by whitespace. Only used with --rename. "
                         "Value-type: (str)")
parser.add_argument("--rename",
                    action='store_true',
                    help="Rename the language two char short form with standard two char short form. Value-type: (bool)")
# Parsed once at import time; the rest of the script reads its settings from `params`.
params = parser.parse_args()
def get_lang_dict(lang_dict_address, encoding="utf-8"):
    """Load a two-column mapping file into a dictionary.

    Every non-blank line must hold exactly two whitespace-separated tokens:
    the first becomes a key and the second its value. When a key appears
    more than once, the last occurrence wins.

    Args:
        lang_dict_address: Path of the mapping file.
        encoding: Text encoding used to read the file.

    Returns:
        A dict mapping the first token of each line to the second token.

    Raises:
        ValueError: If a non-blank line does not contain exactly two tokens.
    """
    mapping = {}
    with open(lang_dict_address, "r", encoding=encoding) as file_ptr:
        for line_no, line in enumerate(file_ptr, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no mapping.
                continue
            if len(fields) != 2:
                # An explicit exception, unlike `assert`, is not stripped
                # when Python runs with -O.
                raise ValueError(
                    f"{lang_dict_address}:{line_no}: expected 2 "
                    f"whitespace-separated fields, got {len(fields)}"
                )
            source, target = fields
            mapping[source] = target
    return mapping
# Load the rename table only when --rename is requested, so the dictionary
# file does not have to exist for a plain conversion. (The assignment used to
# be commented out, which made --rename fail with a NameError.)
lang_dict = get_lang_dict(params.lang_dict_address) if params.rename else {}
datasets = params.files
for _file in datasets:
    sentences = read_from_path(_file, params.encoding)
    # NOTE(review): presumably rewrites the tags of `sentences` in place as
    # IOB2 -- confirm against utils_ner.update_tag_scheme.
    update_tag_scheme(sentences, 'iob')
    # Prepare the new file name.
    new_file = _file + ".iob2"
    if params.rename:
        for k, v in lang_dict.items():
            # NOTE(review): this substitutes every occurrence of `k` anywhere
            # in the path (directory names included), not only a language
            # component -- confirm this is the intended behaviour.
            if k in new_file:
                new_file = new_file.replace(k, v)
    with open(new_file, "w", encoding=params.encoding) as filePtr:
        for index, words in enumerate(sentences):
            # A blank line separates consecutive sentences; none is written
            # before the first sentence or after the last one.
            if index:
                filePtr.write("\n")
            for word in words:
                # One token per line: first column and last column (the tag).
                filePtr.write(word[0] + " " + word[-1] + "\n")