# lower_case_script.py
# Lower-cases the 'article' and 'abstract' sentences of every JSON example
# in a data split, rewriting each file in place.
import os
from os.path import exists, join
import json
from time import time
from datetime import timedelta
import multiprocessing as mp
from cytoolz import concat, curry, compose
from utils.io import count_data
import argparse
@curry
def process(data_dir, i):
    """Lower-case one example file in place.

    Reads ``<data_dir>/<i>.json``, lower-cases every string in its
    ``'article'`` and ``'abstract'`` lists, and overwrites the same file with
    the updated object (dumped with a 4-space indent). Any other keys in the
    object are written back unchanged.

    The function is curried, so ``process(data_dir)`` yields a one-argument
    callable that can be mapped over example indices.

    Args:
        data_dir: directory containing the ``<i>.json`` example files.
        i: index of the example; selects the file ``<i>.json``.

    Raises:
        KeyError: if the JSON object lacks ``'article'`` or ``'abstract'``.
    """
    path = join(data_dir, '{}.json'.format(i))
    # Pin the encoding: the platform default is locale-dependent, so raw
    # non-ASCII text in the input could fail to decode (or decode wrongly)
    # on machines whose locale is not UTF-8.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    # Re-assigning existing keys keeps their position in the dumped object.
    for key in ('article', 'abstract'):
        data[key] = [sent.lower() for sent in data[key]]
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
def label_mp(data, split):
    """Lower-case every example of one data split using a process pool.

    Runs ``process`` on the indices ``0 .. n_data - 1`` of the directory
    ``<data>/<split>`` and prints the elapsed wall-clock time.

    Args:
        data: root directory of the dataset.
        split: name of the sub-directory (split) to process.
    """
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(data, split)
    # NOTE(review): count_data is assumed to return the number of
    # consecutively numbered `<i>.json` files in data_dir -- confirm in
    # utils.io.
    n_data = count_data(data_dir)
    # A fixed chunk of 1024 packs a small split into one or two chunks and
    # leaves most workers idle. Aim for a few chunks per worker instead,
    # never exceeding the original 1024 and never dropping below 1.
    n_workers = os.cpu_count() or 1
    chunksize = max(1, min(1024, n_data // (4 * n_workers)))
    with mp.Pool() as pool:
        # The workers return nothing useful; just drain the iterator so that
        # all tasks finish (and any worker exception is re-raised here).
        for _ in pool.imap_unordered(process(data_dir), range(n_data),
                                     chunksize=chunksize):
            pass
    print('finished in {}'.format(timedelta(seconds=time()-start)))
def main(data, split):
    """Process the requested split, or val, train and test when it is 'all'.

    Args:
        data: root directory of the dataset.
        split: a split folder name, or ``'all'`` to process ``val``,
            ``train`` and ``test`` in that order.
    """
    targets = ['val', 'train', 'test'] if split == 'all' else [split]
    for target in targets:
        label_mp(data, target)
if __name__ == '__main__':
    # Command-line entry point: parse the dataset location and the split to
    # process, then lower-case the selected files in place.
    parser = argparse.ArgumentParser(
        description=('Lower-case the article and abstract sentences '
                     'of a dataset in place')
    )
    # Without -data the path join further down would fail with an opaque
    # TypeError, so let argparse reject the call with a clear message.
    parser.add_argument('-data', type=str, action='store', required=True,
                        help='The directory of the data.')
    parser.add_argument('-split', type=str, action='store', default='all',
                        help='The folder name of the split to process. '
                             'all means process val, train and test.')
    args = parser.parse_args()
    main(args.data, args.split)