-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfind_rating_vocab.py
144 lines (113 loc) · 5.48 KB
/
find_rating_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import json
from os.path import join
import re
import argparse
from collections import Counter
import pickle as pkl
from tqdm import tqdm
from nltk.corpus import stopwords
from string import punctuation
def _count_data(path):
""" count number of data in the given path"""
matcher = re.compile(r'[0-9]+\.json')
match = lambda name: bool(matcher.match(name))
names = os.listdir(path)
n_data = len(list(filter(match, names)))
return n_data
def read_word_list_from_file(filename):
    """Read *filename* and return a list of its lines, whitespace-stripped."""
    with open(filename) as fp:
        stripped_lines = [line.strip() for line in fp]
    return stripped_lines
def main(data_dir, split):
    """Build per-rating vocabulary counters from review summaries.

    Reads ``<data_dir>/<split>/<i>.json`` for i in [0, n_data), tallies the
    whitespace-split summary tokens into one Counter per rating (1-5), both
    with and without English stop words / punctuation, and pickles the ten
    counters into *data_dir* under the same file names the original script
    produced (``rating_<r>_vocab_counter.pkl`` and
    ``rating_<r>_vocab_counter_no_stop_word_and_punc.pkl``).

    Args:
        data_dir: root data directory; also the output directory.
        split: subdirectory name, e.g. 'train', 'val' or 'test'.

    Raises:
        ValueError: if a review's 'overall' rating is not in 1..5.
    """
    split_dir = join(data_dir, split)
    n_data = _count_data(split_dir)
    stop_words = set(stopwords.words('english'))
    out_dir = data_dir

    # One Counter per rating; a float rating like 5.0 hashes equal to 5,
    # so the original `rating == 5` comparison semantics are preserved.
    vocab_counters = {r: Counter() for r in range(1, 6)}
    filtered_counters = {r: Counter() for r in range(1, 6)}

    for i in tqdm(range(n_data)):
        # Context manager closes each file promptly; the original
        # `json.load(open(...))` leaked one descriptor per review.
        with open(join(split_dir, '{}.json'.format(i))) as f:
            js = json.load(f)
        rating = js['overall']
        if rating not in vocab_counters:
            raise ValueError('unexpected rating: {!r}'.format(rating))
        # js['summary'] is a list of sentence strings — join then split on
        # single spaces, exactly as the original tokenization did.
        summary_word_list = ' '.join(js['summary']).split(' ')
        # `word not in punctuation` is a substring test on the punctuation
        # string, so only single-character tokens can be filtered out —
        # kept as-is to match the original behavior.
        filtered_words = [word for word in summary_word_list
                          if word not in stop_words and word not in punctuation]
        vocab_counters[rating].update(summary_word_list)
        filtered_counters[rating].update(filtered_words)

    for r in range(1, 6):
        with open(os.path.join(out_dir,
                               "rating_{}_vocab_counter.pkl".format(r)),
                  'wb') as vocab_file:
            pkl.dump(vocab_counters[r], vocab_file)
        with open(os.path.join(out_dir,
                               "rating_{}_vocab_counter_no_stop_word_and_punc.pkl".format(r)),
                  'wb') as vocab_file:
            pkl.dump(filtered_counters[r], vocab_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=('Preprocess review data')
)
parser.add_argument('-data_dir', type=str, action='store',
help='The directory of the data.')
parser.add_argument('-split', type=str, action='store',
help='train or val or test.')
args = parser.parse_args()
main(args.data_dir, args.split)