#!/usr/bin/env python3
# cLL-ML.py
import argparse
import os
import pickle
import random
from datetime import datetime

import numpy as np
from sklearn.linear_model import LogisticRegression

from dataset import DataSet

# Print arrays in full (no truncation, no scientific notation) so that logged
# probability vectors are written out completely.
np.set_printoptions(threshold=np.inf, suppress=True, precision=3)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_directory', default='results',
                        help='Directory to write output to')
    parser.add_argument('--feature_types', nargs='+', choices=['rgb', 'shape', 'object'],
                        default=['rgb', 'shape', 'object'],
                        help='Feature types to learn')
    parser.add_argument('--annotation_file',
                        default='conf_files/UW_english/UW_AMT_description_documents_per_image_nopreproc_stop_raw.conf',
                        help='Amazon Mechanical Turk annotation file')
    parser.add_argument('--cutoff', choices=[0.25, 0.5, 0.75], default=0.25, type=float,
                        help='The cosine similarity cutoff for negative example generation')
    parser.add_argument('--feature_directory', default='../ImgDz',
                        help='Directory containing the feature files')
    parser.add_argument('--dataset_file', default='dataset.pkl',
                        help='Filename of a pickled DataSet object, for convenience')
    parser.add_argument('--test_percent', type=float, default=0.2,
                        help='Fraction of instances to set aside for testing')
    return parser.parse_known_args()
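
# Example invocation (paths are illustrative; adjust them to your local layout):
#   python cLL-ML.py --feature_types rgb shape object \
#       --feature_directory ../ImgDz --output_directory results --test_percent 0.2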


def get_features_from_file(instance, feature_type, feature_directory):
    """
    Read feature data from a file and return it as a flat list.

    :param string instance: e.g. "water_bottle/water_bottle_1_4_55"
    :param string feature_type: type of feature ('rgb', 'shape', or 'object')
    :param string feature_directory: path where the features are stored
    :return: features read from the file, as one flat list of floats
    """
    category, instance = instance.split("/")
    # Build the path with os.path.join instead of manual string concatenation
    path = os.path.join(feature_directory, category, instance,
                        f'{instance}_{feature_type}.log')
    features = []
    with open(path, 'r') as fin:
        for line in fin:
            features += [float(datum) for datum in line.strip().split(',')]
    return features
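
# For example, assuming the default on-disk layout,
#   get_features_from_file('water_bottle/water_bottle_1_4_55', 'rgb', '../ImgDz')
# reads ../ImgDz/water_bottle/water_bottle_1_4_55/water_bottle_1_4_55_rgb.log and
# returns its comma-separated values as a single flat list of floats.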


def get_feature_data(dataset, test_instances, feature_types, feature_directory):
    """
    Yield one (X, y, feature_type, token) training set per (token, feature_type)
    pair, skipping tokens that have too few examples.
    """
    # Minimum number of positive/negative instances a token must be associated
    # with before it is deemed useful.
    # NOTE: this is different from how many times the token appeared for a
    # particular instance. For the training that appears to be 1.
    MIN_POSITIVE_INSTANCES = 3
    MIN_NEGATIVE_INSTANCES = 2
    for token in dataset.tokens:
        # Avoid key errors and tokens with no positive or negative instances
        if (token not in dataset.pos_tokens_to_instances
                or token not in dataset.neg_tokens_to_instances):
            continue
        # Skip tokens that have too few positive or negative instances
        if (len(dataset.pos_tokens_to_instances[token]) < MIN_POSITIVE_INSTANCES
                or len(dataset.neg_tokens_to_instances[token]) < MIN_NEGATIVE_INSTANCES):
            continue
        for feature_type in feature_types:
            pos_features = []
            for instance in dataset.pos_tokens_to_instances[token]:
                if instance in test_instances:
                    continue
                pos_features.append(get_features_from_file(instance, feature_type, feature_directory))
            neg_features = []
            for instance in dataset.neg_tokens_to_instances[token]:
                if instance in test_instances:
                    continue
                neg_features.append(get_features_from_file(instance, feature_type, feature_directory))
            # Balance the class sizes by repeating the smaller list. The MIN_*
            # checks above counted instances before the test split, so either
            # list may still be empty here; skip the token in that case to
            # avoid dividing by zero.
            if not pos_features or not neg_features:
                continue
            if len(pos_features) > len(neg_features):
                neg_features *= len(pos_features) // len(neg_features)
            if len(neg_features) > len(pos_features):
                pos_features *= len(neg_features) // len(pos_features)
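            # For example, 7 positives and 2 negatives repeat the negatives
            # 7 // 2 = 3 times (6 in total), so the classes end up roughly,
            # not exactly, balanced.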
            X = pos_features + neg_features
            y = ([1] * len(pos_features)) + ([0] * len(neg_features))
            yield (X, y, feature_type, token)


def main():
    ARGS, unused = parse_args()
    if not os.path.exists(ARGS.feature_directory):
        print("Feature directory does not exist")
        return
    if not os.path.exists(ARGS.output_directory):
        os.mkdir(ARGS.output_directory)
    for feature_type in ARGS.feature_types:
        os.makedirs(os.path.join(ARGS.output_directory, feature_type), exist_ok=True)
print("START :: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
if os.path.exists(ARGS.dataset_file):
print('Found dataset pickle file, loading saved dataset...')
with open(ARGS.dataset_file, 'rb') as fin:
dataset = pickle.load(fin)
else:
print(f'Couldn\'t find {ARGS.dataset_file}, getting new dataset...')
dataset = DataSet(ARGS.annotation_file)
print('Writing DataSet to pickle file')
with open(ARGS.dataset_file, 'wb') as fout:
pickle.dump(dataset, fout)

    # These instances are held out and must not be used during training.
    # random.sample needs a sequence, so materialize the dict keys as a list.
    test_instances = random.sample(list(dataset.data.keys()),
                                   int(len(dataset.data) * ARGS.test_percent))
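    # NOTE: no random seed is set, so the split differs from run to run;
    # calling random.seed(...) first would make it reproducible.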
    # Keep the test features in memory so testing doesn't have to re-read the
    # feature files for every token.
    print('Preloading test data...')
    test_data = {feature_type: {} for feature_type in ARGS.feature_types}
    for feature_type in ARGS.feature_types:
        for instance in test_instances:
            test_data[feature_type][instance] = get_features_from_file(
                instance, feature_type, ARGS.feature_directory)
print("ML START :: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# Train and run binary classifiers for all tokens, find the probabilities
# for the associations between all tokens and test instances,
# and log the probabilities
for X, y, feature_type, token in get_feature_data(dataset, test_instances, ARGS.feature_types, ARGS.feature_directory):
print(f'Training model on token \"{token}\"...')
model = LogisticRegression(solver='lbfgs', max_iter=500)
model.fit(X, y)
print(f'Testing model and writing output...')
output_file = ARGS.output_directory + f'/{feature_type}/{token}.txt'
with open(output_file, 'w') as fout:
fout.write('test_instance,y,predicted\n')
for instance, X_test in test_data[feature_type].items():
y_test = 1 if instance in dataset.pos_tokens_to_instances[token] else 0
probabilities = model.predict_proba([X_test])
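                # predict_proba returns one row of class probabilities per
                # sample, with columns ordered by model.classes_ (here
                # [P(y=0), P(y=1)]); the whole row is logged.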
                fout.write(f'{instance},{y_test},{probabilities[0]}\n')

    print("ML END :: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))


if __name__ == "__main__":
    main()