-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathexperiments.py
215 lines (194 loc) · 11.5 KB
/
experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import os
import itertools
import utils
import zlib
import numpy as np
import pandas as pd
import tensorflow as tf
from data import load_data, label_and_filter_data, model_inputs, FEATURE_GROUPS, SCALAR_FEATS, LFC_COLS
from models import build_model, train_model, test_model, explain_model
from normalization import get_normalization_object
# command-line arguments: shared options plus experiment-specific flags
parser = utils.common_parser_arguments()
parser.add_argument('--experiment', type=str, default=None, help='which experiment to run')
parser.add_argument('--replace', action='store_true', default=False, help='forces rerun even if predictions exist')
args = utils.parse_common_arguments(parser)

# enable GPU determinism; fall back to a fixed seed when none was provided
if args.seed is None:
    args.seed = 12345
tf.config.experimental.enable_op_determinism()
# output directory for this experiment/dataset/holdout configuration
data_sub_dir = utils.data_directory(args.pm_only, args.indels, args.seq_only)
experiment_path = os.path.join('experiments', args.dataset, args.experiment, data_sub_dir, args.holdout)
os.makedirs(experiment_path, exist_ok=True)

# create or load data set with fold assignments; SHAP requires scaled non-sequence features
scale_non_seq_feats = args.experiment == 'SHAP'
data, data_nt = load_data(args.dataset, args.pm_only, args.indels, args.holdout, scale_non_seq_feats)

# scalar features present in this data set (none at all in sequence-only mode)
available_features = set() if args.seq_only else set(SCALAR_FEATS) & set(data.columns)
# define experimental values to test: one entry per configuration the loop below will train and evaluate
if 'label-and-filter' in args.experiment:
    # sweep labeling/filtering hyper-parameters: (filter method, non-targeting quantile, min active ratio)
    # only supported for the unnormalized junction data set
    assert args.dataset == 'junction'
    assert args.normalization == 'No'
    experimental_values = list(itertools.product(['MinActiveRatio'], [0.01, 0.05], np.arange(0.0, 0.35, 0.05)))
elif args.experiment == 'model':
    # each architecture with and without the scalar features
    experimental_values = list(itertools.product(['Tiger1D', 'Tiger2D'], [set(), available_features]))
elif args.experiment == 'normalization':
    # (normalization method, its keyword arguments) pairs to compare
    experimental_values = [
        ('No', dict()),
        ('FrequentistQuantile', dict(q_loc=50, q_neg=10, q_pos=90)),
        ('UnitInterval', dict(q_neg=0, q_pos=100, squash=False)),
        ('UnitInterval', dict(q_neg=1, q_pos=99, squash=False)),
        ('UnitInterval', dict(q_neg=5, q_pos=95, squash=False)),
        ('UnitInterval', dict(q_neg=10, q_pos=90, squash=False)),
        ('UnitInterval', dict(q_neg=1, q_pos=99, squash=True)),
        ('UnitInterval', dict(q_neg=5, q_pos=95, squash=True)),
        ('UnitInterval', dict(q_neg=10, q_pos=90, squash=True)),
        ('UnitVariance', dict()),
        ('UnitMeanOfSquares', dict()),
        ('ZeroMeanUnitVariance', dict()),
        ('DepletionRatio', dict()),
    ]
elif args.experiment == 'context':
    # (5' context, 3' context) length pairs: one-sided in steps of 5 up to 25, symmetric in steps of 5,
    # then one-sided and symmetric lengths 1-4 for fine resolution near zero
    experimental_values = [(-n, 0) if n < 0 else (0, n) for n in range(-25, 30, 5)]
    experimental_values += [(n, n) for n in range(5, 30, 5)]
    experimental_values += [(-n, 0) if n < 0 else (0, n) for n in [-4, -3, -2, -1, 1, 2, 3, 4]]
    experimental_values += [(n, n) for n in range(1, 5)]
    # drop pairs that exceed the context actually available in the data
    max_5p = max(data['5p_context'].apply(len))
    max_3p = max(data['3p_context'].apply(len))
    experimental_values = [(c5p, c3p) for c5p, c3p in experimental_values if c5p <= max_5p and c3p <= max_3p]
    # cross every context pair with no-features and all-features
    experimental_values = list(itertools.product(experimental_values, [set(), available_features]))
elif args.experiment == 'feature-groups-individual':
    # one configuration per non-empty feature group, plus a no-features baseline
    features = [tuple([feat for feat in available_features if feat in feats]) for feats in FEATURE_GROUPS.values()]
    experimental_values = [(g, f) for (g, f) in zip(FEATURE_GROUPS.keys(), features) if len(f) > 0]
    experimental_values = [('none', [])] + experimental_values
elif args.experiment == 'feature-groups-cumulative':
    # add groups cumulatively in descending order of their individual Pearson performance,
    # which requires the feature-groups-individual experiment to have been run already
    individual_performance = os.path.join(experiment_path.replace('cumulative', 'individual'), 'performance.pkl')
    assert os.path.exists(individual_performance), 'run feature-groups-individual experiment first!'
    individual_performance = pd.read_pickle(individual_performance)
    groups = individual_performance['Pearson'].sort_values(ascending=False).index.get_level_values('feature group')
    groups = list(groups.drop('none'))
    experimental_values = [('none', [])]
    for group in groups:
        group_feats = [feat for feat in available_features if feat in FEATURE_GROUPS[group]]
        # each entry extends the previous entry's feature list with the next group
        experimental_values += [(group, experimental_values[-1][1] + group_feats)]
elif args.experiment == 'learning-curve':
    # fraction of the training data to use (4% steps; float arange, so the endpoint near 1.0 is approximate)
    experimental_values = np.arange(0.04, 1.04, 0.04)
elif args.experiment == 'SHAP':
    # single run with default settings; Shapley values are computed inside the training loop
    experimental_values = [None]
else:
    raise NotImplementedError
# resume from previously saved predictions when they exist, otherwise start fresh
predictions_file = os.path.join(experiment_path, 'predictions.pkl')
if os.path.exists(predictions_file):
    df_predictions = pd.read_pickle(predictions_file)
else:
    df_predictions = pd.DataFrame()

# accumulators filled while looping over experimental values
df_performance = pd.DataFrame()
df_shap = pd.DataFrame()
# loop over experimental values: train/evaluate one configuration per entry
for experimental_value in experimental_values:
    print('**********', args.experiment, experimental_value, '**********')
    # resolve this configuration: the active experiment overrides the corresponding
    # command-line defaults with values unpacked from experimental_value
    filter_method = experimental_value[0] if 'label-and-filter' in args.experiment else args.filter_method
    nt_quantile = experimental_value[1] if 'label-and-filter' in args.experiment else args.nt_quantile
    min_active_ratio = experimental_value[2] if 'label-and-filter' in args.experiment else args.min_active_ratio
    model_name = experimental_value[0] if args.experiment == 'model' else args.model
    normalization = experimental_value[0] if args.experiment == 'normalization' else args.normalization
    normalization_kwargs = experimental_value[1] if args.experiment == 'normalization' else args.normalization_kwargs
    context = experimental_value[0] if args.experiment == 'context' else args.context
    if args.experiment in {'context', 'model', 'feature-groups-individual', 'feature-groups-cumulative'}:
        # these experiments carry their own feature subset in experimental_value[1]
        features = [feature for feature in available_features if feature in experimental_value[1]]
    else:
        features = available_features
    training_utilization = experimental_value if args.experiment == 'learning-curve' else 1.0
    # set the configuration index: a single MultiIndex row identifying this configuration
    config_dict = {
        'context': context,
        'features': tuple(np.sort(tuple(features))) if len(features) > 0 else 'None',
        'filter-method': filter_method,
        'loss': args.loss,
        'min active ratio': min_active_ratio,
        'model': model_name,
        'normalization': normalization,
        'normalization kwargs': str(normalization_kwargs),
        'non-targeting quantile': nt_quantile,
        'training utilization': training_utilization,
    }
    if args.experiment in {'feature-groups-individual', 'feature-groups-cumulative'}:
        config_dict.update({'feature group': experimental_value[0]})
    index = pd.MultiIndex.from_tuples([tuple(config_dict.values())], names=list(config_dict.keys()))
    # filter and normalize data
    if args.dataset == 'junction-splice-sites':
        # splice-site targets: average the LFC replicate columns, restrict to genes that
        # survive junction-level filtering, and fit the normalizer on the junction data
        data['observed_lfc'] = data[LFC_COLS].mean(axis=1)
        junction_data = label_and_filter_data(*load_data('junction'), nt_quantile, filter_method, min_active_ratio)
        filtered_data = data.loc[data['gene'].isin(junction_data['gene'].unique())].copy()
        normalizer = get_normalization_object(normalization)(junction_data, **normalization_kwargs)
        # keep unnormalized LFCs around so predictions can be mapped back later
        normalizer.original_lfc = data[['gene', 'guide_seq', 'observed_lfc']].copy().set_index(['gene', 'guide_seq'])
    else:
        filtered_data = label_and_filter_data(data, data_nt, nt_quantile, filter_method, min_active_ratio)
        normalizer = get_normalization_object(normalization)(filtered_data, **normalization_kwargs)
    normalized_data = normalizer.normalize_targets(filtered_data)
    # drop rows whose target became NaN during normalization
    normalized_data = normalized_data.loc[~normalized_data.observed_lfc.isna()]
    # add technical holdout: train on everything, test on unseen off-target guides
    if args.experiment == 'label-and-filter (off-target)':
        normalized_data['fold'] = 'training'
        test_data = label_and_filter_data(*load_data('off-target', pm_only=True), nt_quantile=0.01, method='NoFilter')
        # exclude guides already present in the training data
        test_data = test_data.loc[~test_data['guide_seq'].isin(normalized_data['guide_seq'])]
        test_data['fold'] = 'test'
        normalized_data = pd.concat([normalized_data, test_data])
    # do we need results for this configuration? (--replace forces a rerun)
    if args.replace or not index.isin(df_predictions.index.unique())[0]:
        # drop any existing results for this configuration before appending fresh ones
        length_prior = len(df_predictions)
        df_predictions = df_predictions.loc[df_predictions.index.values != index.values]
        # either nothing was dropped or exactly one full set of predictions was
        assert {length_prior - len(df_predictions)}.issubset({0, len(normalized_data)}), 'welp!'
        # loop over folds (every fold except the always-train 'training' fold serves as validation once)
        df_tap = pd.DataFrame()
        for k, fold in enumerate(set(normalized_data['fold'].unique()) - {'training'}):
            # a deterministic but seemingly random transformation of the experiment seed into a fold seed
            fold_seed = int(zlib.crc32(str(k * args.seed).encode())) % (2 ** 32 - 1)
            # prepare training and validation data (subsample training set for learning-curve runs)
            tf.keras.utils.set_random_seed(fold_seed)
            train_data = normalized_data[normalized_data.fold != fold].sample(frac=training_utilization)
            train_data = model_inputs(train_data, context, scalar_feats=features)
            valid_data = model_inputs(normalized_data[normalized_data.fold == fold], context, scalar_feats=features)
            # train model (re-seed so model init is independent of the sampling draws above)
            tf.keras.utils.set_random_seed(fold_seed)
            model = build_model(name=model_name,
                                target_len=train_data['target_tokens'].shape[1],
                                context_5p=train_data['5p_tokens'].shape[1],
                                context_3p=train_data['3p_tokens'].shape[1],
                                use_guide_seq=args.use_guide_seq,
                                loss_fn=args.loss,
                                debug=args.debug,
                                output_fn=normalizer.output_fn)
            model = train_model(model, train_data, valid_data, args.batch_size)
            # accumulate targets and predictions on held-out fold
            df_tap = pd.concat([df_tap, test_model(model, valid_data)])
            # compute Shapley values if needed
            if args.experiment == 'SHAP':
                tf.keras.utils.set_random_seed(fold_seed)
                df_shap = pd.concat([df_shap, explain_model(model, train_data, valid_data)])
        # concatenate and save predictions: tag every row with this configuration's index
        df_tap.index = index.repeat(len(df_tap))
        if args.experiment != 'normalization':
            # report in the original LFC scale (the normalization experiment compares scales explicitly below)
            df_tap = normalizer.denormalize_targets_and_predictions(df_tap)
        df_predictions = pd.concat([df_predictions, df_tap])
        # save after every configuration so partial results survive interruption
        df_predictions.to_pickle(predictions_file)
    # save Shapley values if needed
    if args.experiment == 'SHAP':
        df_shap.to_pickle(os.path.join(experiment_path, 'shap.pkl'))
    # concatenate and save performance for this configuration (uses this iteration's normalizer)
    if args.experiment == 'normalization':
        # score all four combinations of normalized/raw observations and predictions
        for normalized_observations, normalized_predictions in itertools.product([False, True], [False, True]):
            df = df_predictions.loc[index].copy()
            if not normalized_observations:
                df = normalizer.denormalize_observations(df)
            if not normalized_predictions:
                df = normalizer.denormalize_predictions(df)
            for active_only in [False, True]:
                # optionally restrict scoring to guides labeled active
                perf = utils.measure_performance(df.loc[df.observed_label == 1] if active_only else df, index)
                perf['Normalized Observations'] = normalized_observations
                perf['Normalized Predictions'] = normalized_predictions
                perf['Active Only'] = active_only
                df_performance = pd.concat([df_performance, perf])
    else:
        df_performance = pd.concat([df_performance, utils.measure_performance(df_predictions.loc[index], index)])
    # overwrite the performance file each iteration so partial results survive interruption
    df_performance.to_pickle(os.path.join(experiment_path, 'performance.pkl'))