Skip to content

Commit

Permalink
load test datasets and model weights
Browse files Browse the repository at this point in the history
  • Loading branch information
PeterDeWeirdt committed Jan 27, 2020
1 parent 4af72a5 commit 23703c4
Show file tree
Hide file tree
Showing 23 changed files with 236 additions and 18,114 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,6 @@ fabric.properties
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
.Rproj.user

data/datasets/*.csv
data/features/*.csv
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os

if __name__ == '__main__':
models = [sg.Model_Doench2016(), sg.Model_Kim2018()]
models = [sg.SKLearn_sgrna_Model(), sg.Keras_sgrna_Model()]
train_datum = [da.load_doench_2016_train(), da.load_meyers_2017_train(), da.load_kim_2019_train()]
test_datum = [da.load_doench_2016_test(), da.load_meyers_2017_test(), da.load_kim_2019_test()]

Expand Down
25 changes: 0 additions & 25 deletions sgrna_modeler/architectures.py

This file was deleted.

Binary file added sgrna_modeler/data/datasets/Doench_2016.csv.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sgrna_modeler/data/datasets/Kim_2019_Test.csv.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sgrna_modeler/data/features/physiochem.csv.zip
Binary file not shown.
Binary file not shown.
Binary file added sgrna_modeler/data/saved_models/enPAM_GB.joblib
Binary file not shown.
62 changes: 34 additions & 28 deletions sgrna_modeler/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@ def curr_path():
return os.path.dirname(__file__)

class Activity_Data(object):
def __init__(self, data, enzyme, kmer_column, activity_column, name, type,
def __init__(self, data, enzyme, kmer_column, activity_column, name,
group_column = ''):
self.data = data
self.enzyme = enzyme
self.kmer_column = kmer_column
self.activity_column = activity_column
self.type = type
self.name = 'D_' + name
self.group_column = group_column
def get_xy(self):
Expand All @@ -24,58 +23,65 @@ def get_groups(self):
assert ValueError('No Group Column Supplied')
return self.data[self.group_column]

# Sp Datasets
def load_doench_2016_train():
data = pd.read_csv(os.path.join(curr_path(), 'data/Doench_2016_Train.csv'))
data_class = Activity_Data(data = data, enzyme = en.cas9, kmer_column='30mer',
activity_column='score_drug_gene_rank',
name = 'Doench_2016_Train',
type = 'train',
group_column='Target gene')
return data_class
# SpCas9 Datasets

def load_doench_2016_test():
data = pd.read_csv(os.path.join(curr_path(), 'data/Doench_2016_Test.csv'))
def load_doench_2016():
data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Doench_2016.csv.zip'))
data_class = Activity_Data(data = data, enzyme = en.cas9, kmer_column='30mer',
activity_column='score_drug_gene_rank',
name = 'Doench_2016_Test',
type = 'test',
name = 'Doench_2016',
group_column='Target gene')
return data_class

def load_meyers_2017_train():
data = pd.read_csv(os.path.join(curr_path(), 'data/Meyers_2017_Train.csv'))
data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Meyers_2017_Train.csv.zip'))
data_class = Activity_Data(data = data, enzyme = en.cas9, kmer_column='sgRNA context sequence',
activity_column='mean_activity',
name = 'Meyers_2017_Train',
type = 'train',
group_column='Gene Symbol')
group_column = 'Gene Symbol')
return data_class

def load_meyers_2017_test():
data = pd.read_csv(os.path.join(curr_path(),'data/Meyers_2017_Test.csv'))
data_class = Activity_Data(data = data, enzyme = en.cas9, kmer_column='sgRNA context sequence',
data = pd.read_csv(os.path.join(curr_path(),'data/datasets/Meyers_2017_Test.csv.zip'))
data_class = Activity_Data(data = data, enzyme=en.cas9, kmer_column='sgRNA context sequence',
activity_column='mean_activity',
name = 'Meyers_2017_Test',
type = 'test',
name='Meyers_2017_Test',
group_column='Gene Symbol')
return data_class

def load_kim_2019_train():
data = pd.read_csv(os.path.join(curr_path(),'data/Kim_2019_Train.csv'))
data = pd.read_csv(os.path.join(curr_path(),'data/datasets/Kim_2019_Train.csv.zip'))
data_class = Activity_Data(data = data, enzyme = en.cas9,
kmer_column='Target context sequence (4+20+3+3)',
activity_column='Background subtracted indel (%)',
name = 'Kim_2019_Train',
type = 'train')
name = 'Kim_2019_Train')
return data_class

def load_kim_2019_test():
data = pd.read_csv(os.path.join(curr_path(),'data/Kim_2019_Test.csv'))
data = pd.read_csv(os.path.join(curr_path(),'data/datasets/Kim_2019_Test.csv.zip'))
data_class = Activity_Data(data = data, enzyme = en.cas9,
kmer_column='Target context sequence (4+20+3+3)',
activity_column='Background subtracted indel frequencies\r(average, %)',
name = 'Kim_2019_Test',
type = 'test')
name = 'Kim_2019_Test')
return data_class

# AsCas12a datasets
def load_kim_2018_train():
    """Load the Kim 2018 AsCas12a training set as an Activity_Data object."""
    csv_path = os.path.join(curr_path(), 'data/datasets/Kim_2018_Train.csv.zip')
    return Activity_Data(data=pd.read_csv(csv_path), enzyme=en.cas12a,
                         kmer_column='Context Sequence',
                         activity_column='Indel frequency',
                         name='Kim_2018_Train')

def load_kim_2018_test():
    """Load the Kim 2018 AsCas12a held-out test set as an Activity_Data object."""
    data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Kim_2018_Test.csv.zip'))
    data_class = Activity_Data(data=data, enzyme=en.cas12a,
                               kmer_column='Context Sequence',
                               activity_column='Indel frequency',
                               # Bug fix: was 'Kim_2019_Test' (copy-paste from the
                               # Cas9 loader); must match this dataset.
                               name='Kim_2018_Test')
    return data_class

# enAsCas12a datasets


33 changes: 4 additions & 29 deletions sgrna_modeler/enzymes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,9 @@
'V': ['A', 'G', 'C'],
'N': ['A', 'C', 'T', 'G']}

def expand_seq(encoded_seq, seq_str='', seqs=None):
    """Recursively expand a degenerate IUPAC sequence into all concrete sequences.

    :param encoded_seq: string of IUPAC nucleotide codes (keys of ``nt_codes``)
    :param seq_str: prefix accumulated so far (internal to the recursion)
    :param seqs: accumulator list; a fresh list is created when omitted
    :return: list of all fully expanded nucleotide sequences
    """
    # Bug fix: the default was `seqs=[]`, a mutable default shared across
    # top-level calls, so repeated calls kept appending to the same list.
    if seqs is None:
        seqs = []
    if not encoded_seq:
        seqs.append(seq_str)
        return seqs
    for nt in nt_codes[encoded_seq[0]]:
        expand_seq(encoded_seq[1:], seq_str + nt, seqs)
    return seqs
# SpCas9 geometry: 20-nt guide starting at position 5 of a 30-nt context,
# NGG PAM starting at position 25.
cas9 = {
    'guide_start': 5,
    'guide_length': 20,
    'pam_start': 25,
    'pam': 'NGG',
    'context_length': 30,
}

def get_pam_splits(pam_list):
    """Expand each degenerate PAM string and pool every concrete PAM.

    :param pam_list: iterable of IUPAC-encoded PAM strings
    :return: set of all concrete PAM sequences
    """
    expanded = (expand_seq(pam, seq_str='', seqs=[]) for pam in pam_list)
    return set().union(*expanded)
# AsCas12a geometry: 23-nt guide starting at position 9 of a 34-nt context,
# TTTV PAM starting at position 5.
cas12a = {
    'guide_start': 9,
    'guide_length': 23,
    'pam_start': 5,
    'pam': 'TTTV',
    'context_length': 34,
}

class Enzyme(object):
    """Guide/PAM geometry of a Cas enzyme within its context sequence.

    Default parameters describe AsCas12a.
    """

    def __init__(self, guide_start=9, guide_length=23,
                 pam_start=5, pams=None, context_length=34):
        # Bug fix: `pams=['TTTN']` was a mutable default, so every instance
        # built with the default shared one list object. Passing pams=None
        # now yields a fresh list per instance; explicit callers are unaffected.
        self.guide_start = guide_start      # position where the guide begins in the context
        self.guide_length = guide_length    # guide length in nucleotides
        self.pam_start = pam_start          # position where the PAM begins in the context
        self.pams = ['TTTN'] if pams is None else pams  # IUPAC-encoded PAM strings
        self.context_length = context_length

# Bug fix: pams was ['TTTV'], which is the AsCas12a PAM; SpCas9 recognizes NGG
# (consistent with the cas9 definition used elsewhere in this package).
cas9 = Enzyme(context_length=30, guide_length=20, guide_start=5,
              pam_start=25, pams=['NGG'])
2 changes: 1 addition & 1 deletion sgrna_modeler/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def featurize_guides(kmers, features = None,
diff = features - possible_feats
assert ValueError(str(diff) + 'Are not currently supported as features')
current_path = os.path.abspath(os.path.dirname(__file__))
physio_path = os.path.join(current_path, 'physiochem.csv')
physio_path = os.path.join(current_path, 'data/features/physiochem.csv.zip')
physiochemical_data = pd.read_csv(physio_path)
k = len(kmers[0])
context_order = get_context_order(k)
Expand Down
7 changes: 0 additions & 7 deletions sgrna_modeler/keras_helper.py

This file was deleted.

125 changes: 104 additions & 21 deletions sgrna_modeler/models.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,70 @@
from sgrna_modeler import architectures as ar
from sgrna_modeler import features as fe
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from tensorflow import keras as k
import pandas as pd
import os
from joblib import load

class Model_Kim2018(object):
def __init__(self, random_state = 7, val_frac = 0.1):
def curr_path():
    """Return the directory containing this module (anchor for bundled files)."""
    here = os.path.dirname(__file__)
    return here

def get_deepcpf1_weights():
    """Return the path of the bundled Seq-DeepCpf1 Keras weight file (.h5)."""
    weights_file = 'data/saved_models/Seq_deepCpf1_weights_tf.h5'
    return os.path.join(curr_path(), weights_file)

def build_kim2018(input_shape=(34, 4)):
    """Build the Seq-deepCpf1 CNN (Kim 2018) as an uncompiled Keras model.

    :param input_shape: (context_length, 4) one-hot encoded sequence shape
    :return: keras Model mapping a one-hot sequence to a scalar activity
    """
    seq_input = k.layers.Input(shape=input_shape)
    x = k.layers.Convolution1D(80, 5, activation='relu')(seq_input)
    x = k.layers.AveragePooling1D(2)(x)
    x = k.layers.Flatten()(x)
    x = k.layers.Dropout(0.3)(x)
    # Three fully connected layers, each followed by dropout.
    for units in (80, 40, 40):
        x = k.layers.Dense(units, activation='relu')(x)
        x = k.layers.Dropout(0.3)(x)
    output = k.layers.Dense(1, activation='linear')(x)
    return k.models.Model(inputs=seq_input, outputs=output)

class Keras_sgrna_Model(object):
def __init__(self, random_state = 7, val_frac = 0.1, base_arc = None):
self.base_name = 'M_Kim_2018'
self.val_frac = val_frac
self.random_state = random_state
self.base_arc = ar.build_kim2018
if base_arc is None:
self.base_arc = build_kim2018
else:
self.base_arc = base_arc
self.train_dataset = None
self.enzyme = None
self.model = None
self.model_history = None
self.train_name = None

def load_weights(self, weights=None, name=None):
    """Instantiate the base architecture and load pretrained weights into it.

    :param weights: path to a Keras weight file; defaults to the bundled
        Seq-DeepCpf1 weights when omitted
    :param name: label recorded as the training-data name for custom weights
    :return: self, for chaining
    """
    model = self.base_arc()
    if weights is not None:
        model.load_weights(weights)
        self.train_name = name
    else:
        # Fall back to the packaged Seq-DeepCpf1 weights.
        model.load_weights(get_deepcpf1_weights())
        self.train_name = 'Seq-DeepCpf1'
    self.model = model
    return self

def fit(self, train_dataset):
self.train_datset = train_dataset
self.train_dataset = train_dataset
self.train_name = train_dataset.name
self.enzyme = train_dataset.enzyme
train_val_x, y = train_dataset.get_xy()
encoded_train_val_x = fe.encode_seqs(train_val_x)
train_x, val_x, train_y, val_y = train_test_split(encoded_train_val_x, y, test_size=self.val_frac,
random_state=self.random_state)
model = self.base_arc(input_shape = (train_dataset.enzyme.context_length,4))
model = self.base_arc(input_shape = (self.enzyme['context_length'],4))
model.compile(optimizer='RMSprop',loss='mse',metrics=['mae'])
self.model_history = model.fit(train_x, train_y, epochs = 200,
validation_data = (val_x, val_y),
Expand All @@ -39,35 +85,65 @@ def predict(self, test_dataset):
out_data['group'] = ''
out_data['prediction'] = predictions
out_data['model'] = self.base_name
out_data['training_data'] = self.train_datset.name
out_data['training_data'] = self.train_name
out_data['test_data'] = test_dataset.name
return out_data

class Model_Doench2016(object):
def __init__(self, random_state = 7, val_frac = 0.1):
def predict_seqs(self, seqs):
    """One-hot encode raw sequences and return flattened model predictions."""
    encoded = fe.encode_seqs(seqs)
    return self.model.predict(encoded).flatten()

def get_rs2():
    """Placeholder for the Rule Set 2 saved-model path.

    Returns None until an RS2 model file is bundled under data/saved_models/.
    """
    # TODO(review): return the saved RS2 model path once the file is committed.
    return None

def get_enPAM_GB():
    """Return the path of the bundled enPAM gradient-boosted model (.joblib)."""
    model_file = 'data/saved_models/enPAM_GB.joblib'
    return os.path.join(curr_path(), model_file)

class SKLearn_sgrna_Model(object):
def __init__(self, val_frac = 0.1, model = None, features = None):
self.base_name = 'M_Doench_2016'
self.random_state = random_state
self.val_frac = val_frac
self.model = ensemble.GradientBoostingRegressor(n_iter_no_change=20,
validation_fraction = self.val_frac)
self.features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer',
'Pos. Dep. 1mer', 'Pos. Dep. 2mer',
'GC content', 'Tm']
if model is None:
# Gradient boosted model
self.model = ensemble.GradientBoostingRegressor(n_iter_no_change=20,
validation_fraction = self.val_frac)
else:
self.model = model
if features is None:
# Default features for RuleSet2
self.features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'GC content', 'Tm']
else:
self.features = features
self.enzyme = None
self.train_dataset = None
self.train_name = None


def load_model(self, model, enzyme, name):
    """Load a persisted sklearn model from a joblib file.

    :param model: path to a joblib-serialized estimator
    :param enzyme: enzyme definition dict used for featurization
    :param name: label recorded as the training-data name
    :return: self, for chaining
    """
    self.model = load(model)
    self.enzyme = enzyme
    self.train_name = name
    return self

def fit(self, train_dataset):
self.train_datset = train_dataset
self.train_name = train_dataset.name
self.enzyme = train_dataset.enzyme
train_val_x, y = train_dataset.get_xy()
featurized_train_val_x = fe.featurize_guides(train_val_x, features=self.features,
guide_start = train_dataset.enzyme.guide_start,
guide_length = train_dataset.enzyme.guide_length)
guide_start = self.enzyme['guide_start'],
guide_length = self.enzyme['guide_length'])
self.model.fit(featurized_train_val_x, y)
return self

def predict(self, test_dataset):
x, y = test_dataset.get_xy()
featurized_x = fe.featurize_guides(x, features=self.features,
guide_start=test_dataset.enzyme.guide_start,
guide_length=test_dataset.enzyme.guide_length)
guide_start=test_dataset.enzyme['guide_start'],
guide_length=test_dataset.enzyme['guide_length'])
predictions = self.model.predict(featurized_x)
out_data = pd.DataFrame({'kmer': x, 'y': y})
if test_dataset.group_column:
Expand All @@ -76,7 +152,14 @@ def predict(self, test_dataset):
out_data['group'] = ''
out_data['prediction'] = predictions
out_data['model'] = self.base_name
out_data['training_data'] = self.train_datset.name
out_data['training_data'] = self.train_name
out_data['test_data'] = test_dataset.name
return out_data

def predict_seqs(self, seqs):
    """Featurize raw guide sequences and return model predictions."""
    feats = fe.featurize_guides(
        seqs,
        features=self.features,
        guide_start=self.enzyme['guide_start'],
        guide_length=self.enzyme['guide_length'],
    )
    return self.model.predict(feats)

17 changes: 0 additions & 17 deletions sgrna_modeler/physiochem.csv

This file was deleted.

Loading

0 comments on commit 23703c4

Please sign in to comment.