diff --git a/models/logistic_model.sav b/models/logistic_model.sav new file mode 100644 index 0000000..9fb9a84 Binary files /dev/null and b/models/logistic_model.sav differ diff --git a/pipeline/constants.py b/pipeline/constants.py index 48b531d..20b2562 100644 --- a/pipeline/constants.py +++ b/pipeline/constants.py @@ -9,6 +9,8 @@ max_frag = 10**6 DIRECTORY_INPUT = 0 FILE_INPUT = 1 +CHROM_PRED_UPPER_BOUND = 0.3 +PLAS_PRED_LOWER_BOUND = 0.7 # progress bar descriptions frag_bar_desc = "fragments " @@ -51,6 +53,7 @@ kmer_model_path = 'models/kmerModule100K.pth' biomer_model_path = 'models/biomer_model.sav' biomer_scalar_path = 'models/biomer_minmax_scaler.sav' +logistic_model_path = 'models/logistic_model.sav' inputFeaturesSize = int((4**k) / 2) layer_array = [512, 256] diff --git a/pipeline/predict/predict.py b/pipeline/predict/predict.py index ac9e006..43a575e 100644 --- a/pipeline/predict/predict.py +++ b/pipeline/predict/predict.py @@ -1,144 +1,10 @@ -from tqdm import tqdm -from sklearn.ensemble import RandomForestClassifier -from sklearn.preprocessing import MinMaxScaler -from scipy.interpolate import make_interp_spline -import numpy as np -import pandas as pd -from numpy.random import randint -import torch -import torch.nn as nn -import matplotlib.pyplot as plt -import seaborn as sns -import pickle -import os - -from pipeline.model.NNModule import Model +from pipeline.predict.predict_nn_rf import predict_nn_rf +from pipeline.predict.predict_combined import predict_combined from pipeline.constants import * -kmer_model = None -biomer_model = None - - -def setup_kmer_model(model_path): - model = Model(inputFeaturesSize, layer_array, outputSize) - model.load_state_dict(torch.load(model_path)) - model.double() - model.eval() - return model - - -def setup_biomer_model(model_path): - model = RandomForestClassifier( - n_estimators=10, max_depth=10, random_state=1) - model = pickle.load(open(model_path, 'rb')) - return model - - -def get_sequence_data(scaler_path): - df = pd.read_csv(all_results_path+'/data.csv') - features = list(df.columns) - features.remove('id') - features.sort() - scaler = MinMaxScaler() - scaler = pickle.load(open(scaler_path, 'rb')) - df[features] = scaler.transform(df[features]) - return df, features - - -def read_kmer_file(seq_id): - array = np.genfromtxt(all_results_path+"/kmers/"+seq_id, dtype=np.float64) - if(len(array.shape) == 1): - temp = [] - temp.append(array) - array = temp - return array - - -def get_prediction(value, model): - yb = model(value) - # print(yb) - _, preds = torch.max(yb, dim=0) - # print(_.item(),preds.item()) - return preds.item(), yb - - -def predict_kmer(sequence_id): - kmer_arrays = read_kmer_file(sequence_id) - # print(f'\nseq_id: {sequence_id} count: {len(kmer_arrays)}') - total = 0 - tot_probs = torch.tensor([0.0, 0.0]) - for m in kmer_arrays: - prediction, prob_tensor = get_prediction(torch.tensor(m), kmer_model) - tot_probs += prob_tensor - total += prediction - # print(f'kmer_plasmid_avg: {total/len(kmer_arrays)} prob_total = {tot_probs}') - probs_list = (tot_probs/(len(kmer_arrays))).tolist() - probs_list.insert(0, len(kmer_arrays)) - probs_list.append(total/len(kmer_arrays)) - return probs_list - - def predict(out_path): - global kmer_model - predictions = [] - biomer_model = setup_biomer_model(biomer_model_path) - kmer_model = setup_kmer_model(kmer_model_path) - sequence_df, features = get_sequence_data(biomer_scalar_path) - - seq_list = list(sequence_df['id'].unique()) - for seq in tqdm(seq_list): - kmer_prediction = predict_kmer(seq) - full_prediction = [seq] - full_prediction.extend(kmer_prediction) - # print(full_prediction) - selected = sequence_df.loc[sequence_df['id'] == seq][features] - full_prediction.extend( - (biomer_model.predict_proba(selected))[0].tolist()) - full_prediction.append((biomer_model.predict(selected)).item()) - predictions.append(full_prediction) - # print( - # f"biomer result: {biomer_model.predict(selected)} biomer result probs: {biomer_model.predict_proba(selected)}") - # print(full_prediction) - print('Writing predictions...') - predictions_df = pd.DataFrame(predictions, columns=['seq_id', 'fragment_count', 'kmer_chro_prob', - 'kmer_plas_prob', 'kmer_prediction_avg', 'biomer_chro_prob', 'biomer_plas_prob', 'biomer_prediction']) - - predictions_df['sum'] = ( - predictions_df['kmer_plas_prob'] + predictions_df['biomer_plas_prob'])/2 - predictions_df['product'] = ( - predictions_df['kmer_plas_prob'] * predictions_df['biomer_plas_prob'])**0.5 - predictions_df.to_csv(os.path.join( - out_path, 'predictions.csv'), index=False) - - print('Plotting graphs...') - - sns.scatterplot(data=predictions_df, - x="kmer_plas_prob", y="biomer_plas_prob", alpha=0.4) - - xu = np.array([0.25, 0.35355, 0.5, 0.7071, 1]) - yu = np.array([1, 0.7071, 0.5, 0.35355, 0.25]) - X_Y_Spline = make_interp_spline(xu, yu) - X_ = np.linspace(xu.min(), xu.max(), 500) - Y_ = X_Y_Spline(X_) - plt.plot(X_, Y_) - - xu = np.array([0, 0.2, 0.4, 0.6, 0.8, 1]) - yu = np.array([1, 0.8, 0.6, 0.4, 0.2, 0]) - X_Y_Spline = make_interp_spline(xu, yu) - X_ = np.linspace(xu.min(), xu.max(), 500) - Y_ = X_Y_Spline(X_) - plt.plot(X_, Y_) + predict_nn_rf(out_path) + predict_combined(out_path) - plt.savefig(os.path.join(out_path, 'predictions_scatter.png')) - plt.clf() - sns.histplot(data=predictions_df, x="sum") - plt.savefig(os.path.join(out_path, 'predictions_sum.png')) - plt.clf() - sns.histplot(data=predictions_df, x="product") - plt.savefig(os.path.join(out_path, 'predictions_predict.png')) - plt.clf() - - - if __name__ == "__main__": predict(all_results_path) diff --git a/pipeline/predict/predict_combined.py b/pipeline/predict/predict_combined.py new file mode 100644 index 0000000..655621f --- /dev/null +++ b/pipeline/predict/predict_combined.py @@ -0,0 +1,46 @@ +import pickle +from tqdm import tqdm +import os +import pandas as pd + +from pipeline.constants import * + +def setup_logistic_model(model_path): + pipe = pickle.load(open(model_path, 'rb')) + return pipe + +def get_feature_data(out_path): + df = pd.read_csv(os.path.join(out_path, 'predictions.csv')) + features = ["fragment_count", "kmer_plas_prob", "biomer_plas_prob"] + return df, features + +def get_sequence_class(prob): + if(prob<=CHROM_PRED_UPPER_BOUND): + return 'chromosome' + if(prob>=PLAS_PRED_LOWER_BOUND): + return 'plasmid' + return 'unclassified' + +def predict_combined(out_path): + predictions = [] + logistic_model = setup_logistic_model(logistic_model_path) + data_df, features = get_feature_data(out_path) + + data_list = list(data_df['seq_id'].unique()) + for seq in tqdm(data_list): + full_prediction = [seq] + selected = data_df.loc[data_df['seq_id'] == seq][features] + proba = logistic_model.predict_proba(selected)[0][1] + seq_class = get_sequence_class(proba) + full_prediction.append(proba) + full_prediction.append(seq_class) + predictions.append(full_prediction) + + predictions_df = pd.DataFrame(predictions, columns=['seq_id', 'plas_prob', 'class']) + data_df['final_plas_prob'] = predictions_df['plas_prob'] + data_df['class'] = predictions_df['class'] + print('Writing combined predictions...') + data_df.to_csv(os.path.join(out_path, 'predictions.csv'), index=False) + +if __name__ == "__main__": + predict_combined(all_results_path) diff --git a/pipeline/predict/predict_nn_rf.py b/pipeline/predict/predict_nn_rf.py new file mode 100644 index 0000000..71399d0 --- /dev/null +++ b/pipeline/predict/predict_nn_rf.py @@ -0,0 +1,144 @@ +from tqdm import tqdm +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import MinMaxScaler +from scipy.interpolate import make_interp_spline +import numpy as np +import pandas as pd +from numpy.random import randint +import torch +import torch.nn as nn +import matplotlib.pyplot as plt +import seaborn as sns +import pickle +import os + +from pipeline.model.NNModule import Model +from pipeline.constants import * + +kmer_model = None +biomer_model = None + + +def setup_kmer_model(model_path): + model = Model(inputFeaturesSize, layer_array, outputSize) + model.load_state_dict(torch.load(model_path)) + model.double() + model.eval() + return model + + +def setup_biomer_model(model_path): + model = RandomForestClassifier( + n_estimators=10, max_depth=10, random_state=1) + model = pickle.load(open(model_path, 'rb')) + return model + + +def get_sequence_data(scaler_path): + df = pd.read_csv(all_results_path+'/data.csv') + features = list(df.columns) + features.remove('id') + features.sort() + scaler = MinMaxScaler() + scaler = pickle.load(open(scaler_path, 'rb')) + df[features] = scaler.transform(df[features]) + return df, features + + +def read_kmer_file(seq_id): + array = np.genfromtxt(all_results_path+"/kmers/"+seq_id, dtype=np.float64) + if(len(array.shape) == 1): + temp = [] + temp.append(array) + array = temp + return array + + +def get_prediction(value, model): + yb = model(value) + # print(yb) + _, preds = torch.max(yb, dim=0) + # print(_.item(),preds.item()) + return preds.item(), yb + + +def predict_kmer(sequence_id): + kmer_arrays = read_kmer_file(sequence_id) + # print(f'\nseq_id: {sequence_id} count: {len(kmer_arrays)}') + total = 0 + tot_probs = torch.tensor([0.0, 0.0]) + for m in kmer_arrays: + prediction, prob_tensor = get_prediction(torch.tensor(m), kmer_model) + tot_probs += prob_tensor + total += prediction + # print(f'kmer_plasmid_avg: {total/len(kmer_arrays)} prob_total = {tot_probs}') + probs_list = (tot_probs/(len(kmer_arrays))).tolist() + probs_list.insert(0, len(kmer_arrays)) + probs_list.append(total/len(kmer_arrays)) + return probs_list + + +def predict_nn_rf(out_path): + global kmer_model + predictions = [] + biomer_model = setup_biomer_model(biomer_model_path) + kmer_model = setup_kmer_model(kmer_model_path) + sequence_df, features = get_sequence_data(biomer_scalar_path) + + seq_list = list(sequence_df['id'].unique()) + for seq in tqdm(seq_list): + kmer_prediction = predict_kmer(seq) + full_prediction = [seq] + full_prediction.extend(kmer_prediction) + # print(full_prediction) + selected = sequence_df.loc[sequence_df['id'] == seq][features] + full_prediction.extend( + (biomer_model.predict_proba(selected))[0].tolist()) + full_prediction.append((biomer_model.predict(selected)).item()) + predictions.append(full_prediction) + # print( + # f"biomer result: {biomer_model.predict(selected)} biomer result probs: {biomer_model.predict_proba(selected)}") + # print(full_prediction) + print('Writing NN, RF predictions...') + predictions_df = pd.DataFrame(predictions, columns=['seq_id', 'fragment_count', 'kmer_chro_prob', + 'kmer_plas_prob', 'kmer_prediction_avg', 'biomer_chro_prob', 'biomer_plas_prob', 'biomer_prediction']) + + predictions_df['sum'] = ( + predictions_df['kmer_plas_prob'] + predictions_df['biomer_plas_prob'])/2 + predictions_df['product'] = ( + predictions_df['kmer_plas_prob'] * predictions_df['biomer_plas_prob'])**0.5 + predictions_df.to_csv(os.path.join( + out_path, 'predictions.csv'), index=False) + + print('Plotting graphs...') + + sns.scatterplot(data=predictions_df, + x="kmer_plas_prob", y="biomer_plas_prob", alpha=0.4) + + xu = np.array([0.25, 0.35355, 0.5, 0.7071, 1]) + yu = np.array([1, 0.7071, 0.5, 0.35355, 0.25]) + X_Y_Spline = make_interp_spline(xu, yu) + X_ = np.linspace(xu.min(), xu.max(), 500) + Y_ = X_Y_Spline(X_) + plt.plot(X_, Y_) + + xu = np.array([0, 0.2, 0.4, 0.6, 0.8, 1]) + yu = np.array([1, 0.8, 0.6, 0.4, 0.2, 0]) + X_Y_Spline = make_interp_spline(xu, yu) + X_ = np.linspace(xu.min(), xu.max(), 500) + Y_ = X_Y_Spline(X_) + plt.plot(X_, Y_) + + plt.savefig(os.path.join(out_path, 'predictions_scatter.png')) + plt.clf() + sns.histplot(data=predictions_df, x="sum") + plt.savefig(os.path.join(out_path, 'predictions_sum.png')) + plt.clf() + sns.histplot(data=predictions_df, x="product") + plt.savefig(os.path.join(out_path, 'predictions_predict.png')) + plt.clf() + + + +if __name__ == "__main__": + predict_nn_rf(all_results_path)