Skip to content

Commit

Permalink
Merge pull request #10 from MetaGSC/seqlen2
Browse files Browse the repository at this point in the history
Add seq length to logistic regression model
  • Loading branch information
gayaldassanayake authored Feb 7, 2022
2 parents ffdb385 + 1e53063 commit 7a6f6f9
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 5 deletions.
Binary file modified models/logistic_model.sav
Binary file not shown.
Binary file added models/logistic_model_v2.1.sav
Binary file not shown.
1 change: 1 addition & 0 deletions pipeline/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
kmer_write_path = "results/kmers"
circ_write_path = "results/circular"
dataset_path = "results/data.csv"
seqlen_path = "results/seqlen.csv"

# temp paths
all_temp_path = "temp"
Expand Down
14 changes: 11 additions & 3 deletions pipeline/predict/predict_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@ def setup_logistic_model(model_path):
pipe = pickle.load(open(model_path, 'rb'))
return pipe

def get_sequence_lengths(out_path):
    """Load the per-sequence length table produced by preprocessing.

    Reads ``seqlen.csv`` from *out_path* and returns it as a DataFrame;
    the file is written with ``seq_id`` and ``length`` columns.
    """
    seqlen_csv = os.path.join(out_path, 'seqlen.csv')
    return pd.read_csv(seqlen_csv)

def get_feature_data(out_path):
    """Assemble the feature matrix for the combined logistic model.

    Reads ``predictions.csv`` from *out_path*, left-joins the sequence
    lengths (from ``seqlen.csv`` via ``get_sequence_lengths``) on
    ``seq_id``, and returns the merged DataFrame together with the list
    of feature column names fed to the model.
    """
    feature_df = pd.read_csv(os.path.join(out_path, 'predictions.csv'))
    len_df = get_sequence_lengths(out_path)
    # Left join keeps every predicted sequence even if its length is missing.
    df = pd.merge(feature_df, len_df, on='seq_id', how="left")
    features = ["fragment_count", "kmer_plas_prob", "biomer_plas_prob", "length"]
    return df, features

def get_sequence_class(prob):
Expand All @@ -31,6 +37,8 @@ def predict_combined(out_path):
for seq in tqdm(data_list):
full_prediction = [seq]
selected = data_df.loc[data_df['seq_id'] == seq][features]
# length = len_df.loc[len_df['seq_id'] == seq]['length'].values[0]

proba = logistic_model.predict_proba(selected)[0][1]
seq_class = get_sequence_class(proba)
full_prediction.append(proba)
Expand All @@ -41,7 +49,7 @@ def predict_combined(out_path):
data_df['final_plas_prob'] = predictions_df['plas_prob']
data_df['class'] = predictions_df['class']
print('Writing combined predictions...')
data_df.to_csv(os.path.join(out_path, 'predictions.csv'), index=False)
data_df.to_csv(os.path.join(out_path, 'final_predictions.csv'), index=False)

if __name__ == "__main__":
out_path = sys.argv[1]
Expand Down
2 changes: 1 addition & 1 deletion pipeline/preprocess/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def generate_seq_map(files):
for file in files:
try:
for record in SeqIO.parse(file, 'fasta'):
seq_map[record.id] = {'filepath':file}
seq_map[record.id] = {'filepath':file, 'length':len(record.seq)}
except Exception as err:
print_error(f"Error reading fasta file {file}: {err}")
return seq_map
Expand Down
10 changes: 9 additions & 1 deletion pipeline/preprocess/write_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ def write_dataset(seq_map):
]

rows = []
len_rows = []
for key in seq_map.keys():
row = [seq_map[key][x] for x in feature_keys[1:]]
len_rows.append([key, seq_map[key]['length']])
row.insert(0, key)
rows.append(row)

Expand All @@ -27,7 +29,13 @@ def write_dataset(seq_map):
writer = csv.writer(f)
writer.writerow(feature_keys)
writer.writerows(rows)

with open(seqlen_path, 'w') as f:
writer = csv.writer(f)
writer.writerow(["seq_id", "length"])
writer.writerows(len_rows)

print_log("Writing biomarker features completed\n")

except IOError:
print_error("I/O error while writing the biomarkers.")

0 comments on commit 7a6f6f9

Please sign in to comment.