-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathskeleton_script_roc_plot_tsv.py
166 lines (134 loc) · 5.53 KB
/
skeleton_script_roc_plot_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Description:
Draws a ROC plot from three .tsv files.
It should be executed by specifying three input predictor .tsv files from one of the ClinVar datasets
with HGVS IDs and scores (three predictors: SIFT, PolyPhen, and BLOSUM62).
-ititle or --plot_title (optional): with this argument followed by a title string, a title provided by the user
will be added to plot. Otherwise, the default will be used.
Execute with:
python3 skeleton_script_plot_tsv.py -itsv <ROCplot_filename_sift>_xy.tsv -itsv <ROCplot_filename_polyphen>_xy.tsv
-itsv <ROCplot_filename_baseline>_xy.tsv -o <ROCplot_filename_all>_xy.tsv>.png
"""
import argparse
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
def parse_args():
"""
Parses inputs from the commandline.
:return: inputs as a Namespace object
"""
parser = argparse.ArgumentParser(description="Draws and saves a ROC plot for three different predictors "
"(SIFT, Polyphen, and BLOSUM62) in one figure "
"using three .tsv files from one of the three datasets")
parser.add_argument("-itsv", "--tsv_file_of_coordinates", help="tab-separated .tsv file with ROC coordinates. "
"This arguments is required!",
action='append', required=True)
parser.add_argument("-ititle", "--plot_title", help="title of the ROC plot (a default str will be provided)",
required=False)
parser.add_argument("-o", "--output_png", help="output .png file path. This arguments is required!", required=True)
return parser.parse_args()
def parse_tsv_file(filename):
"""
Parses .tsv file
:param filename: a str with the input .tsv file name
:return: list of fpr values and list of tpr values
"""
global type_vep
if 'sift' in filename:
type_vep = 'sift'
elif 'polyphen' in filename:
type_vep = 'polyphen'
elif 'baseline' in filename:
type_vep = 'BLOSUM'
else:
type_vep = ''
fpr = []
tpr = []
with open(filename, 'r') as f:
for line in f:
line = line.rstrip()
arr = line.split("\t")
if len(arr) != 2:
print("Warning: the following line does not have two elements separated by a tab:\n", line)
fpr.append(float(arr[0]))
tpr.append(float(arr[1]))
return fpr, tpr
def integrate(fpr, tpr):
"""
Calculate the Area Under the Curve (AUC) for a given list of coordinates.
:param fpr: a list of fpr
:param tpr: a list of tpr
:return: a float with AUC
"""
auc = 0.
last_fpr = fpr[0]
last_tpr = tpr[0]
for cur_fpr, cur_tpr in list(zip(fpr, tpr))[1:]:
#########################
### START CODING HERE ###
#########################
# Trapezoid rule
auc += (cur_fpr - last_fpr) * (cur_tpr+last_tpr)/2
#########################
### END CODING HERE ###
#########################
last_fpr = cur_fpr
last_tpr = cur_tpr
return auc
def roc_plot_together(list_fpr, list_tpr, labels, out_filepath, title):
"""
Draws a ROC plot for three predictors in one figure without gradient color.
:param list_tpr: a list of lists with TPRs
:param list_fpr: a list of lists with FPRs
:param label: a list with labels for each of the three ROC curves
:param out_filepath: a str with the output .png file path
:param title: a str with the title of the plot
"""
lw = 1
list_color = ['g', 'r', 'm']
figure, axes = matplotlib.pyplot.subplots(1, 1)
for tpr, fpr, color, label in zip(list_tpr, list_fpr, list_color, labels):
auc = integrate(fpr, tpr)
line_label = '{} (AUC= {:.3f})'.format(label, auc)
axes.plot(fpr, tpr, c=color, label=line_label)
axes.plot((0, 1), (0, 1), '--', color='navy', lw=lw, linestyle='--', label='Random')
axes.set_xlim([-0.008, 1.008])
axes.set_ylim([-0.008, 1.008])
axes.legend()
axes.set_title(title)
axes.set_xlabel('False Positive Rate')
axes.set_ylabel('True Positive Rate')
matplotlib.pyplot.savefig(out_filepath)
def main():
# Process arguments
args = parse_args()
tsv_files = args.tsv_file_of_coordinates
out_filepath = args.output_png
title = args.plot_title
if title is None:
title = 'ROC of three impact predictors'
out_dir, out_filename = os.path.split(out_filepath)
# Check if output filename contains .png extension
if '.png' not in out_filename:
sys.exit('Error: filename in the output file path argument should contain .png extension!')
# Check if output directory exists
if not os.path.exists(out_dir):
sys.exit(r'Error: output directory %s to store the ROC plot does not exist! You should create one beforehand.'
% out_dir)
if len(tsv_files) != 3:
sys.exit('Error: to plot baseline, sift, polyphen all together, '
'please input three .tsv files by adding -itsv before each file')
else:
labels = []
list_fpr = []
list_tpr = []
for coordinate_list in tsv_files:
fpr, tpr = parse_tsv_file(coordinate_list)
labels.append(type_vep)
list_fpr.append(fpr)
list_tpr.append(tpr)
roc_plot_together(list_fpr, list_tpr, labels, out_filepath, title)
if __name__ == "__main__":
main()