# annotator_parser.py
import pandas as pd
import os
import json
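# Biolink InfoRes identifiers used for edge provenance, plus the KGinfo URL prefix
# used to build per-edge source record links.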
attribute_source = "infores:multiomics-drugapprovals"
faers = "infores:faers"
dailymed = "infores:dailymed"
kgInfoUrl = "https://db.systemsbiology.net/gestalt/cgi-pub/KGinfo.pl?id="
def load_content(data_folder):
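    """Parse the drug-approvals KG node/edge TSVs and yield (subject, record) pairs.

    Each record holds the approval status, a disease object keyed by the object's
    lower-cased CURIE prefix, the edge id, and the KGinfo source record URL.
    """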
edges_file_path = os.path.join(data_folder, "drug_approvals_kg_edges_v0.3.7.tsv")
nodes_file_path = os.path.join(data_folder, "drug_approvals_kg_nodes_v0.3.7.tsv")
nodes_data = pd.read_csv(nodes_file_path, sep='\t')
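    # Build lookups from node id to human-readable name and Biolink category.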
id_name_mapping = {}
id_type_mapping = {}
    for _, row in nodes_data.iterrows():
id_name_mapping[row["id"]] = row["name"]
id_type_mapping[row["id"]] = row["category"]
edges_data = pd.read_csv(edges_file_path, sep='\t')
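    # Walk the edges and yield one annotated record per subject.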
    for _, line in edges_data.iterrows():
subj = line['subject']
pred = line['predicate']
obj = line['object']
        # Require CURIE-style identifiers (prefix:local_id) for both subject and object.
        if isinstance(subj, str) and isinstance(obj, str) and pred and ':' in subj and ':' in obj:
source_record_url = kgInfoUrl + line['id']
prefix = obj.split(':')[0].replace(".","_")
disease = {
prefix.lower(): obj,
"name": id_name_mapping[obj],
}
# properties for predicate/association
edge_attributes = []
# approval status
status = ""
# sources
edge_sources = []
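            # 'biolink:treats' edges are approved indications, with this resource as the
            # primary knowledge source; all other predicates are recorded as not approved,
            # with FAERS as the primary knowledge source.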
if pred == 'biolink:treats':
status = "approved_for_condition"
edge_sources = [
{
"resource_id": attribute_source,
"resource_role": "primary_knowledge_source",
"source_record_urls": [ source_record_url ]
},
{
"resource_id": dailymed,
"resource_role": "supporting_data_source"
},
{
"resource_id": faers,
"resource_role": "supporting_data_source"
}
]
else:
status = "not_approved_for_condition"
edge_sources = [
{
"resource_id": attribute_source,
"resource_role": "aggregator_knowledge_source",
"source_record_urls": [ source_record_url ]
},
{
"resource_id": faers,
"resource_role": "primary_knowledge_source"
},
{
"resource_id": dailymed,
"resource_role": "supporting_data_source"
}
]
# Yield subject, predicate, and object properties
data = {
"status": status,
"disease": disease,
"edge_id": line['id'],
#"sources": edge_sources
"source_record_urls": [ source_record_url ]
}
yield subj, data
else:
print(f"Cannot find prefix for {line} !")
def load_data(data_folder):
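    """Group records from load_content() by subject and yield one document per
    subject, shaped as {"_id": <subject>, "clinical_approval": [<records>]}.
    """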
output = {}
final = []
    for subj, entry in load_content(data_folder):
        if subj in output:
            output[subj].append(entry)
        else:
            output[subj] = [entry]
for key in output:
final.append({"_id": key, "clinical_approval": output[key]})
for entry in final:
yield entry
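# A sketch of one document yielded by load_data(); the identifier and name values
# below are made up for illustration, actual values come from the TSV files:
#
# {
#   "_id": "CHEMBL.COMPOUND:CHEMBL25",
#   "clinical_approval": [
#     {
#       "status": "approved_for_condition",
#       "disease": {"mondo": "MONDO:0005148", "name": "type 2 diabetes mellitus"},
#       "edge_id": "<edge id>",
#       "source_record_urls": ["https://db.systemsbiology.net/gestalt/cgi-pub/KGinfo.pl?id=<edge id>"]
#     }
#   ]
# }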
def main():
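    """Parse the TSVs from a local 'test' folder and pretty-print each document."""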
    for entry in load_data('test'):
        print(json.dumps(entry, sort_keys=True, indent=2))
if __name__ == '__main__':
main()