-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
97 lines (80 loc) · 3.15 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import sys
import time
import xml.etree.ElementTree as ET
from collections import OrderedDict
from runner import logging
# Number of bytes in one (binary) gigabyte; a run's byte size is divided by
# this value to produce the "SizeGB" column.
GB_IN_BYTES = 1024**3
def return_text(element: ET.Element) -> str | None:
return None if element == None else element.text
def extract_data(record: ET.Element) -> list[OrderedDict]:
    """Extract the important information from an EXPERIMENT_PACKAGE record.

    Experiment, library, study and sample fields are collected once and then
    copied into one output record per ``RUN_SET/RUN`` element. Any element or
    attribute that is absent from the XML is reported as ``None`` rather than
    raising.

    Args:
        record (ET.Element): xml element of the EXPERIMENT_PACKAGE record

    Returns:
        list[OrderedDict]: one ordered mapping per run, in document order;
        empty when the record contains no RUN elements.

    Raises:
        ValueError: if a run's ``size`` attribute is present but is not an
            integer literal.
    """

    def accession_of(path: str) -> str | None:
        # "accession" attribute of the first element at ``path``, if any.
        element = record.find(path)
        return None if element is None else element.attrib.get("accession")

    data_rec = OrderedDict()
    data_rec["Experiment"] = accession_of("EXPERIMENT")

    # library info
    lib_descriptor = record.find("EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR")

    def lib_child(tag: str) -> ET.Element | None:
        # Child of the library descriptor; None when either one is missing.
        return None if lib_descriptor is None else lib_descriptor.find(tag)

    data_rec["LibraryStrategy"] = return_text(lib_child("LIBRARY_STRATEGY"))
    data_rec["LibrarySelection"] = return_text(lib_child("LIBRARY_SELECTION"))
    data_rec["LibrarySource"] = return_text(lib_child("LIBRARY_SOURCE"))

    # The layout value is the tag name of the first child of LIBRARY_LAYOUT.
    # A missing or childless LIBRARY_LAYOUT yields None instead of an error.
    layout = lib_child("LIBRARY_LAYOUT")
    first_layout = None if layout is None else next(iter(layout), None)
    data_rec["LibraryLayout"] = None if first_layout is None else first_layout.tag

    # study info
    data_rec["SRAStudy"] = return_text(record.find("STUDY/IDENTIFIERS/PRIMARY_ID"))
    data_rec["BioProject"] = return_text(record.find("STUDY/IDENTIFIERS/EXTERNAL_ID"))

    # sample info
    data_rec["Sample"] = accession_of("SAMPLE")
    data_rec["BioSample"] = return_text(record.find("SAMPLE/IDENTIFIERS/EXTERNAL_ID"))
    data_rec["ScientificName"] = return_text(
        record.find("SAMPLE/SAMPLE_NAME/SCIENTIFIC_NAME")
    )

    # run info for the experiment
    run_records = []
    for run in record.findall("RUN_SET/RUN"):
        # Each run starts from a copy of the shared experiment-level fields.
        data_rec_n = data_rec.copy()
        raw_size = run.attrib.get("size")
        # Size is given in bytes; report it in GB rounded to 4 decimals.
        size = (
            None
            if raw_size is None
            else str(round(int(raw_size) / GB_IN_BYTES, 4))
        )
        data_rec_n.update(
            {
                "Run": run.attrib.get("accession"),
                "Bases": run.attrib.get("total_bases"),
                "SizeGB": size,
                "Published": run.attrib.get("published"),
                "IsPublic": run.attrib.get("is_public"),
            }
        )
        run_records.append(data_rec_n)
    return run_records
# Wrapper lines of the EXPERIMENT_PACKAGE_SET document. They are dropped so
# that every EXPERIMENT_PACKAGE can be parsed as a standalone XML fragment.
# Built once here instead of on every input line.
SKIPPED_LINES = frozenset(
    (
        '<?xml version="1.0" encoding="UTF-8" ?>',
        "<!DOCTYPE EXPERIMENT_PACKAGE_SET>",
        "<EXPERIMENT_PACKAGE_SET>",
    )
)

# Stream the XML from stdin, parse one EXPERIMENT_PACKAGE at a time and write
# the extracted run records to stdout as tab-separated rows.
logging.info("Record parsing started...")
start_run = time.time()
record = []  # lines of the package currently being accumulated
count = 0  # number of EXPERIMENT_PACKAGE records parsed so far
header_written = False
for line in sys.stdin:
    line = line.strip()
    if line in SKIPPED_LINES:
        continue
    record.append(line)
    # find the end of the experiment package xml record
    if line == "</EXPERIMENT_PACKAGE>":
        parsed_records = extract_data(ET.fromstringlist(record))
        record = []
        count += 1
        # Write the column header once, taken from the first package that
        # actually produced rows. A package without runs gives an empty list,
        # so indexing it unconditionally on the first package would fail.
        if parsed_records and not header_written:
            print("\t".join(parsed_records[0].keys()))
            header_written = True
        for rec in parsed_records:
            print("\t".join(str(value) for value in rec.values()), file=sys.stdout)
        # Progress report every 1000 packages.
        if count % 1000 == 0:
            logging.info(f"{count} records parsed in {round(time.time()-start_run)}s")
logging.info(f"Run records parsed = {count}")
sys.exit(0)