-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparse-xml
executable file
·124 lines (110 loc) · 4.46 KB
/
parse-xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
from ipaddress import IPv4Network, IPv4Address, IPv6Network, IPv6Address
from hashlib import sha1
import gzip
import json
import os
import sys
import tempfile
import time
import xml.etree.cElementTree as ET
# Fixed timestamp that main() stores as `mtime` in the gzip header instead of the current time.
RKN_EPOCH = 1343462400 # Sat Jul 28 12:00:00 MSK 2012
# Appended to the SHA-1 hex digest of the blob to form the file name that main() stores in the gzip header.
JSONZ_SUFFIX = '.kabysdoh.json'
# Compression level that main() passes to gzip.GzipFile for the output file.
GZIP_LEVEL = 3 # {1,2,3} have ~same speed, 4 is 40% slower.
# Parsing of dump.xml is tricky. E.g. what does ipSubnet mean for blockType="domain"?
#
# <content id="114111" includeTime="2014-11-12T22:04:31" entryType="1" blockType="domain" hash="D993259AF65653E2E7EAFA613DAFCB0E">
# <decision date="2014-11-05" number="2/1/11-28574" org="ФСКН"/>
# <domain><![CDATA[legalrc.biz]]></domain>
# <ip>193.105.213.21</ip>
# <ip>193.105.213.25</ip>
# <ip>193.105.213.26</ip>
# <ip>193.105.213.29</ip>
# <ip>193.105.213.241</ip>
# <ip>193.105.213.35</ip>
# <ip>193.105.213.27</ip>
# <ip>193.105.213.40</ip>
# <ipSubnet>193.105.213.36/30</ipSubnet>
# </content>
def _is_ip_blocked(el):
    """Tell whether the <content> element `el` should be treated as blocked by IP."""
    block_type = el.attrib.get('blockType')  # 'domain', 'domain-mask', 'ip' and None
    if block_type in ('domain', 'domain-mask'):
        return False
    if block_type == 'ip':
        return True
    # Default blockType handling is a bit heuristical.
    if block_type is not None:
        print('Unknown <content blockType="{:s}">'.format(block_type), file=sys.stderr)
    # An entry with <url/> or <domain/> is _usually_ passed through DPI and its <ip/> is ignored.
    return all(child.tag not in ('url', 'domain') for child in el)


def _without_covered(addresses, networks):
    """Return the sorted `addresses` that are not contained in any of `networks`."""
    return sorted(addr for addr in addresses
                  if not any(addr in net for net in networks))


def parse_xml(file):
    """Extract IP-blocked addresses and networks from a dump.xml document.

    Only <content> entries that are considered blocked by IP contribute to the
    result: blockType="ip", or a missing/unknown blockType with neither <url>
    nor <domain> children. An unknown blockType is reported to stderr.

    Args:
        file: path to the XML document or a binary file object with it.

    Returns:
        A tuple `(ip4, net4, ip6, net6)` of sorted lists holding IPv4Address,
        IPv4Network, IPv6Address and IPv6Network objects respectively.
        Networks are normalized (host bits are dropped) and addresses that are
        already covered by one of the returned networks are omitted.

    Raises:
        ValueError: a non-empty address or network value can't be parsed by
            the `ipaddress` module.
    """
    # `file` might be both file path and fileobj.
    ctx = ET.iterparse(file, events=('start', 'end'))  # FIXME(pylint3): Using deprecated method iterparse().
    _, root = next(ctx)  # get the root element
    raw = {'ip': set(), 'ipSubnet': set(), 'ipv6': set(), 'ipv6Subnet': set()}
    for event, el in ctx:
        if event != 'end' or el.tag != 'content':
            continue
        if _is_ip_blocked(el):
            for child in el:
                bucket = raw.get(child.tag)
                if bucket is None:
                    continue
                # Empty elements have `text` of None and real values may be padded
                # with whitespace; neither should abort parsing of the whole dump.
                text = (child.text or '').strip()
                if text:
                    bucket.add(text)
        root.clear()  # mem clean-up

    # Normalization is needed as the following are the real datapoints:
    # <ipv6>2600:9000:2077:2000:0009:dfa0:31c0:93a1</ipv6>
    # <ipv6Subnet ts="2018-11-08T17:00:00+03:00">2a02:4680:22::214/32</ipv6Subnet>
    # TODO: aggregation of networks MIGHT be a good idea, but it's not needed right now
    net4 = sorted({IPv4Network(text, strict=False) for text in raw['ipSubnet']})
    # <ip> contained in <ipSubnet> is a usual thing.
    ip4 = _without_covered({IPv4Address(text) for text in raw['ip']}, net4)

    net6 = sorted({IPv6Network(text, strict=False) for text in raw['ipv6Subnet']})
    ip6 = _without_covered({IPv6Address(text) for text in raw['ipv6']}, net6)

    return ip4, net4, ip6, net6
def make_blob(src):
    """Serialize the IP blocklist of the gzipped dump at `src` as compact JSON.

    The dump is parsed with parse_xml() and every address and network is
    rendered with str(). Returns the UTF-8 encoded JSON object with the keys
    'ip', 'ipSubnet', 'ipv6' and 'ipv6Subnet'.
    """
    with gzip.GzipFile(src) as dump:
        parsed = parse_xml(dump)
    keys = ('ip', 'ipSubnet', 'ipv6', 'ipv6Subnet')
    payload = {key: [str(item) for item in items]
               for key, items in zip(keys, parsed)}
    # NB: the blob must have consistent content hash, e.g. there should be no timestamps.
    serialized = json.dumps(payload, ensure_ascii=False, sort_keys=True, separators=(',', ':'))
    return serialized.encode('utf-8')
def main():
    """Convert the gzipped dump given as argv[1] into the gzipped JSON blob argv[2].

    The destination is written to a temporary file in the destination
    directory and hard-linked to its final name only when it is complete.
    Its modification time is copied from the source file.

    Raises:
        SystemExit: the number of command-line arguments is wrong.
        RuntimeError: the destination already exists.
    """
    if len(sys.argv) != 3:
        sys.exit('Usage: {} SRC.xml.gz DST'.format(sys.argv[0]))
    src, dst = sys.argv[1:]
    if os.path.exists(dst):
        raise RuntimeError('Already exists', dst)
    blob = make_blob(src)
    filename = sha1(blob).hexdigest().lower() + JSONZ_SUFFIX
    update_time = os.path.getmtime(src)
    destdir = os.path.dirname(dst)
    with tempfile.NamedTemporaryFile(dir=destdir) as raw:
        # The gzip stream has to be closed _before_ linking: the trailer is
        # written on close(), so linking earlier exposes a truncated file.
        with gzip.GzipFile(fileobj=raw, mode='wb', compresslevel=GZIP_LEVEL,
                           filename=filename, mtime=RKN_EPOCH) as out:
            out.write(blob)
        raw.flush()
        # Timestamps are set on the temporary name, so `dst` has the right
        # mtime from the very moment it becomes visible.
        os.utime(raw.name, (time.time(), update_time))
        # os.link() fails if `dst` was created since the check above.
        os.link(raw.name, dst)
# Run the conversion only when the file is executed as a script.
if __name__ == '__main__':
    main()