-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathxml_to_utf8.py
executable file
·29 lines (24 loc) · 966 Bytes
/
xml_to_utf8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python
# NOTE: ASCII characters, including '<' (which breaks HTML files), are excluded by the `ord(char) > 127` check in main().
import xml.etree.ElementTree as ET
def hex_to_char(s):
    """Return the character for a hexadecimal code point identifier.

    Args:
        s: The code point written in hexadecimal, optionally preceded by a
            single non-hexadecimal marker character, e.g. ``"U0222B"`` or
            ``"0222B"``.

    Returns:
        The one-character string for that code point.

    Raises:
        ValueError: If the remaining text is not valid hexadecimal
            (including the empty string) or the value is not a valid
            code point for ``chr``.
    """
    hex_digits = "0123456789abcdefABCDEF"
    # Drop the leading marker only when it cannot be a hex digit, so that a
    # bare code point such as "1D400" keeps its most significant digit.
    digits = s if s[:1] in hex_digits else s[1:]
    return chr(int(digits, 16))
def main(path='unicode.xml'):
    """Print the non-ASCII math-mode characters listed in an XML file.

    For every ``character`` child of the document root whose ``mode``
    attribute is ``math``, whose ``id`` attribute is exactly six characters
    long, and which has both a ``latex`` and a ``description`` child, print
    the character, the LaTeX text and the description text on one line.
    Characters with a code point of 127 or below are not printed.

    Args:
        path: File name or file object passed to ``ET.parse``. Defaults to
            ``'unicode.xml'``, resolved against the current directory.
    """
    root = ET.parse(path).getroot()
    for child in root:
        # attrib.get() yields None for a missing attribute, so incomplete
        # entries are skipped without a blanket ``except KeyError`` that
        # could also hide unrelated lookup bugs.
        if child.tag != 'character' or child.attrib.get('mode') != 'math':
            continue

        unicode_id = child.attrib.get('id', '')
        if len(unicode_id) != 6:  # skip unicode ranges (longer ids)
            continue

        latex = child.find('latex')
        description = child.find('description')
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would wrongly reject it.
        if latex is None or description is None:
            continue

        char = hex_to_char(unicode_id)
        if ord(char) > 127:  # ASCII characters are left out of the output
            print(char, latex.text, description.text)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()