-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautxmlhandler.py
83 lines (71 loc) · 2.64 KB
/
autxmlhandler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from pymarc.marcxml import XmlHandler
from pymarc import Field, unicodedata
from autrecord import AutRecord
# Namespace URI that element names are compared against when the handler
# runs in strict mode.
MARC_XML_NS = 'http://www.loc.gov/MARC21/slim'

# Records whose running number is <= SKIPPED are ignored by the handler.
# 0 means nothing is skipped; a previous run used 990000.
SKIPPED = 0

# While records are being skipped, the running count is printed each time
# it is a multiple of this value.
SKIPPED_MODULO = 10000
class AutXmlHandler(XmlHandler):
    """SAX content handler that assembles ``AutRecord`` objects from MARCXML.

    Every ``record`` start tag increments ``count``.  Records whose running
    number is less than or equal to the module-level ``SKIPPED`` are ignored;
    while skipping, the count is printed each time it is a multiple of
    ``SKIPPED_MODULO``.  Every later record is built element by element and,
    when its closing tag arrives, handed to
    ``self.process_record(record, count, mydata)``.

    ``process_record`` is called with three arguments here, so a subclass is
    expected to provide an implementation with that signature.
    NOTE(review): the inherited pymarc implementation appears to accept only
    the record -- confirm against the installed pymarc version.

    When ``process_record`` returns a truthy value, ``mydata`` is replaced
    by a fresh empty list; otherwise the same list is passed along with the
    next record.

    Attributes:
        count: Number of ``record`` start tags seen so far.
        mydata: List handed to ``process_record`` with every record
            (presumably an accumulator for batching -- verify against the
            subclass that consumes it).
    """

    # Class-level fallbacks, kept so that a subclass overriding ``__init__``
    # without chaining up still finds both attributes.
    count = 0
    mydata = []

    def __init__(self, *args, **kwargs):
        """Initialise the base handler and per-instance state.

        All arguments are forwarded unchanged to ``XmlHandler.__init__``.
        """
        XmlHandler.__init__(self, *args, **kwargs)
        self.count = 0
        # A list per instance: the class-level list would otherwise be
        # shared by every handler until the first reset.
        self.mydata = []

    def _skipping(self):
        """Return True while the current record number is within SKIPPED."""
        return self.count <= SKIPPED

    def _current_text(self):
        """Return the buffered character data, normalised if configured."""
        text = u''.join(self._text)
        if self.normalize_form is not None:
            text = unicodedata.normalize(self.normalize_form, text)
        return text

    def startElementNS(self, name, qname, attrs):
        """Handle an opening tag.

        Args:
            name: ``(namespace_uri, local_name)`` tuple of the element.
            qname: Qualified element name (unused).
            attrs: Attribute collection of the element, keyed by
                ``(namespace_uri, local_name)``.
        """
        if self._strict and name[0] != MARC_XML_NS:
            return

        element = name[1]
        # Always reset the buffer, even while skipping, so character data
        # of skipped records does not pile up.
        self._text = []

        if element == 'record':
            self.count += 1
            if self._skipping():
                if self.count % SKIPPED_MODULO == 0:
                    print(self.count)
                return
            self._record = AutRecord()
            return

        if self._skipping():
            # Children of a skipped record are never added to anything, so
            # there is no point in building Field objects for them.
            return

        if element == 'controlfield':
            tag = attrs.getValue((None, 'tag'))
            self._field = Field(tag)
        elif element == 'datafield':
            tag = attrs.getValue((None, 'tag'))
            ind1 = attrs.get((None, 'ind1'), ' ')
            ind2 = attrs.get((None, 'ind2'), ' ')
            self._field = Field(tag, [ind1, ind2])
        elif element == 'subfield':
            self._subfield_code = attrs[(None, 'code')]

    def endElementNS(self, name, qname):
        """Handle a closing tag.

        Completes the field or record that the element represents.  A
        closing ``record`` tag passes the finished record, the running
        count and ``mydata`` to ``process_record``.

        Args:
            name: ``(namespace_uri, local_name)`` tuple of the element.
            qname: Qualified element name (unused).
        """
        if self._strict and name[0] != MARC_XML_NS:
            return
        if self._skipping():
            # Progress is reported once per record in startElementNS.
            return

        element = name[1]
        if element == 'record':
            if self.process_record(self._record, self.count, self.mydata):
                # Rebind instead of clearing in place, so a list that
                # process_record kept a reference to stays intact.
                self.mydata = []
            self._record = None
        elif element == 'leader':
            self._record.leader = self._current_text()
        elif element == 'controlfield':
            self._field.data = self._current_text()
            self._record.add_field(self._field)
            self._field = None
        elif element == 'datafield':
            self._record.add_field(self._field)
            self._field = None
        elif element == 'subfield':
            self._field.add_subfield(self._subfield_code, self._current_text())
            self._subfield_code = None

        self._text = []