-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDOI_B2d.py
165 lines (155 loc) · 8.25 KB
/
DOI_B2d.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 3 09:15:57 2016
@author: Alice FREMAND
"""
_DATACITE_NS = '{http://datacite.org/schema/kernel-3}'


def _ascii_xmlref(text):
    """Return *text* as ASCII, non-ASCII characters becoming XML character references.

    ``None`` (an element that carries no text) is mapped to an empty string.
    """
    if text is None:
        return ''
    return text.encode('ascii', 'xmlcharrefreplace').decode('ascii')


def _element_children(parent):
    """Return the element children of *parent*, skipping comments and processing instructions."""
    # lxml also yields comments/PIs when iterating an element; their ``tag``
    # is a factory function rather than a string.
    return [child for child in parent if isinstance(child.tag, str)]


def _optional_children(root, name):
    """Return the element children of the optional container *name*, or ``[]`` if it is absent."""
    container = root.find(_DATACITE_NS + name)
    return [] if container is None else _element_children(container)


def _first_child_text(root, name, default=''):
    """Return the ASCII-safe text of the first child of container *name*, or *default*."""
    children = _optional_children(root, name)
    return _ascii_xmlref(children[0].text) if children else default


def xml2B2d(filePath):
    """Parse a DataCite kernel-3 XML record into a flat dictionary.

    Parameters
    ----------
    filePath : bytes
        The XML document itself. Despite the parameter name, the value is
        wrapped in ``io.BytesIO`` and parsed, so it must be the raw bytes of
        the record, not a path.

    Returns
    -------
    dict
        Always contains ``doi``, ``authorCounter``, ``title``, ``subtitle``,
        ``publisher``, ``year``, ``language``, ``datatype``, ``datatypeText``,
        ``url``, ``size``, ``format1``, ``version``, ``license`` and
        ``abstract``. Repeated items are stored under numbered keys
        (``authorName0``, ``orcid0``, ``affiliation0``, ``subject0``,
        ``contributorName0``, ``relationType0``, ...) together with a
        ``...Counter`` entry; the subject, contributor and relation entries
        and the geographic keys are only present when the corresponding
        container element exists in the record. Free-text values are ASCII
        with non-ASCII characters replaced by XML character references.

    Raises
    ------
    AttributeError, TypeError, IndexError
        If one of the elements this function treats as mandatory
        (``identifier``, ``creators``/``creatorName``, ``titles``,
        ``publisher``, ``publicationYear``) is missing.
    ValueError
        If a geoLocationPoint/geoLocationBox does not hold exactly 2/4
        whitespace-separated values.
    """
    import io
    try:
        from lxml import etree
    except ImportError:
        # Only parse/getroot/find/get/text are used, all of which the
        # standard-library implementation provides as well.
        from xml.etree import ElementTree as etree

    ns = _DATACITE_NS
    root = etree.parse(io.BytesIO(filePath)).getroot()
    data = {}

    data['doi'] = root.find(ns + 'identifier').text

    # ---- creators ------------------------------------------------------
    creators = _element_children(root.find(ns + 'creators'))
    data['authorCounter'] = len(creators)
    for i, creator in enumerate(creators):
        data['authorName%d' % i] = _ascii_xmlref(creator.find(ns + 'creatorName').text)
        orcid = creator.find(ns + 'nameIdentifier')
        data['orcid%s' % i] = orcid.text if orcid is not None else ''
        affiliation = creator.find(ns + 'affiliation')
        data['affiliation%s' % i] = (
            _ascii_xmlref(affiliation.text) if affiliation is not None else ''
        )

    # ---- titles: the first is the title, an optional second the subtitle
    titles = _element_children(root.find(ns + 'titles'))
    data['title'] = _ascii_xmlref(titles[0].text)
    data['subtitle'] = _ascii_xmlref(titles[1].text) if len(titles) > 1 else ''

    data['publisher'] = _ascii_xmlref(root.find(ns + 'publisher').text)
    data['year'] = root.find(ns + 'publicationYear').text

    # ---- subjects (keys only written when the container exists) --------
    subjects_xml = root.find(ns + 'subjects')
    if subjects_xml is not None:
        subjects = _element_children(subjects_xml)
        data['subjectCounter'] = len(subjects)
        for i, subject in enumerate(subjects):
            data['subject%s' % i] = _ascii_xmlref(subject.text)

    # ---- contributors (optional sub-fields only written when present) --
    contributors_xml = root.find(ns + 'contributors')
    if contributors_xml is not None:
        contributors = _element_children(contributors_xml)
        data['contributorCounter'] = len(contributors)
        for i, contributor in enumerate(contributors):
            data['contributorName%s' % i] = _ascii_xmlref(
                contributor.find(ns + 'contributorName').text
            )
            orcid = contributor.find(ns + 'nameIdentifier')
            if orcid is not None:
                data['contributorOrcid%s' % i] = _ascii_xmlref(orcid.text)
            affiliation = contributor.find(ns + 'affiliation')
            if affiliation is not None:
                data['contributorAffiliation%s' % i] = _ascii_xmlref(affiliation.text)
            contributor_type = contributor.get('contributorType')
            if contributor_type is not None:
                data['contributorType%d' % i] = contributor_type

    # ---- simple optional properties ------------------------------------
    language_xml = root.find(ns + 'language')
    data['language'] = language_xml.text if language_xml is not None else ''

    # Both keys default to '' so consumers can rely on them being present.
    datatype_xml = root.find(ns + 'resourceType')
    if datatype_xml is not None:
        data['datatypeText'] = _ascii_xmlref(datatype_xml.text)
        data['datatype'] = datatype_xml.get('resourceTypeGeneral', '')
    else:
        data['datatypeText'] = ''
        data['datatype'] = ''

    alternate_ids = _optional_children(root, 'alternateIdentifiers')
    data['url'] = alternate_ids[0].text if alternate_ids else ''

    data['size'] = _first_child_text(root, 'sizes')
    data['format1'] = _first_child_text(root, 'formats')

    version_xml = root.find(ns + 'version')
    data['version'] = _ascii_xmlref(version_xml.text) if version_xml is not None else ''

    rights = _optional_children(root, 'rightsList')
    # The literal string 'None' (not the None object) is the historical default.
    data['license'] = rights[0].text if rights else 'None'

    data['abstract'] = _first_child_text(root, 'descriptions')

    # ---- geographic information ----------------------------------------
    # Every geoLocation overwrites the same keys, so the last one wins.
    # NOTE(review): the first token of a point is stored as 'longitude' and a
    # box is read as east/north/west/south -- confirm this order against the
    # producer of these records and the DataCite kernel-3 specification.
    for location in _optional_children(root, 'geoLocations'):
        point = location.find(ns + 'geoLocationPoint')
        if point is not None:
            data['longitude'], data['latitude'] = point.text.split()
        else:
            data['longitude'] = ''
            data['latitude'] = ''

        box = location.find(ns + 'geoLocationBox')
        if box is not None:
            data['east'], data['north'], data['west'], data['south'] = box.text.split()
        else:
            data['east'] = ''
            data['north'] = ''
            data['west'] = ''
            data['south'] = ''

        place = location.find(ns + 'geoLocationPlace')
        if place is not None:
            data['geoPlace'] = _ascii_xmlref(place.text)
        else:
            # Kept as integer 0 (not '') because existing callers may test for it.
            data['geoPlace'] = 0

    # ---- related identifiers -------------------------------------------
    relations_xml = root.find(ns + 'relatedIdentifiers')
    if relations_xml is not None:
        relations = _element_children(relations_xml)
        data['relationCounter'] = len(relations)
        for i, relation in enumerate(relations):
            data['relationType%s' % i] = relation.attrib['relationType']
            data['relatedID%s' % i] = _ascii_xmlref(relation.attrib['relatedIdentifierType'])
            data['relationID%s' % i] = _ascii_xmlref(relation.text)

    return data