-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv_from_json_dump.py
122 lines (101 loc) · 3.93 KB
/
csv_from_json_dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python
# -*- coding: utf-8 -*-
# def load_with_bigjson():
# import bigjson
#
# with open('dump.json', 'rb') as f:
# j = bigjson.load(f)
# for line in j:
# print(line['type'])
# print(line['id'])
def header(properties, time_properties, wf_writer):
    """Write the header row of the CSV export.

    The first column is always the record id. Every entry of
    ``properties`` contributes one column; an entry that is also listed in
    ``time_properties`` is followed by an extra ``<property>-prec`` column.
    Column keys without a known label produce an empty header cell.

    Args:
        properties: Property ids (e.g. ``'P1'``) in output column order.
        time_properties: Property ids that get a trailing precision column.
        wf_writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
    """
    # Human-readable column label for every column key that can be exported.
    labels = {
        'qid': 'AUTORITY-QID',
        'P1': 'NKČR AUT name',
        'P2': 'birth-NKCR',
        'P2-prec': 'birth-NKCR-precision',
        'P3': 'death-NKCR',
        'P3-prec': 'death-NKCR-precision',
        'P4': 'NKCR-AUT',
        'P5': 'place-of-birth-NKCR',
        'P6': 'place-of-death-NKCR',
        'P7': 'place-of-birth-WD',
        'P8': 'place-of-death-WD',
        'P9': 'QID',
        'P10': 'note-NKCR',
        'P13': 'first-name',
        'P14': 'surname',
        'P15': 'birth-from-note',
        'P15-prec': 'birth-from-note-precision',
        'P16': 'death-from-note',
        'P16-prec': 'death-from-note-precision',
        'P17': 'type-of-record',
    }

    column_keys = ['qid']
    for key in properties:
        column_keys.append(key)
        if key in time_properties:
            # Time columns are always paired with their precision column.
            column_keys.append(key + '-prec')

    wf_writer.writerow([labels.get(key, '') for key in column_keys])
def _claim_cells(claims, prop, is_time):
    """Return the CSV cell(s) for one property of one record.

    The result always has the number of cells the header reserves for the
    property: two (value, precision) when ``is_time`` is true, otherwise
    one. Keeping that count fixed is what keeps every row aligned with the
    header, whatever shape the stored value has.

    Args:
        claims: The record's ``claims`` mapping (a lazy bigjson object).
        prop: Property id to export, e.g. ``'P2'``.
        is_time: Whether the header has a precision column for ``prop``.
    """
    blank = ['', ''] if is_time else ['']
    if prop not in claims:
        return blank
    try:
        value = claims[prop][0]['mainsnak']['datavalue']['value']
    except (KeyError, IndexError):
        # A claim can exist without a datavalue (Wikibase "novalue" /
        # "somevalue" snaks) or with an empty statement list; export it as
        # blank instead of aborting the whole dump.
        return blank

    # bigjson returns containers as lazy objects exposing to_python();
    # scalars (strings, numbers) are already plain Python values.
    to_python = getattr(value, 'to_python', None)
    plain = to_python() if to_python is not None else value

    if isinstance(plain, dict) and 'time' in plain:
        cells = [plain['time'], plain.get('precision', '')]
        # A time-shaped value on a non-time property still gets one cell.
        return cells if is_time else cells[:1]
    return [plain, ''] if is_time else [plain]


def prepare_json_dump(input_path='exp_short.json', output_path='new_nkcr.csv'):
    """Export selected claims of every non-property record to a CSV file.

    Streams the JSON dump at ``input_path`` with bigjson, skips records
    whose ``type`` is ``'property'`` and writes one row per remaining
    record: its id followed by the first value of each exported property
    (time properties are followed by their precision).

    Args:
        input_path: Path of the JSON dump to read.
        output_path: Path of the CSV file to create (overwritten).
    """
    import bigjson
    import csv

    properties = ['P1', 'P2', 'P3', 'P15', 'P16', 'P13', 'P14']
    time_properties = ['P2', 'P3', 'P15', 'P16']

    with open(input_path, 'rb') as f:
        print('start')
        data = bigjson.load(f)
        print('loaded json')
        # Explicit UTF-8 keeps non-ASCII names writable regardless of the
        # locale; newline='' is what the csv module asks for so that it
        # alone controls line endings.
        with open(output_path, mode='w', encoding='utf-8', newline='') as wf:
            wf_writer = csv.writer(wf, dialect='unix')
            header(properties, time_properties, wf_writer)
            for count, line in enumerate(data, start=1):
                if count % 1000 == 0:
                    print(count)
                if line['type'] == 'property':
                    continue
                qid = line['id']
                # Resolve the lazy lookup once instead of once per property.
                claims = line['claims']
                row = [qid]
                for prop in properties:
                    row.extend(
                        _claim_cells(claims, prop, prop in time_properties))
                try:
                    wf_writer.writerow(row)
                except UnicodeEncodeError as e:
                    # Still possible for unencodable text (e.g. lone
                    # surrogates); report the record and carry on.
                    print(e)
                    print('unicode encode error u polozky: ' + qid)
# Script entry point. There is no __main__ guard, so the export also runs
# whenever this module is imported.
prepare_json_dump()
# Disabled alternative entry point; test_big_json is not defined in the
# visible part of this file.
# test_big_json()