-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathjson2tsv
executable file
·115 lines (99 loc) · 2.9 KB
/
json2tsv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python2.7
"""
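Convert a stream of JSON objects (one per line, read from stdin) to TSV,
extracting the keys you specify on the command line into columns.
If you don't specify any keys, it tries to figure out the set of all keys
from the first 1000 records.
Pass -c to print a header row with the column names first.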
"""
import json as jsonmod
try:
exec "import ujson as jsonmod"
except ImportError:
pass
import sys,re,itertools
#all_json = jsonmod.load(sys.stdin)
#assert isinstance(all_json, list) and len(all_json)>0
#item1 = json[0]
#keys = items1.keys()
#keys.sort()
#json_iter = all_json
def safe_json_iter(raw_json_iter):
    """Yield the decoded value of every entry that parses as JSON.

    Args:
        raw_json_iter: iterable of JSON-encoded strings (e.g. the lines of
            a file object).

    Yields:
        The result of ``jsonmod.loads`` for each entry that decodes
        successfully.  Entries that fail to decode are skipped silently.
    """
    for raw in raw_json_iter:
        try:
            data = jsonmod.loads(raw)
        except Exception:
            # Deliberately best-effort: one malformed line must not abort
            # the whole conversion, so it is dropped and we move on.
            continue
        # The yield lives outside the try block so that an exception thrown
        # into the generator by its consumer is not swallowed by the handler
        # above.
        yield data
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = itertools.islice(iterable, n)
    return list(head)
def order_keys(keys, item_sample):
    """Return ``keys`` sorted into a heuristic column order.

    Keys are ranked first by name -- ``id``, then keys ending in ``_id``,
    then ``docid``, then everything else -- and within a rank by the median
    length of their text values in ``item_sample`` (shortest first).

    Args:
        keys: iterable of key names (strings).
        item_sample: iterable of decoded JSON records used to estimate the
            text length of each key.  Records that are not dicts, and keys
            a record does not contain, are ignored for the estimate.

    Returns:
        A new list holding the same keys in the order described above.
        Keys with equal scores keep the relative order in which ``keys``
        yields them (the sort is stable).
    """
    from collections import defaultdict

    # Python 2 decodes JSON strings to ``unicode``; fall back to ``str`` on
    # interpreters where that name does not exist.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    keys = list(keys)

    lengths = defaultdict(list)
    for item in item_sample:
        if not isinstance(item, dict):
            continue
        for key in keys:
            # .get() because ``keys`` may be the union over all records, so
            # any single record can lack some of them.
            value = item.get(key)
            if isinstance(value, text_type):
                lengths[key].append(len(value))

    def median(values):
        # Sort and index by the list's own length: each key has its own
        # number of text samples, which may be fewer than len(item_sample).
        ordered = sorted(values)
        return ordered[len(ordered) // 2]

    median_length = {key: median(values) for key, values in lengths.items()}

    # Name-based priorities, most important first; the last rule matches
    # everything so every key gets a rank.
    prios = (
        lambda k: k == 'id',
        lambda k: k.endswith('_id'),
        lambda k: k == 'docid',
        lambda k: True,
    )

    def score(key):
        prio = next(i for i, matches in enumerate(prios) if matches(key))
        return (prio, median_length.get(key, 0))

    return sorted(keys, key=score)
args = sys.argv[1:]

# "-c" anywhere on the command line requests a header row.  Its first
# occurrence is removed so that the remaining arguments are the column keys.
PRINT_HEADER = '-c' in args
if PRINT_HEADER:
    args.remove('-c')

# Only JSON objects can be turned into rows; other valid JSON values (arrays,
# numbers, strings, null) are skipped just like unparseable lines, instead of
# crashing on the .keys()/.get() calls made on each record.
json_iter = (record for record in safe_json_iter(sys.stdin)
             if isinstance(record, dict))

keys = args
if not keys:
    # No keys given: derive the columns from the first 1000 records, then
    # put those records back in front of the stream so they are still output.
    top = take(1000, json_iter)
    keys = set()
    for item in top:
        keys |= set(item.keys())
    keys = order_keys(keys, top)
    json_iter = itertools.chain(top, json_iter)
# Characters that would break the one-record-per-line, tab-separated layout.
BAD_WS = re.compile(r"[\r\n\t]")


def clean_cell(x):
    """Make a value safe to place in a single TSV cell.

    ``None`` becomes the empty string.  Text has every carriage return,
    newline and tab replaced by a space (``unicode`` is encoded to UTF-8
    first).  Any other value is returned unchanged.
    """
    if x is None:
        return ""
    if isinstance(x, str):
        text = x
    elif isinstance(x, unicode):
        text = x.encode('utf-8')
    else:
        # Not text: hand it back untouched.
        return x
    return BAD_WS.sub(" ", text)
def stringify(x):
    """Return ``x`` as a ``str``.

    A ``str`` is returned as is, a ``unicode`` object is encoded to UTF-8,
    and anything else is converted with ``str()``.
    """
    if isinstance(x, str):
        return x
    return x.encode('utf-8') if isinstance(x, unicode) else str(x)
def lookup(myjson, k):
    """Fetch the value for column key ``k`` from one decoded JSON record.

    Args:
        myjson: a decoded JSON record.  Anything that is not a dict has no
            keys, so the result is "".
        k: the column key.  If the record has ``k`` as a literal top-level
            key, that value is used.  Otherwise a key containing "." is
            treated as a path into nested objects, e.g. "user.name" reads
            ``myjson["user"]["name"]``.

    Returns:
        The value found, or "" when the key or any step of the path is
        missing or not an object.  A null at the end of a dotted path also
        gives ""; a null stored under a plain key is returned as ``None``.
        Falsy values such as 0 and False are returned as they are.
    """
    if not isinstance(myjson, dict):
        return ""
    # A literal key wins, so records whose top-level keys themselves contain
    # dots can still be addressed.
    if k in myjson or '.' not in k:
        return myjson.get(k, "")
    v = myjson
    for part in k.split('.'):
        if not isinstance(v, dict) or part not in v:
            return ""
        v = v[part]
    return "" if v is None else v
# Emit the optional header row, then one tab-separated line per record.
if PRINT_HEADER:
    # Header cells go through the same pipeline as data cells.  stringify()
    # handles both byte-string keys taken from argv and unicode keys found in
    # the JSON; calling .encode('utf-8') directly on a non-ASCII byte string
    # raises UnicodeDecodeError under Python 2.
    sys.stdout.write("\t".join([stringify(clean_cell(k)) for k in keys]) + "\n")
for record in json_iter:
    cells = [stringify(clean_cell(lookup(record, k))) for k in keys]
    sys.stdout.write("\t".join(cells) + "\n")