-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtab2parquet.py
executable file
·59 lines (49 loc) · 1.27 KB
/
tab2parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
# Sam Shepard - 2020
# See defaults and options: https://arrow.apache.org/docs/python/index.html
import sys

# Validate the command line first: a wrong invocation exits here, before the
# pyarrow imports below are ever executed.
if len(sys.argv) != 3:
    # Usage is diagnostic output, so it goes to stderr rather than stdout.
    # sys.exit() is used instead of the bare exit() helper, which is only
    # injected by the site module and is not guaranteed to exist.
    print(
        "Usage:\n\tpython " + sys.argv[0] + "\t<input.txt> <output.parquet>",
        file=sys.stderr,
    )
    sys.exit(1)

import pyarrow.parquet as pq
from pyarrow import csv

# Positional arguments: path of the tab-delimited input table, then the path
# of the Parquet file to create.
input_table = sys.argv[1]
output_parquet = sys.argv[2]
# Attribution: https://github.com/apache/arrow/blob/f7ef65e5fc367f1f5649dfcea0754e413fcca394/cpp/src/arrow/csv/options.cc#L28-L30
# Field spellings that are converted to NULL when the input table is read.
# Based on Arrow's default null spellings, with the Impala-style null string
# (backslash-N) added.
# "NA" has to stay out of this list if strings can be NULL; otherwise genuine
# "NA" string values would be turned into NULLs.
null_values = [
    "",
    "#N/A",
    "#N/A N/A",  # two tokens separated by a space, as in the upstream list
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "1.#IND",
    "1.#QNAN",
    "N/A",
    "NULL",
    "NaN",
    "n/a",
    "nan",
    "null",
    "\\N",  # Impala-style null marker: a literal backslash followed by N
]
# Load the tab-delimited input into an Arrow table. No header row is expected:
# column names are auto-generated by pyarrow instead of being read from the
# first line. Values listed in null_values become NULL, including in string
# columns.
from pyarrow import ArrowException

try:
    table = csv.read_csv(
        input_table,
        parse_options=csv.ParseOptions(delimiter="\t"),
        read_options=csv.ReadOptions(autogenerate_column_names=True),
        convert_options=csv.ConvertOptions(
            null_values=null_values, strings_can_be_null=True
        ),
    )
except (OSError, ArrowException) as err:
    # OSError (IOError is an alias of it) covers a missing or unreadable file.
    # ArrowException covers pyarrow's own failures, e.g. malformed rows, which
    # are not IOErrors and would otherwise escape as a raw traceback even
    # though the message below promises to report parse errors.
    print("Error reading/parsing: " + input_table, file=sys.stderr)
    print(err, file=sys.stderr)
    sys.exit(2)
# Serialize the Arrow table to the requested Parquet file.
from pyarrow import ArrowException

try:
    pq.write_table(table, output_parquet)
except (OSError, ArrowException) as err:
    # OSError covers filesystem problems (bad path, permissions, full disk);
    # ArrowException covers failures raised by pyarrow itself while encoding.
    # Report on stderr and keep the original exit status of 2.
    print("Error writing parquet: " + output_parquet, file=sys.stderr)
    print(err, file=sys.stderr)
    sys.exit(2)