Commit 0167dcb

New parquet utils (#5)
* add several new utility functions
* normalize version
* fixing bugs from RCs
* 0.2.0
* add prefix param
* 0.2.0
1 parent 1576952 commit 0167dcb

File tree

7 files changed: +189 -7 lines changed


osc_ingest_trino/__init__.py

+13 -1

@@ -5,15 +5,27 @@
 """

 # defines the release version for this python package
-__version__ = "0.1.1"
+__version__ = "0.2.0"

 from .sqlcols import *
 from .sqltypes import *
+from .boto3_utils import *
+from .dotenv_utils import *
+from .trino_utils import *

 __all__ = [
     "sql_compliant_name",
     "enforce_sql_column_names",
+    "enforce_partition_column_order",
     "pandas_type_to_sql",
     "create_table_schema_pairs",
+    "attach_s3_bucket",
+    "upload_directory_to_s3",
+    "load_credentials_dotenv",
+    "attach_trino_engine",
+    "drop_unmanaged_table",
+    "drop_unmanaged_data",
+    "ingest_unmanaged_parquet",
+    "unmanaged_parquet_tabledef",
 ]

osc_ingest_trino/boto3_utils.py

+29

@@ -0,0 +1,29 @@
+import os
+import boto3
+
+__all__ = [
+    "upload_directory_to_s3",
+    "attach_s3_bucket",
+]
+
+def upload_directory_to_s3(path, bucket, prefix, verbose=False):
+    path = str(path)
+    prefix = str(prefix)
+    for subdir, dirs, files in os.walk(path):
+        for f in files:
+            pfx = subdir.replace(path, prefix)
+            src = os.path.join(subdir, f)
+            dst = os.path.join(pfx, f)
+            if verbose:
+                print(f'{src} --> {dst}')
+            bucket.upload_file(src, dst)
+
+def attach_s3_bucket(env_var_prefix):
+    s3 = boto3.resource(
+        service_name="s3",
+        endpoint_url=os.environ[f"{env_var_prefix}_ENDPOINT"],
+        aws_access_key_id=os.environ[f"{env_var_prefix}_ACCESS_KEY"],
+        aws_secret_access_key=os.environ[f"{env_var_prefix}_SECRET_KEY"],
+    )
+    return s3.Bucket(os.environ[f"{env_var_prefix}_BUCKET"])
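For orientation, a minimal usage sketch, assuming the bucket's credentials are exposed under an "S3" env-var prefix (the prefix and the local ./data path are illustrative, not part of this commit):

# Hedged usage sketch: assumes S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY,
# and S3_BUCKET are set in the environment; names here are illustrative.
from osc_ingest_trino import attach_s3_bucket, upload_directory_to_s3

bucket = attach_s3_bucket("S3")
# recursively mirror ./data/* into the bucket under ingest/data/*
upload_directory_to_s3("./data", bucket, "ingest/data", verbose=True)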

osc_ingest_trino/dotenv_utils.py

+16

@@ -0,0 +1,16 @@
+import os
+import pathlib
+from dotenv import load_dotenv
+
+__all__ = [
+    "load_credentials_dotenv",
+]
+
+def load_credentials_dotenv():
+    # Load some standard environment variables from a dot-env file, if it exists.
+    # If no such file can be found, does not fail, and so allows these environment vars to
+    # be populated in some other way
+    dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
+    dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
+    if os.path.exists(dotenv_path):
+        load_dotenv(dotenv_path=dotenv_path, override=True)
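A short sketch of the intended call pattern (the lookup order follows the code above; nothing beyond it is assumed):

# Hedged usage sketch: if $CREDENTIAL_DOTENV_DIR (or $PWD, or the
# /opt/app-root/src fallback) contains a credentials.env file, its
# KEY=VALUE pairs are loaded into os.environ; otherwise this is a no-op.
from osc_ingest_trino import load_credentials_dotenv

load_credentials_dotenv()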

osc_ingest_trino/sqlcols.py

+17

@@ -4,6 +4,7 @@
 __all__ = [
     "sql_compliant_name",
     "enforce_sql_column_names",
+    "enforce_partition_column_order",
 ]

 _wsdedup = re.compile(r"\s+")

@@ -44,3 +45,19 @@ def enforce_sql_column_names(df, inplace=False, maxlen=63):
     rename_map = dict(list(zip(icols, ocols)))
     return df.rename(columns=rename_map, inplace=inplace)

+def enforce_partition_column_order(df, pcols, inplace=False):
+    if not isinstance(df, pd.DataFrame):
+        raise ValueError("df must be a pandas DataFrame")
+    if not isinstance(pcols, list):
+        raise ValueError("pcols must be list of column names")
+    pcols = [str(e) for e in pcols]
+    cols = list(df.columns.values)
+    for c in pcols:
+        cols.remove(c)
+        cols.append(c)
+    if not inplace:
+        return df[cols]
+    for c in cols:
+        s = df[c]
+        df.drop(columns=[c], inplace=True)
+        df[c] = s
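A minimal sketch of the new helper's behavior (column names are illustrative): it moves each listed partition column to the end of the frame, which is the column order Hive-style partitioned parquet layouts expect.

# Hedged usage sketch
import pandas as pd
from osc_ingest_trino import enforce_partition_column_order

df = pd.DataFrame({"year": [2020], "value": [1.5], "name": ["a"]})
df = enforce_partition_column_order(df, ["year"])
print(list(df.columns))  # ['value', 'name', 'year']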

osc_ingest_trino/sqltypes.py

+2 -4

@@ -29,13 +29,11 @@ def pandas_type_to_sql(pt, typemap={}):
         return st
     raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))

-# add ability to specify optional dict for specific fields?
-# if column name is present, use specified value?
-def create_table_schema_pairs(df, typemap={}):
+def create_table_schema_pairs(df, typemap = {}, indent = 4):
     if not isinstance(df, pd.DataFrame):
         raise ValueError("df must be a pandas DataFrame")
     ptypes = [str(e) for e in df.dtypes.to_list()]
     stypes = [pandas_type_to_sql(e, typemap=typemap) for e in ptypes]
     pz = list(zip(df.columns.to_list(), stypes))
-    return ",\n".join(["    {n} {t}".format(n=e[0],t=e[1]) for e in pz])
+    return ",\n".join([f"{' '*indent}{e[0]} {e[1]}" for e in pz])
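To illustrate the new indent parameter, a hedged sketch of the output shape (the SQL types shown assume the package's default pandas-to-SQL typemap and may differ):

# Hedged usage sketch
import pandas as pd
from osc_ingest_trino import create_table_schema_pairs

df = pd.DataFrame({"year": [2020], "value": [1.5]})
print(create_table_schema_pairs(df))
# plausible output, given the default typemap and indent=4:
#     year bigint,
#     value double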

osc_ingest_trino/trino_utils.py

+110

@@ -0,0 +1,110 @@
+import os
+import shutil
+import uuid
+
+import trino
+import pandas as pd
+from sqlalchemy.engine import create_engine
+
+from .boto3_utils import upload_directory_to_s3
+from .sqltypes import create_table_schema_pairs
+
+__all__ = [
+    "attach_trino_engine",
+    "drop_unmanaged_table",
+    "drop_unmanaged_data",
+    "ingest_unmanaged_parquet",
+    "unmanaged_parquet_tabledef",
+]
+
+_default_prefix = 'trino/{schema}/{table}'
+
+def _remove_trailing_slash(s):
+    s = str(s)
+    if len(s) == 0: return s
+    if (s[-1] != '/'): return s
+    return _remove_trailing_slash(s[:-1])
+
+def _prefix(pfx, schema, table):
+    return _remove_trailing_slash(pfx).format(schema = schema, table = table)
+
+def attach_trino_engine(env_var_prefix = 'TRINO'):
+    sqlstring = 'trino://{user}@{host}:{port}/'.format(
+        user = os.environ[f'{env_var_prefix}_USER'],
+        host = os.environ[f'{env_var_prefix}_HOST'],
+        port = os.environ[f'{env_var_prefix}_PORT']
+    )
+    sqlargs = {
+        'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
+        'http_scheme': 'https'
+    }
+    engine = create_engine(sqlstring, connect_args = sqlargs)
+    connection = engine.connect()
+    return engine
+
+def drop_unmanaged_table(catalog, schema, table, engine, bucket, prefix=_default_prefix, verbose=False):
+    sql = f'drop table if exists {catalog}.{schema}.{table}'
+    qres = engine.execute(sql)
+    dres = bucket.objects \
+        .filter(Prefix = f'{_prefix(prefix, schema, table)}/') \
+        .delete()
+    if verbose:
+        print(dres)
+    return qres
+
+def drop_unmanaged_data(schema, table, bucket, prefix=_default_prefix, verbose=False):
+    dres = bucket.objects \
+        .filter(Prefix = f'{_prefix(prefix, schema, table)}/') \
+        .delete()
+    if verbose: print(dres)
+    return dres
+
+def ingest_unmanaged_parquet(df, schema, table, bucket, partition_columns=[], append=True, workdir='/tmp', prefix=_default_prefix, verbose=False):
+    if not isinstance(df, pd.DataFrame):
+        raise ValueError("df must be a pandas DataFrame")
+    if not isinstance(partition_columns, list):
+        raise ValueError("partition_columns must be list of column names")
+
+    s3pfx = _prefix(prefix, schema, table)
+
+    if not append:
+        dres = bucket.objects.filter(Prefix = f'{s3pfx}/').delete()
+        if verbose: print(dres)
+
+    if len(partition_columns) > 0:
+        # tell pandas to write a directory tree, using partitions
+        tmp = f'{workdir}/{table}'
+        # pandas does not clean out destination directory for you:
+        shutil.rmtree(tmp, ignore_errors=True)
+        df.to_parquet(tmp,
+                      partition_cols=partition_columns,
+                      index=False)
+        # upload the tree onto S3
+        upload_directory_to_s3(tmp, bucket, s3pfx, verbose=verbose)
+    else:
+        # do not use partitions: a single parquet file is created
+        parquet = f'{uuid.uuid4().hex}.parquet'
+        tmp = f'{workdir}/{parquet}'
+        df.to_parquet(tmp, index=False)
+        dst = f'{s3pfx}/{parquet}'
+        if verbose: print(f'{tmp} --> {dst}')
+        bucket.upload_file(tmp, dst)
+
+def unmanaged_parquet_tabledef(df, catalog, schema, table, bucket, partition_columns = [], verbose = False):
+    if not isinstance(df, pd.DataFrame):
+        raise ValueError("df must be a pandas DataFrame")
+    if not isinstance(partition_columns, list):
+        raise ValueError("partition_columns must be list of column names")
+
+    columnschema = create_table_schema_pairs(df)
+
+    tabledef = f"create table if not exists {catalog}.{schema}.{table} (\n"
+    tabledef += f"{columnschema}\n"
+    tabledef += ") with (\n    format = 'parquet',\n"
+    if len(partition_columns) > 0:
+        tabledef += f"    partitioned_by = array{partition_columns},\n"
+    tabledef += f"    external_location = 's3a://{bucket.name}/trino/{schema}/{table}/'\n)"
+
+    if verbose: print(tabledef)
+    return tabledef
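For orientation, a hedged end-to-end sketch composing the new utilities (the catalog, schema, and table names are illustrative; it assumes TRINO_* and S3_* credentials are already set, e.g. via load_credentials_dotenv):

# Hedged end-to-end sketch: write a DataFrame as unmanaged parquet on S3,
# then register it as an external table in Trino.
import pandas as pd
from osc_ingest_trino import (
    attach_s3_bucket, attach_trino_engine, enforce_partition_column_order,
    ingest_unmanaged_parquet, unmanaged_parquet_tabledef,
)

df = pd.DataFrame({"year": [2020, 2021], "value": [1.5, 2.5]})
df = enforce_partition_column_order(df, ["year"])

bucket = attach_s3_bucket("S3")        # S3_* env vars
engine = attach_trino_engine("TRINO")  # TRINO_* env vars

ingest_unmanaged_parquet(df, "demo_schema", "demo_table", bucket,
                         partition_columns=["year"], verbose=True)
sql = unmanaged_parquet_tabledef(df, "hive", "demo_schema", "demo_table",
                                 bucket, partition_columns=["year"])
engine.execute(sql)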

setup.py

+2 -2

@@ -10,7 +10,7 @@

 setup(
     name = "osc-ingest-tools",
-    version = "0.1.1",
+    version = "0.2.0",
     description = "python tools to assist with standardized data ingestion workflows for the OS-Climate project",
     long_description = README,
     long_description_content_type = "text/markdown",

@@ -26,7 +26,7 @@
     ],
     packages = find_packages(),
     include_package_data = True,
-    install_requires = ["pandas"],
+    install_requires = ["pandas", "trino", "boto3", "sqlalchemy", "sqlalchemy-trino", "python-dotenv"],
     entry_points = {
         "console_scripts": []
     },

0 commit comments
