Skip to content

Commit df7a3f3

Browse files
expose typemap, add colmap (#8)
1 parent 0472f18 commit df7a3f3

File tree

4 files changed

+26 -13 lines changed

osc_ingest_trino/__init__.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
"""
66

77
# defines the release version for this python package
8-
__version__ = "0.2.1snap1"
8+
__version__ = "0.2.1"
99

1010
from .sqlcols import *
1111
from .sqltypes import *

osc_ingest_trino/sqltypes.py

+7 -7
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
'Int64': 'bigint',
1818
'bool': 'boolean',
1919
'category': 'varchar',
20-
'datetime64[ns, UTC]': 'timestamp(6)',
20+
'datetime64[ns, UTC]': 'timestamp',
2121
}
2222

2323
def pandas_type_to_sql(pt, typemap={}):
@@ -29,11 +29,11 @@ def pandas_type_to_sql(pt, typemap={}):
2929
return st
3030
raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))
3131

32-
def create_table_schema_pairs(df, typemap = {}, indent = 4):
32+
def create_table_schema_pairs(df, typemap = {}, colmap = {}, indent = 4):
3333
if not isinstance(df, pd.DataFrame):
3434
raise ValueError("df must be a pandas DataFrame")
35-
ptypes = [str(e) for e in df.dtypes.to_list()]
36-
stypes = [pandas_type_to_sql(e, typemap=typemap) for e in ptypes]
37-
pz = list(zip(df.columns.to_list(), stypes))
38-
return ",\n".join([f"{' '*indent}{e[0]} {e[1]}" for e in pz])
39-
35+
if not isinstance(colmap, dict):
36+
raise ValueError("colmap must be a dict")
37+
columns = df.columns.to_list()
38+
types = [colmap.get(col, pandas_type_to_sql(str(df[col].dtype), typemap=typemap)) for col in columns]
39+
return ",\n".join([f"{' '*indent}{e[0]} {e[1]}" for e in zip(columns, types)])

osc_ingest_trino/trino_utils.py

+17 -4
Original file line number | Diff line number | Diff line change
@@ -28,16 +28,26 @@ def _remove_trailing_slash(s):
2828
def _prefix(pfx, schema, table):
2929
return _remove_trailing_slash(pfx).format(schema = schema, table = table)
3030

31-
def attach_trino_engine(env_var_prefix = 'TRINO'):
32-
sqlstring = 'trino://{user}@{host}:{port}/'.format(
31+
def attach_trino_engine(env_var_prefix = 'TRINO', catalog = None, schema = None, verbose = False):
32+
sqlstring = 'trino://{user}@{host}:{port}'.format(
3333
user = os.environ[f'{env_var_prefix}_USER'],
3434
host = os.environ[f'{env_var_prefix}_HOST'],
3535
port = os.environ[f'{env_var_prefix}_PORT']
3636
)
37+
if catalog is not None:
38+
sqlstring += f'/{catalog}'
39+
if schema is not None:
40+
if catalog is None:
41+
raise ValueError(f'connection schema specified without a catalog')
42+
sqlstring += f'/{schema}'
43+
3744
sqlargs = {
3845
'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
3946
'http_scheme': 'https'
4047
}
48+
49+
if verbose: print(f'using connect string: {sqlstring}')
50+
4151
engine = create_engine(sqlstring, connect_args = sqlargs)
4252
connection = engine.connect()
4353
return engine
@@ -90,13 +100,16 @@ def ingest_unmanaged_parquet(df, schema, table, bucket, partition_columns=[], ap
90100
if verbose: print(f'{tmp} --> {dst}')
91101
bucket.upload_file(tmp, dst)
92102

93-
def unmanaged_parquet_tabledef(df, catalog, schema, table, bucket, partition_columns = [], verbose = False):
103+
def unmanaged_parquet_tabledef(df, catalog, schema, table, bucket,
104+
partition_columns = [],
105+
typemap = {}, colmap = {},
106+
verbose = False):
94107
if not isinstance(df, pd.DataFrame):
95108
raise ValueError("df must be a pandas DataFrame")
96109
if not isinstance(partition_columns, list):
97110
raise ValueError("partition_columns must be list of column names")
98111

99-
columnschema = create_table_schema_pairs(df)
112+
columnschema = create_table_schema_pairs(df, typemap=typemap, colmap=colmap)
100113

101114
tabledef = f"create table if not exists {catalog}.{schema}.{table} (\n"
102115
tabledef += f"{columnschema}\n"

setup.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
setup(
1212
name = "osc-ingest-tools",
13-
version = "0.2.1snap1",
13+
version = "0.2.1",
1414
description = "python tools to assist with standardized data ingestion workflows for the OS-Climate project",
1515
long_description = README,
1616
long_description_content_type = "text/markdown",

0 commit comments

Comments
 (0)