[DOP-23318] Avoid suppressing Hive Metastore errors #329

Merged (2 commits, Jan 21, 2025)
1 change: 1 addition & 0 deletions docs/changelog/next_release/329.bugfix.rst
@@ -0,0 +1 @@
+Avoid suppressing Hive Metastore errors while using ``DBWriter``. Previously this could lead to overriding an already existing table if the Hive Metastore was overloaded and responded with an exception.
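
To make the failure mode concrete, here is a minimal, hypothetical sketch of the pattern this PR removes (plain PySpark, not onETL's actual code; the SparkSession and table name are placeholders). Any exception raised during the existence check, including a Metastore overload or timeout, was interpreted as "table does not exist", so a later write in replace mode could drop and re-create a table that actually exists.

# Hypothetical, simplified illustration of the suppressed-error pattern this PR removes.
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

def table_exists(name: str) -> bool:
    try:
        # Touches the Hive Metastore; can fail for reasons other than the table
        # being missing (timeouts, overload, lack of permissions).
        _ = spark.table(name).schema
        return True
    except Exception:
        # BUG: every error is treated as "table does not exist", so the caller
        # may proceed to create or even replace an already existing table.
        return False

The diff below replaces this check with a lightweight SHOW TABLES IN <schema> LIKE '<table>' query whose errors are allowed to propagate to the caller.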
33 changes: 23 additions & 10 deletions onetl/connection/db_connection/hive/connection.py
@@ -296,17 +296,8 @@ def write_df_to_target(
     ) -> None:
         write_options = self.WriteOptions.parse(options)

-        try:
-            self.get_df_schema(target)
-            table_exists = True
-
-            log.info("|%s| Table %r already exists", self.__class__.__name__, target)
-        except Exception as e:
-            log.debug("|%s| Cannot get schema of table %r: %s", self.__class__.__name__, target, e)
-            table_exists = False
-
         # https://stackoverflow.com/a/72747050
-        if table_exists and write_options.if_exists != HiveTableExistBehavior.REPLACE_ENTIRE_TABLE:
+        if self._target_exist(target) and write_options.if_exists != HiveTableExistBehavior.REPLACE_ENTIRE_TABLE:
             if write_options.if_exists == HiveTableExistBehavior.ERROR:
                 raise ValueError("Operation stopped due to Hive.WriteOptions(if_exists='error')")
             elif write_options.if_exists == HiveTableExistBehavior.IGNORE:
@@ -465,6 +456,28 @@ def _sort_df_columns_like_table(self, table: str, df_columns: list[str]) -> list

         return sorted(df_columns, key=lambda column: table_columns_normalized.index(column.casefold()))

+    def _target_exist(self, name: str) -> bool:
+        from pyspark.sql.functions import col
+
+        log.info("|%s| Checking if table %r exists ...", self.__class__.__name__, name)
+
+        # Do not use SELECT * FROM table, because it may fail if users have no permissions,
+        # or Hive Metastore is overloaded.
+        # Also we ignore VIEWs as they are not insertable.
+        schema, table = name.split(".", maxsplit=1)
+        query = f"SHOW TABLES IN {schema} LIKE '{table}'"
+
+        log.debug("|%s| Executing SQL query:", self.__class__.__name__)
+        log_lines(log, query, level=logging.DEBUG)
+
+        df = self._execute_sql(query).where(col("tableName") == table)
+        result = df.count() > 0
+        if result:
+            log.info("|%s| Table %r exists.", self.__class__.__name__, name)
+        else:
+            log.info("|%s| Table %r does not exist.", self.__class__.__name__, name)
+        return result
+
     def _insert_into(
         self,
         df: DataFrame,
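
For completeness, a hedged usage sketch of how this change surfaces to DBWriter users (assumed onETL-style API; the cluster name, schema, table, SparkSession, and DataFrame are placeholders, and exact parameter names may differ between onETL versions):

# Assumed onETL-style usage; names and parameters are illustrative, not authoritative.
from onetl.connection import Hive
from onetl.db import DBWriter

hive = Hive(cluster="rnd-dwh", spark=spark)  # `spark`: an existing SparkSession with Hive support

writer = DBWriter(
    connection=hive,
    target="myschema.mytable",
    options=Hive.WriteOptions(if_exists="error"),  # stop if the target table already exists
)

# Before this PR: a Metastore error during the existence check was swallowed, the table was
# assumed to be missing, and the write could overwrite an existing table.
# After this PR: the error from the SHOW TABLES check propagates, so the write fails loudly
# instead of silently replacing data.
writer.run(df)  # `df`: the DataFrame to be written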