
Commit e4417a9

Committed Apr 24, 2024
fixed fmt
1 parent 25bd2d4 commit e4417a9

File tree: 4 files changed (+100 -94 lines)

src/databricks/labs/dqx/dq_engine.py

+29 -26
```diff
@@ -1,15 +1,16 @@
 import functools as ft
 import itertools
 import re
+from collections.abc import Callable
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any
 
 import pyspark.sql.functions as F
 from pyspark.sql import Column, DataFrame
 
 from databricks.labs.dqx import dq_functions
-from databricks.labs.dqx.dq_functions import *  # noqa: F403
+from databricks.labs.dqx.dq_functions import *  # noqa: F403 # pylint: disable=wildcard-import,unused-wildcard-import
 from databricks.labs.dqx.utils import get_column_name
 
 
@@ -47,7 +48,7 @@ def rule_criticality(self) -> str:
         :return: string describing criticality - `warn` or `error`. Raises exception if it's something else
         """
         criticality = self.criticality
-        if criticality != Criticality.WARN.value and criticality != Criticality.ERROR.value:
+        if criticality not in (Criticality.WARN.value, Criticality.ERROR.value):
             criticality = Criticality.ERROR.value
 
         return criticality
```
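The import hunk above swaps the deprecated `typing` aliases for their modern equivalents: `Callable` moves to `collections.abc`, and the PEP 585 builtin generics (`list`, `dict`, `tuple`) replace `List`, `Dict`, and `Tuple`, with `X | None` unions standing in for implicit optionals elsewhere in the commit. The `rule_criticality` change is the idiomatic membership test for the same chained comparison. A minimal sketch of the annotation style this commit adopts (the helper itself is hypothetical):

```python
from collections.abc import Callable

# Before: from typing import Callable, Dict, List
# After (Python 3.9+/3.10+): builtin generics and | unions
def select_handlers(handlers: dict[str, Callable], names: list[str] | None = None) -> list[Callable]:
    """Hypothetical helper illustrating PEP 585/604 annotations."""
    return [handlers[name] for name in (names or handlers)]
```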
```diff
@@ -79,13 +80,13 @@ class DQRuleColSet:
     * `check_func_kwargs` - keyword /named arguments for the check function after the col_name
     """
 
-    columns: List[str]
+    columns: list[str]
     check_func: Callable
     criticality: str = "error"
-    check_func_args: List[Any] = field(default_factory=list)
-    check_func_kwargs: Dict[str, Any] = field(default_factory=dict)
+    check_func_args: list[Any] = field(default_factory=list)
+    check_func_kwargs: dict[str, Any] = field(default_factory=dict)
 
-    def get_rules(self) -> List[DQRule]:
+    def get_rules(self) -> list[DQRule]:
         """Build a list of rules for a set of columns.
 
         :return: list of dq rules
@@ -99,7 +100,7 @@ def get_rules(self) -> List[DQRule]:
         ]
 
 
-def _perform_checks(df: DataFrame, checks: List[DQRule]) -> DataFrame:
+def _perform_checks(df: DataFrame, checks: list[DQRule]) -> DataFrame:
     """Applies a list of checks to a given dataframe and append results at the end of the dataframe.
 
     :param df: dataframe to check
@@ -110,7 +111,7 @@ def _perform_checks(df: DataFrame, checks: List[DQRule]) -> DataFrame:
     return df.select("*", *checks_cols)
 
 
-def _make_null_filter(cols: List[str]) -> Column:
+def _make_null_filter(cols: list[str]) -> Column:
     """Creates a filter condition that check if all specified columns are null.
 
     :param cols: names of the columns to check
@@ -122,14 +123,14 @@ def _make_null_filter(cols: List[str]) -> Column:
     def update_nullability_func(func, col):
         return func & F.col(col).isNull()
 
-    f1 = F.col(cols[0]).isNull()
-    return ft.reduce(update_nullability_func, cols[1:], f1)
+    initial = F.col(cols[0]).isNull()
+    return ft.reduce(update_nullability_func, cols[1:], initial)
 
 
 remove_criticality_re = re.compile("^(.*)_(error|warn)$")
 
 
-def _with_checks_as_map(df: DataFrame, dest_col: str, cols: List[str]) -> DataFrame:
+def _with_checks_as_map(df: DataFrame, dest_col: str, cols: list[str]) -> DataFrame:
     """Collect individual check columns into corresponding map<string, string> errors or warnings columns.
 
     :param df: dataframe with added check columns of type map<string, string>
```
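The rename of `f1` to `initial` makes the fold in `_make_null_filter` read naturally: `ft.reduce` seeds the accumulator with the first column's `isNull()` condition and ANDs in one condition per remaining column. A standalone sketch of the same pattern (column names are illustrative):

```python
import functools as ft

import pyspark.sql.functions as F

cols = ["city", "state", "zip"]  # illustrative column names

# Seed with the first column's null check, then AND in the rest, yielding:
# F.col("city").isNull() & F.col("state").isNull() & F.col("zip").isNull()
initial = F.col(cols[0]).isNull()
all_null = ft.reduce(lambda acc, col: acc & F.col(col).isNull(), cols[1:], initial)
```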
```diff
@@ -168,7 +169,7 @@ def _with_checks_as_map(df: DataFrame, dest_col: str, cols: List[str]) -> DataFr
     return ndf
 
 
-def _get_check_columns(checks: List[DQRule], criticality: str) -> List[str]:
+def _get_check_columns(checks: list[DQRule], criticality: str) -> list[str]:
     """Get check columns based on criticality.
 
     :param checks: list of checks to apply to the dataframe
@@ -191,7 +192,7 @@ def _append_empty_checks(df: DataFrame) -> DataFrame:
     )
 
 
-def apply_checks(df: DataFrame, checks: List[DQRule]) -> DataFrame:
+def apply_checks(df: DataFrame, checks: list[DQRule]) -> DataFrame:
     """Applies data quality checks to a given dataframe.
 
     :param df: dataframe to check
@@ -214,7 +215,7 @@ def apply_checks(df: DataFrame, checks: List[DQRule]) -> DataFrame:
     return checked_df_map
 
 
-def apply_checks_and_split(df: DataFrame, checks: List[DQRule]) -> Tuple[DataFrame, DataFrame]:
+def apply_checks_and_split(df: DataFrame, checks: list[DQRule]) -> tuple[DataFrame, DataFrame]:
     """Applies data quality checks to a given dataframe and split it into two ("good" and "bad"),
     according to the data quality checks.
 
@@ -234,7 +235,7 @@ def apply_checks_and_split(df: DataFrame, checks: List[DQRule]) -> Tuple[DataFra
     return good_df, bad_df
 
 
-def build_checks_by_metadata(checks: List[dict], glbs=None) -> List[DQRule]:
+def build_checks_by_metadata(checks: list[dict], glbs: dict[str, Any] | None = None) -> list[DQRule]:
     """Build checks based on check specification, i.e. function name plus arguments.
 
     :param checks: list of dictionaries describing checks. Each check is a dictionary consisting of following fields:
@@ -248,25 +249,25 @@ def build_checks_by_metadata(checks: List[dict], glbs=None) -> List[DQRule]:
     :return: list of data quality check rules
     """
     dq_rule_checks = []
-    for ch in checks:
-        check = ch.get("check")
+    for check_def in checks:
+        check = check_def.get("check")
         if not check:
-            raise Exception(f"'check' block should be provided in the check: {ch}")
+            raise ValueError(f"'check' block should be provided in the check: {check}")
 
         func_name = check.get("function")
         if not func_name:
-            raise Exception(f"'function' argument should be provided in the check: {ch}")
+            raise ValueError(f"'function' argument should be provided in the check: {check}")
 
         if glbs:
             func = glbs.get(func_name)
         else:
             func = getattr(dq_functions, func_name)
 
         if not func or not callable(func):
-            raise Exception(f"function {func_name} is not defined")
+            raise ValueError(f"function {func_name} is not defined")
 
         func_args = check.get("arguments", {})
-        criticality = ch.get("criticality", "error")
+        criticality = check_def.get("criticality", "error")
 
         if "col_names" in func_args:
             dq_rule_checks += DQRuleColSet(
@@ -277,14 +278,16 @@ def build_checks_by_metadata(checks: List[dict], glbs=None) -> List[DQRule]:
                 check_func_kwargs={k: func_args[k] for k in func_args.keys() - {"col_names"}},
             ).get_rules()
         else:
-            name = ch.get("name", None)
+            name = check_def.get("name", None)
             check_func = func(**func_args)
             dq_rule_checks.append(DQRule(check=check_func, name=name, criticality=criticality))
 
     return dq_rule_checks
 
 
-def apply_checks_by_metadata_and_split(df: DataFrame, checks: List[dict], glbs=None) -> Tuple[DataFrame, DataFrame]:
+def apply_checks_by_metadata_and_split(
+    df: DataFrame, checks: list[dict], glbs: dict[str, Any] | None = None
+) -> tuple[DataFrame, DataFrame]:
     """Wrapper around `apply_checks_and_split` for use in the metadata-driven pipelines. The main difference
     is how the checks are specified - instead of using functions directly, they are described as function name plus
     arguments.
@@ -308,7 +311,7 @@ def apply_checks_by_metadata_and_split(df: DataFrame, checks: List[dict], glbs=N
     return good_df, bad_df
 
 
-def apply_checks_by_metadata(df: DataFrame, checks: List[dict], glbs=None) -> DataFrame:
+def apply_checks_by_metadata(df: DataFrame, checks: list[dict], glbs: dict[str, Any] | None = None) -> DataFrame:
     """Wrapper around `apply_checks` for use in the metadata-driven pipelines. The main difference
     is how the checks are specified - instead of using functions directly, they are described as function name plus
     arguments.
@@ -329,7 +332,7 @@ def apply_checks_by_metadata(df: DataFrame, checks: List[dict], glbs=None) -> Da
     return apply_checks(df, dq_rule_checks)
 
 
-def build_checks(*rules_col_set: DQRuleColSet) -> List[DQRule]:
+def build_checks(*rules_col_set: DQRuleColSet) -> list[DQRule]:
     """
     Build rules from dq rules and rule sets.
```

src/databricks/labs/dqx/dq_functions.py

+24 -23
```diff
@@ -33,10 +33,10 @@ def col_is_not_null_and_not_empty(col_name: str, trim_strings: bool = False) ->
     :param trim_strings: boolean flag to trim spaces from strings
     :return: Column object for condition
     """
-    cl = F.col(col_name)
+    column = F.col(col_name)
     if trim_strings:
-        cl = F.trim(cl).alias(col_name)
-    condition = cl.isNull() | (cl == "") | (cl == "null")
+        column = F.trim(column).alias(col_name)
+    condition = column.isNull() | (column == "") | (column == "null")
     return make_condition_col(condition, f"Column {col_name} is null or empty", f"{col_name}_is_null_or_empty")
 
 
@@ -46,8 +46,8 @@ def col_is_not_empty(col_name: str) -> Column:
     :param col_name: column name to check
     :return: Column object for condition
     """
-    cl = F.col(col_name)
-    return make_condition_col((cl == ""), f"Column {col_name} is empty", f"{col_name}_is_empty")
+    column = F.col(col_name)
+    return make_condition_col((column == ""), f"Column {col_name} is empty", f"{col_name}_is_empty")
 
 
 def col_is_not_null(col_name: str) -> Column:
@@ -56,8 +56,8 @@ def col_is_not_null(col_name: str) -> Column:
     :param col_name: column name to check
     :return: Column object for condition
     """
-    cl = F.col(col_name)
-    return make_condition_col(cl.isNull(), f"Column {col_name} is null", f"{col_name}_is_null")
+    column = F.col(col_name)
+    return make_condition_col(column.isNull(), f"Column {col_name} is null", f"{col_name}_is_null")
 
 
 def col_value_is_not_null_and_is_in_list(col_name: str, allowed: list) -> Column:
@@ -68,14 +68,14 @@ def col_value_is_not_null_and_is_in_list(col_name: str, allowed: list) -> Column
     :return: Column object for condition
     """
     allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed]
-    cl = F.col(col_name)
-    condition = cl.isNull() | ~cl.isin(*allowed_cols)
+    column = F.col(col_name)
+    condition = column.isNull() | ~column.isin(*allowed_cols)
     return make_condition_col(
         condition,
         F.concat_ws(
             "",
             F.lit("Value "),
-            F.when(cl.isNull(), F.lit("null")).otherwise(cl),
+            F.when(column.isNull(), F.lit("null")).otherwise(column),
             F.lit(" is not in the allowed list: ["),
             F.concat_ws(", ", *allowed_cols),
             F.lit("]"),
@@ -92,14 +92,14 @@ def col_value_is_in_list(col_name: str, allowed: list) -> Column:
     :return: Column object for condition
     """
     allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed]
-    cl = F.col(col_name)
-    condition = ~cl.isin(*allowed_cols)
+    column = F.col(col_name)
+    condition = ~column.isin(*allowed_cols)
     return make_condition_col(
         condition,
         F.concat_ws(
             "",
             F.lit("Value "),
-            F.when(cl.isNull(), F.lit("null")).otherwise(cl),
+            F.when(column.isNull(), F.lit("null")).otherwise(column),
             F.lit(" is not in the allowed list: ["),
             F.concat_ws(", ", *allowed_cols),
             F.lit("]"),
@@ -137,7 +137,7 @@ def col_sql_expression(
     return make_condition_col(expr_col, F.concat_ws("", F.lit(f"Value matches expression: {expression_msg}")), name)
 
 
-def is_col_older_than_col2_for_N_days(col_name1: str, col_name2: str, days: int) -> Column:
+def is_col_older_than_col2_for_N_days(col_name1: str, col_name2: str, days: int) -> Column:  # pylint: disable=invalid-name
     """Creates a condition column for case when one date or timestamp column is older than another column by N days.
 
     :param col_name1: first column
@@ -163,7 +163,7 @@ def is_col_older_than_col2_for_N_days(col_name1: str, col_name2: str, days: int)
     )
 
 
-def is_col_older_than_N_days(col_name: str, days: int, curr_date: Column | None = None) -> Column:
+def is_col_older_than_N_days(col_name: str, days: int, curr_date: Column | None = None) -> Column:  # pylint: disable=invalid-name
     """Creates a condition column for case when specified date or timestamp column is older (compared to current date)
     than N days.
 
@@ -222,6 +222,7 @@ def col_not_in_near_future(col_name: str, offset: int = 0, curr_timestamp: Colum
 
     :param col_name: column name
     :param offset: offset (in seconds) to add to the current timestamp at time of execution
+    :param curr_timestamp: (optional) set current timestamp
     :return: new Column
     """
     if curr_timestamp is None:
@@ -246,11 +247,11 @@ def col_not_in_near_future(col_name: str, offset: int = 0, curr_timestamp: Colum
     )
 
 
-def col_not_less_than(col_name: str, limit) -> Column:
+def col_not_less_than(col_name: str, limit: int) -> Column:
     """Creates a condition column that checks if a value is less than specified limit.
 
     :param col_name: column name
-    :param limit: limit
+    :param limit: limit to use in the condition
     :return: new Column
     """
     condition = F.col(col_name) < limit
@@ -262,11 +263,11 @@ def col_not_less_than(col_name: str, limit) -> Column:
     )
 
 
-def col_not_greater_than(col_name: str, limit) -> Column:
+def col_not_greater_than(col_name: str, limit: int) -> Column:
     """Creates a condition column that checks if a value is greater than specified limit.
 
     :param col_name: column name
-    :param limit: limit
+    :param limit: limit to use in the condition
     :return: new Column
     """
     condition = F.col(col_name) > limit
@@ -278,12 +279,12 @@ def col_not_greater_than(col_name: str, limit) -> Column:
     )
 
 
-def col_is_in_range(col_name: str, min_limit, max_limit) -> Column:
+def col_is_in_range(col_name: str, min_limit: int, max_limit: int) -> Column:
     """Creates a condition column that checks if a value is smaller than min limit or greater than max limit.
 
     :param col_name: column name
     :param min_limit: min limit
-    :param min_limit: max limit
+    :param max_limit: max limit
     :return: new Column
     """
     condition = (F.col(col_name) < min_limit) | (F.col(col_name) > max_limit)
@@ -304,12 +305,12 @@ def col_is_in_range(col_name: str, min_limit, max_limit) -> Column:
     )
 
 
-def col_is_not_in_range(col_name: str, min_limit, max_limit) -> Column:
+def col_is_not_in_range(col_name: str, min_limit: int, max_limit: int) -> Column:
     """Creates a condition column that checks if a value is within min and max limits.
 
     :param col_name: column name
     :param min_limit: min limit
-    :param min_limit: max limit
+    :param max_limit: max limit
    :return: new Column
     """
     condition = (F.col(col_name) > min_limit) & (F.col(col_name) < max_limit)
```
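Each of these builders returns a `Column` assembled by `make_condition_col(condition, message, name)`, so the `cl` to `column` rename and the new `limit`/`min_limit`/`max_limit` annotations change nothing for callers. A hedged example of attaching checks to a dataframe (`df` and the column names are illustrative):

```python
from databricks.labs.dqx.dq_functions import col_is_in_range, col_is_not_null_and_not_empty

# df is assumed to be an existing Spark DataFrame; each call appends one
# condition column built from the check's failure condition and message
df_checked = df.select(
    "*",
    col_is_not_null_and_not_empty("city", trim_strings=True),
    col_is_in_range("age", min_limit=0, max_limit=120),
)
```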

src/databricks/labs/dqx/profiler/dq_generator.py

+28 -28
```diff
@@ -4,75 +4,75 @@
 from databricks.labs.dqx.profiler.profiler import DQRule
 
 
-def dq_generate_is_in(cl: str, level: str = "error", **params: dict):
+def dq_generate_is_in(col_name: str, level: str = "error", **params: dict):
     return {
-        "check": {"function": "col_value_is_in_list", "arguments": {"col_name": cl, "allowed": params["in"]}},
-        "name": f"{cl}_other_value",
+        "check": {"function": "col_value_is_in_list", "arguments": {"col_name": col_name, "allowed": params["in"]}},
+        "name": f"{col_name}_other_value",
         "criticality": level,
     }
 
 
 # TODO: rewrite it
-def dq_generate_min_max(cl: str, level: str = "error", **params: dict):
-    mn = params.get("min")
-    mx = params.get("max")
+def dq_generate_min_max(col_name: str, level: str = "error", **params: dict):
+    min = params.get("min")
+    max = params.get("max")
 
-    if mn is not None and mx is not None:
+    if min is not None and max is not None:
         return {
             "check": {
                 "function": "col_is_in_range",
                 "arguments": {
-                    "col_name": cl,
-                    "min_limit": val_maybe_to_str(mn, include_sql_quotes=False),
-                    "max_limit": val_maybe_to_str(mx, include_sql_quotes=False),
+                    "col_name": col_name,
+                    "min_limit": val_maybe_to_str(min, include_sql_quotes=False),
+                    "max_limit": val_maybe_to_str(max, include_sql_quotes=False),
                 },
             },
-            "name": f"{cl}_isnt_in_range",
+            "name": f"{col_name}_isnt_in_range",
             "criticality": level,
         }
-    elif mx is not None:
+    elif max is not None:
         return {
             "check": {
                 "function": "col_not_greater_than",
                 "arguments": {
-                    "col_name": cl,
-                    "val": val_maybe_to_str(mx, include_sql_quotes=False),
+                    "col_name": col_name,
+                    "val": val_maybe_to_str(max, include_sql_quotes=False),
                 },
             },
-            "name": f"{cl}_not_greater_than",
+            "name": f"{col_name}_not_greater_than",
             "criticality": level,
         }
-    elif mn is not None:
+    elif min is not None:
         return {
             "check": {
                 "function": "col_not_less_than",
                 "arguments": {
-                    "col_name": cl,
-                    "val": val_maybe_to_str(mn, include_sql_quotes=False),
+                    "col_name": col_name,
+                    "val": val_maybe_to_str(min, include_sql_quotes=False),
                 },
             },
-            "name": f"{cl}_not_less_than",
+            "name": f"{col_name}_not_less_than",
             "criticality": level,
         }
 
     return None
 
 
-def dq_generate_is_not_null(cl: str, level: str = "error", **params: dict):
+def dq_generate_is_not_null(col_name: str, level: str = "error", **params: dict):
     return {
-        "check": {"function": "col_is_not_null", "arguments": {"col_name": cl}},
-        "name": f"{cl}_is_null",
+        "check": {"function": "col_is_not_null", "arguments": {"col_name": col_name}},
+        "name": f"{col_name}_is_null",
         "criticality": level,
     }
 
 
-def dq_generate_is_not_null_or_empty(cl: str, level: str = "error", **params: dict):
+def dq_generate_is_not_null_or_empty(col_name: str, level: str = "error", **params: dict):
     return {
         "check": {
             "function": "col_is_not_null_and_not_empty",
-            "arguments": {"col_name": cl, "trim_strings": params.get("trim_strings", True)},
+            "arguments": {"col_name": col_name, "trim_strings": params.get("trim_strings", True)},
         },
-        "name": f"{cl}_is_null_or_empty",
+        "name": f"{col_name}_is_null_or_empty",
         "criticality": level,
     }
 
@@ -91,12 +91,12 @@ def generate_dq_rules(rules: Optional[list[DQRule]] = None, level: str = "error"
     dq_rules = []
     for rule in rules:
         nm = rule.name
-        cl = rule.column
+        col_name = rule.column
         params = rule.parameters or {}
         if nm not in dq_mapping:
-            print(f"No rule '{nm}' for column '{cl}'. skipping...")
+            print(f"No rule '{nm}' for column '{col_name}'. skipping...")
             continue
-        expr = dq_mapping[nm](cl, level, **params)
+        expr = dq_mapping[nm](col_name, level, **params)
         if expr:
             dq_rules.append(expr)
```
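After the `cl` to `col_name` and `mn`/`mx` to `min`/`max` renames, the generators still emit plain metadata dictionaries that `build_checks_by_metadata` can consume. For a profiled column with both bounds, `dq_generate_min_max` returns roughly the following (column name and limits are illustrative; limits pass through `val_maybe_to_str(..., include_sql_quotes=False)`):

```python
{
    "check": {
        "function": "col_is_in_range",
        "arguments": {"col_name": "age", "min_limit": "0", "max_limit": "120"},
    },
    "name": "age_isnt_in_range",
    "criticality": "error",
}
```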

src/databricks/labs/dqx/profiler/profiler.py

+19 -17
```diff
@@ -119,14 +119,14 @@ def extract_min_max(dst: DataFrame, nm: str, typ, metrics, opts: dict[str, Any]
     mx = None
 
     outlier_cols = opts.get("outlier_columns", [])
-    cl = dst.columns[0]
+    column = dst.columns[0]
     if opts.get("remove_outliers", True) and (len(outlier_cols) == 0 or nm in outlier_cols):  # detect outliers
         if typ == T.DateType():
-            dst = dst.select(F.col(cl).cast("timestamp").cast("bigint").alias(cl))
+            dst = dst.select(F.col(column).cast("timestamp").cast("bigint").alias(column))
         elif typ == T.TimestampType():
-            dst = dst.select(F.col(cl).cast("bigint").alias(cl))
+            dst = dst.select(F.col(column).cast("bigint").alias(column))
         # TODO: do summary instead? to get percentiles, etc.?
-        mn_mx = dst.agg(F.min(cl), F.max(cl), F.mean(cl), F.stddev(cl)).collect()
+        mn_mx = dst.agg(F.min(column), F.max(column), F.mean(column), F.stddev(column)).collect()
         if mn_mx and len(mn_mx) > 0:
             metrics["min"] = mn_mx[0][0]
             metrics["max"] = mn_mx[0][1]
@@ -175,7 +175,7 @@ def extract_min_max(dst: DataFrame, nm: str, typ, metrics, opts: dict[str, Any]
         else:
             print(f"Can't get min/max for field {nm}")
     else:
-        mn_mx = dst.agg(F.min(cl), F.max(cl)).collect()
+        mn_mx = dst.agg(F.min(column), F.max(column)).collect()
         if mn_mx and len(mn_mx) > 0:
             metrics["min"] = mn_mx[0][0]
             metrics["max"] = mn_mx[0][1]
@@ -239,14 +239,14 @@ def profile_dataframe(
 
     # TODO: think, how we can do it in fewer passes. Maybe only for specific things, like, min_max, etc.
     for field in get_columns_or_fields(df_cols):
-        nm = field.name
+        field_name = field.name
         typ = field.dataType
-        if nm not in summary_stats:
-            summary_stats[nm] = {}
-        metrics = summary_stats[nm]
+        if field_name not in summary_stats:
+            summary_stats[field_name] = {}
+        metrics = summary_stats[field_name]
 
         # calculate metrics
-        dst = df.select(nm).dropna()
+        dst = df.select(field_name).dropna()
         if typ == T.StringType() and trim_strings:
             cl = dst.columns[0]
             dst = dst.select(F.trim(F.col(cl)).alias(cl))
@@ -263,30 +263,32 @@ def profile_dataframe(
             dq_rules.append(
                 DQRule(
                     name="is_not_null",
-                    column=nm,
-                    description=f"Column {nm} has {null_percentage * 100:.1f}% of null values "
+                    column=field_name,
+                    description=f"Column {field_name} has {null_percentage * 100:.1f}% of null values "
                     f"(allowed {max_nulls * 100:.1f}%)",
                 )
             )
         else:
-            dq_rules.append(DQRule(name="is_not_null", column=nm))
+            dq_rules.append(DQRule(name="is_not_null", column=field_name))
 
         if type_supports_distinct(typ):
             dst2 = dst.dropDuplicates()
             cnt = dst2.count()
             if 0 < cnt < total_count * opts["distinct_ratio"] and cnt < opts["max_in_count"]:
-                dq_rules.append(DQRule(name="is_in", column=nm, parameters={"in": [row[0] for row in dst2.collect()]}))
+                dq_rules.append(
+                    DQRule(name="is_in", column=field_name, parameters={"in": [row[0] for row in dst2.collect()]})
+                )
 
         if typ == T.StringType():
-            dst2 = dst.filter(F.col(nm) == "")
+            dst2 = dst.filter(F.col(field_name) == "")
             cnt = dst2.count()
             if cnt <= (metrics["count"] * opts.get("max_empty_ratio", 0)):
                 dq_rules.append(
-                    DQRule(name="is_not_null_or_empty", column=nm, parameters={"trim_strings": trim_strings})
+                    DQRule(name="is_not_null_or_empty", column=field_name, parameters={"trim_strings": trim_strings})
                 )
 
         if metrics["count_non_null"] > 0 and type_supports_min_max(typ):
-            rule = extract_min_max(dst, nm, typ, metrics, opts)
+            rule = extract_min_max(dst, field_name, typ, metrics, opts)
             if rule:
                 dq_rules.append(rule)
```
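The `nm`/`cl` to `field_name` renames in `profile_dataframe` leave the emitted rules unchanged; for a low-cardinality string column the profiler still produces rules along the lines of (values illustrative):

```python
DQRule(name="is_in", column="state", parameters={"in": ["CA", "NY", "WA"]})
DQRule(name="is_not_null_or_empty", column="state", parameters={"trim_strings": True})
```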
