Commit 5b06bd0

Merge branch 'main' into dependencies_update
2 parents: 91aa7d1 + 3ca34dc · commit 5b06bd0

22 files changed: +591, -147 lines

.github/scripts/setup_spark_remote.sh

+61
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+set -xve
+echo "Setting up spark-connect"
+
+mkdir -p "$HOME"/spark
+cd "$HOME"/spark || exit 1
+
+version=$(wget -O - https://dlcdn.apache.org/spark/ | grep 'href="spark' | grep -v 'preview' | sed 's:</a>:\n:g' | sed -n 's/.*>//p' | tr -d spark- | tr -d / | sort -r --version-sort | head -1)
+if [ -z "$version" ]; then
+  echo "Failed to extract Spark version"
+  exit 1
+fi
+
+spark=spark-${version}-bin-hadoop3
+spark_connect="spark-connect_2.12"
+
+mkdir -p "${spark}"
+
+
+SERVER_SCRIPT=$HOME/spark/${spark}/sbin/start-connect-server.sh
+
+## check the spark version already exist, if not download the respective version
+if [ -f "${SERVER_SCRIPT}" ];then
+  echo "Spark Version already exists"
+else
+  if [ -f "${spark}.tgz" ];then
+    echo "${spark}.tgz already exists"
+  else
+    wget "https://dlcdn.apache.org/spark/spark-${version}/${spark}.tgz"
+  fi
+  tar -xvf "${spark}.tgz"
+fi
+
+cd "${spark}" || exit 1
+## check spark remote is running,if not start the spark remote
+result=$(${SERVER_SCRIPT} --packages org.apache.spark:${spark_connect}:"${version}" > "$HOME"/spark/log.out; echo $?)
+
+if [ "$result" -ne 0 ]; then
+  count=$(tail "${HOME}"/spark/log.out | grep -c "SparkConnectServer running as process")
+  if [ "${count}" == "0" ]; then
+    echo "Failed to start the server"
+    exit 1
+  fi
+  # Wait for the server to start by pinging localhost:4040
+  echo "Waiting for the server to start..."
+  for i in {1..30}; do
+    if nc -z localhost 4040; then
+      echo "Server is up and running"
+      break
+    fi
+    echo "Server not yet available, retrying in 5 seconds..."
+    sleep 5
+  done
+
+  if ! nc -z localhost 4040; then
+    echo "Failed to start the server within the expected time"
+    exit 1
+  fi
+fi
+echo "Started the Server"

.github/workflows/acceptance.yml

-10
@@ -42,11 +42,6 @@ jobs:
       - name: Install hatch
         run: pip install hatch==1.9.4
 
-      - name: Fetch relevant branches
-        run: |
-          git fetch origin $GITHUB_BASE_REF:$GITHUB_BASE_REF
-          git fetch origin $GITHUB_HEAD_REF:$GITHUB_HEAD_REF
-
       - name: Run integration tests
         uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.2
         with:
@@ -82,11 +77,6 @@ jobs:
       - name: Install hatch
         run: pip install hatch==1.9.4
 
-      - name: Fetch relevant branches
-        run: |
-          git fetch origin $GITHUB_BASE_REF:$GITHUB_BASE_REF
-          git fetch origin $GITHUB_HEAD_REF:$GITHUB_HEAD_REF
-
       - name: Run integration tests on serverless cluster
         uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.2
         with:

.github/workflows/push.yml

+6-1
@@ -35,10 +35,15 @@ jobs:
           cache-dependency-path: '**/pyproject.toml'
           python-version: ${{ matrix.pyVersion }}
 
+      - name: Setup Spark Remote
+        run: |
+          pip install hatch==1.9.4
+          make setup_spark_remote
+
       - name: Run unit tests
         run: |
           pip install hatch==1.9.4
-          make test
+          make ci-test
 
       - name: Publish test coverage
         uses: codecov/codecov-action@v5

Makefile

+6-1
@@ -17,12 +17,17 @@ lint:
 fmt:
 	hatch run fmt
 
-test:
+ci-test:
 	hatch run test
 
 integration:
 	hatch run integration
 
+setup_spark_remote:
+	.github/scripts/setup_spark_remote.sh
+
+test: setup_spark_remote ci-test
+
 coverage:
 	hatch run coverage && open htmlcov/index.html
 

demos/dqx_demo_library.py

+53-7
@@ -59,7 +59,7 @@
 print(dlt_expectations)
 
 # save generated checks in a workspace file
-user_name = spark.sql('select current_user() as user').collect()[0]['user']
+user_name = spark.sql("select current_user() as user").collect()[0]["user"]
 checks_file = f"/Workspace/Users/{user_name}/dqx_demo_checks.yml"
 dq_engine = DQEngine(ws)
 dq_engine.save_checks_in_workspace_file(checks, workspace_path=checks_file)
@@ -142,6 +142,13 @@
     arguments:
       col_name: col3
 
+- criticality: error
+  filter: col1 < 3
+  check:
+    function: is_not_null_and_not_empty
+    arguments:
+      col_name: col4
+
 - criticality: warn
   check:
     function: value_is_in_list
@@ -186,12 +193,17 @@
         criticality="error",
         check_func=is_not_null).get_rules() + [
     DQRule( # define rule for a single column
-        name='col3_is_null_or_empty',
-        criticality='error',
-        check=is_not_null_and_not_empty('col3')),
+        name="col3_is_null_or_empty",
+        criticality="error",
+        check=is_not_null_and_not_empty("col3")),
+    DQRule( # define rule with a filter
+        name="col_4_is_null_or_empty",
+        criticality="error",
+        filter="col1 < 3",
+        check=is_not_null_and_not_empty("col4")),
     DQRule( # name auto-generated if not provided
-        criticality='warn',
-        check=value_is_in_list('col4', ['1', '2']))
+        criticality="warn",
+        check=value_is_in_list("col4", ["1", "2"]))
     ]
 
 schema = "col1: int, col2: int, col3: int, col4 int"
@@ -344,4 +356,38 @@ def ends_with_foo(col_name: str) -> Column:
 dq_engine = DQEngine(WorkspaceClient())
 
 valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks, globals())
-display(valid_and_quarantined_df)
+display(valid_and_quarantined_df)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Applying custom column names
+
+# COMMAND ----------
+
+from databricks.sdk import WorkspaceClient
+from databricks.labs.dqx.engine import (
+    DQEngine,
+    ExtraParams,
+    DQRule
+)
+
+from databricks.labs.dqx.col_functions import is_not_null_and_not_empty
+
+# using ExtraParams class to configure the engine with custom column names
+extra_parameters = ExtraParams(column_names={"errors": "dq_errors", "warnings": "dq_warnings"})
+
+ws = WorkspaceClient()
+dq_engine = DQEngine(ws, extra_params=extra_parameters)
+
+schema = "col1: string"
+input_df = spark.createDataFrame([["str1"], ["foo"], ["str3"]], schema)
+
+checks = [ DQRule(
+        name="col_1_is_null_or_empty",
+        criticality="error",
+        check=is_not_null_and_not_empty("col1")),
+    ]
+
+valid_and_quarantined_df = dq_engine.apply_checks(input_df, checks)
+display(valid_and_quarantined_df)
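
The filtered check added to the demo YAML can also be exercised through the metadata API the demo already uses. A small sketch, with the sample schema and rows assumed for illustration:

```python
# Sketch: apply the filtered metadata check from the demo to a tiny DataFrame.
# The schema and rows below are illustrative assumptions, not part of the demo.
import yaml
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

checks = yaml.safe_load("""
- criticality: error
  filter: col1 < 3
  check:
    function: is_not_null_and_not_empty
    arguments:
      col_name: col4
""")

dq_engine = DQEngine(WorkspaceClient())
input_df = spark.createDataFrame([[1, None], [5, None]], "col1: int, col4: string")

# valid rows and quarantined rows are returned separately
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
```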

demos/dqx_demo_tool.py

+2-2
@@ -45,7 +45,7 @@
 import glob
 import os
 
-user_name = spark.sql('select current_user() as user').collect()[0]['user']
+user_name = spark.sql("select current_user() as user").collect()[0]["user"]
 dqx_wheel_files = glob.glob(f"/Workspace/Users/{user_name}/.dqx/wheels/databricks_labs_dqx-*.whl")
 dqx_latest_wheel = max(dqx_wheel_files, key=os.path.getctime)
 %pip install {dqx_latest_wheel}
@@ -210,7 +210,7 @@
 # COMMAND ----------
 
 print(f"Saving quarantined data to {run_config.quarantine_table}")
-quarantine_catalog, quarantine_schema, _ = run_config.quarantine_table.split('.')
+quarantine_catalog, quarantine_schema, _ = run_config.quarantine_table.split(".")
 
 spark.sql(f"CREATE CATALOG IF NOT EXISTS {quarantine_catalog}")
 spark.sql(f"CREATE SCHEMA IF NOT EXISTS {quarantine_catalog}.{quarantine_schema}")

docs/dqx/docs/dev/contributing.mdx

+8-3
@@ -86,13 +86,18 @@ Before every commit, apply the consistent formatting of the code, as we want our
 make fmt
 ```
 
-Before every commit, run automated bug detector (`make lint`) and unit tests (`make test`) to ensure that automated
-pull request checks do pass, before your code is reviewed by others:
+Before every commit, run the automated bug detector and unit tests to ensure that automated
+pull request checks pass before your code is reviewed by others:
 ```shell
 make lint
+make setup_spark_remote
 make test
 ```
 
+The command `make setup_spark_remote` sets up the environment for running unit tests and is required one time only.
+DQX uses Databricks Connect as a test dependency, which restricts the creation of a Spark session in local mode.
+To enable local Spark execution for unit testing, the command installs a local Spark Connect (Spark remote) server.
+
 ### Local setup for integration tests and code coverage
 
 Note that integration tests and code coverage are run automatically when you create a Pull Request in Github.
@@ -215,7 +220,7 @@ Here are the example steps to submit your first contribution:
 7. `make fmt`
 8. `make lint`
 9. .. fix if any
-10. `make test` and `make integration`, optionally `make coverage` to get test coverage report
+10. `make setup_spark_remote`, `make test` and `make integration`, optionally `make coverage` to get test coverage report
 11. .. fix if any issues
 12. `git commit -S -a -m "message"`
 
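
As an illustration of the workflow above, a conftest.py-style fixture could hand unit tests a Spark session backed by the locally started Spark Connect server. This is a sketch only; the fixture name and port are assumptions, not part of the repository:

```python
# Sketch of a session-scoped pytest fixture for local unit tests.
# Assumes `make setup_spark_remote` has already started the Spark Connect
# server on localhost; port 15002 is the Spark Connect default (assumed).
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    session = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    yield session
    session.stop()
```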

docs/dqx/docs/guide.mdx

+50-14
@@ -55,7 +55,7 @@ dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="Python_D
 print(dlt_expectations)
 ```
 
-### Using CLI 
+### Using CLI
 
 You can optionally install DQX in the workspace, see the [Installation Guide](/docs/installation#dqx-installation-in-a-databricks-workspace).
 As part of the installation, a config, dashboards and profiler workflow is installed. The workflow can be run manually in the workspace UI or using the CLI as below.
@@ -116,7 +116,7 @@ print(status)
 ```
 
 Note that checks are validated automatically when applied as part of the
-`apply_checks_by_metadata_and_split` and `apply_checks_by_metadata` methods 
+`apply_checks_by_metadata_and_split` and `apply_checks_by_metadata` methods
 (see [Quality rules defined as config](#quality-rules-defined-as-config)).
 
 ### Using CLI
@@ -178,7 +178,7 @@ checks = dq_engine.load_checks_from_installation(assume_user=True, run_config_na
 
 input_df = spark.read.table("catalog1.schema1.table1")
 
-# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes 
+# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
 valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
 
 # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
@@ -198,7 +198,7 @@ checks = dq_engine.load_checks_from_workspace_file(workspace_path="/Shared/App1/
 
 input_df = spark.read.table("catalog1.schema1.table1")
 
-# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes 
+# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
 valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
 
 # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
@@ -220,7 +220,7 @@ dq_engine = DQEngine(WorkspaceClient())
 
 input_df = spark.read.table("catalog1.schema1.table1")
 
-# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes 
+# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
 valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
 
 # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
@@ -241,21 +241,26 @@ from databricks.sdk import WorkspaceClient
 dq_engine = DQEngine(WorkspaceClient())
 
 checks = DQRuleColSet( # define rule for multiple columns at once
-        columns=["col1", "col2"], 
-        criticality="error", 
+        columns=["col1", "col2"],
+        criticality="error",
         check_func=is_not_null).get_rules() + [
     DQRule( # define rule for a single column
-        name='col3_is_null_or_empty',
-        criticality='error',
-        check=is_not_null_and_not_empty('col3')),
+        name="col3_is_null_or_empty",
+        criticality="error",
+        check=is_not_null_and_not_empty("col3")),
+    DQRule( # define rule with a filter
+        name="col_4_is_null_or_empty",
+        criticality="error",
+        filter="col1 < 3",
+        check=is_not_null_and_not_empty("col4")),
     DQRule( # name auto-generated if not provided
-        criticality='warn',
-        check=value_is_in_list('col4', ['1', '2']))
+        criticality="warn",
+        check=value_is_in_list("col4", ["1", "2"]))
     ]
 
 input_df = spark.read.table("catalog1.schema1.table1")
 
-# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes 
+# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
 valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks)
 
 # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
@@ -288,6 +293,13 @@ checks = yaml.safe_load("""
     arguments:
      col_name: col3
 
+- criticality: error
+  filter: col1 < 3
+  check:
+    function: is_not_null_and_not_empty
+    arguments:
+      col_name: col4
+
 - criticality: warn
   check:
     function: value_is_in_list
@@ -300,7 +312,7 @@ checks = yaml.safe_load("""
 
 input_df = spark.read.table("catalog1.schema1.table1")
 
-# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes 
+# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
 valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
 
 # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
@@ -411,3 +423,27 @@ dq_engine = DQEngine(ws)
 For details on the specific methods available in the engine, visit to the [reference](/docs/reference#dq-engine-methods) section.
 
 Information on testing applications that use `DQEngine` can be found [here](/docs/reference#testing-applications-using-dqx).
+
+## Additional Configuration
+
+### Customizing Reporting Error and Warning Columns
+
+By default, DQX appends `_error` and `_warning` reporting columns to the output DataFrame to flag quality issues.
+
+You can customize the names of these reporting columns by specifying additional configurations in the engine.
+
+```python
+from databricks.sdk import WorkspaceClient
+from databricks.labs.dqx.engine import (
+    DQEngine,
+    ExtraParams,
+)
+
+# customize reporting column names
+extra_parameters = ExtraParams(column_names={"errors": "dq_errors", "warnings": "dq_warnings"})
+
+ws = WorkspaceClient()
+dq_engine = DQEngine(ws, extra_params=extra_parameters)
+```
+
+
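
Taken together, the two guide additions above can be combined in one short example: a filtered rule reported under the customized column names. The sample data, and the expectation that only rows matching the filter are flagged, are assumptions for illustration:

```python
# Sketch: a filtered rule plus custom reporting column names.
# Sample data is assumed; only rows where col1 < 3 should be checked.
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine, DQRule, ExtraParams
from databricks.labs.dqx.col_functions import is_not_null_and_not_empty

extra_parameters = ExtraParams(column_names={"errors": "dq_errors", "warnings": "dq_warnings"})
dq_engine = DQEngine(WorkspaceClient(), extra_params=extra_parameters)

input_df = spark.createDataFrame([[1, None], [5, None]], "col1: int, col4: string")

checks = [
    DQRule(
        name="col_4_is_null_or_empty",
        criticality="error",
        filter="col1 < 3",
        check=is_not_null_and_not_empty("col4")),
]

result_df = dq_engine.apply_checks(input_df, checks)
# issues appear in the renamed reporting columns
result_df.select("col1", "col4", "dq_errors", "dq_warnings").show()
```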
