Feature generate from data revised #204

Open · ronanstokes-db wants to merge 56 commits into master from feature_generate_from_data_revised

Changes from 1 commit (of 56 commits)
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
0671ccd
added generated docs
ronanstokes-db Apr 9, 2023
7583538
wip
ronanstokes-db Apr 9, 2023
70fc782
wip
ronanstokes-db Apr 11, 2023
8a02f07
wip
ronanstokes-db Apr 11, 2023
1acc134
wip
ronanstokes-db Apr 12, 2023
693aeac
wip
ronanstokes-db Apr 12, 2023
bd86b40
wip
ronanstokes-db Apr 12, 2023
f4e486f
wip
ronanstokes-db Apr 13, 2023
ed0540f
wip
ronanstokes-db Apr 13, 2023
73472d4
wip
ronanstokes-db Apr 13, 2023
03b6062
wip
ronanstokes-db Apr 13, 2023
54bcc84
wip
ronanstokes-db Apr 13, 2023
32f594a
wip
ronanstokes-db Apr 13, 2023
71a7b73
wip
ronanstokes-db Apr 13, 2023
42397d3
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 13, 2023
5156ded
wip
ronanstokes-db Apr 15, 2023
f8aebc1
wip
ronanstokes-db Apr 15, 2023
0243ec5
wip
ronanstokes-db Apr 15, 2023
80d26f6
wip
ronanstokes-db Apr 15, 2023
907fc65
wip
ronanstokes-db Apr 15, 2023
306a3b5
wip
ronanstokes-db Apr 15, 2023
1ceff85
wip
ronanstokes-db Apr 15, 2023
02d8ff9
wip
ronanstokes-db Apr 16, 2023
575d259
wip
ronanstokes-db Apr 17, 2023
3ecca3e
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 18, 2023
e9c707e
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 18, 2023
256c604
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 20, 2023
00e7528
Merge branch 'feature_generate_from_data_revised' of https://github.c…
ronanstokes-db Apr 21, 2023
7b84aee
wip
ronanstokes-db Apr 21, 2023
b64262f
wip
ronanstokes-db Apr 21, 2023
7f19a57
wip
ronanstokes-db Apr 23, 2023
4a172d6
wip
ronanstokes-db Apr 23, 2023
9e99800
wip
ronanstokes-db Apr 23, 2023
95f7483
wip
ronanstokes-db Apr 23, 2023
58edcd3
wip
ronanstokes-db Apr 24, 2023
c9438c8
wip
ronanstokes-db Apr 24, 2023
802cc84
wip
ronanstokes-db May 9, 2023
64a7e0b
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db May 30, 2023
2b1f46a
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Aug 25, 2023
f61be38
Update data_analyzer.py
ronanstokes-db Aug 26, 2023
ffe47ca
Update generating_from_existing_data.rst
ronanstokes-db Aug 26, 2023
2e1b36b
merged changes from upstream
ronanstokes-db Aug 26, 2023
afefde8
corrected typo
ronanstokes-db Aug 28, 2023
98a3e63
Update test_generation_from_data.py
ronanstokes-db Aug 28, 2023
79f3463
Update data_analyzer.py
ronanstokes-db Aug 28, 2023
f1d2dab
Update data_analyzer.py
ronanstokes-db Aug 29, 2023
0648b03
Merge branch 'feature_generate_from_data_revised' of https://github.c…
ronanstokes-db Aug 29, 2023
cd0b85f
wip
ronanstokes-db Aug 29, 2023
c5aab78
wip
ronanstokes-db Aug 29, 2023
79cc979
disable warning about undocumented abstract method
ronanstokes-db Aug 29, 2023
d6f14eb
wip
ronanstokes-db Aug 31, 2023
wip
ronanstokes-db committed Apr 21, 2023
commit b64262f6409cf76befbc8bfd64bc49fd58e7f949
133 changes: 121 additions & 12 deletions dbldatagen/data_analyzer.py
@@ -12,20 +12,18 @@
"""
import logging
from collections import namedtuple
import pprint
from collections import namedtuple

import numpy as np

import pyspark.sql.functions as F
from pyspark import sql
from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType, MapType

from pyspark import sql
import pyspark.sql.functions as F

from .utils import strip_margins, json_value_from_path
from .spark_singleton import SparkSingleton
from .html_utils import HtmlUtils
from .spark_singleton import SparkSingleton
from .utils import strip_margins, json_value_from_path


class DataAnalyzer:
@@ -148,12 +146,9 @@ def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, d
         # add measures for fields
         exprs.extend(fieldExprs)

-        if dfSummary is not None:
-            dfResult = dfSummary.union(dfData.selectExpr(*exprs).limit(rowLimit))
-        else:
-            dfResult = dfData.selectExpr(*exprs).limit(rowLimit)
+        dfMeasure = dfData.selectExpr(*exprs).limit(rowLimit) if rowLimit is not None else dfData.selectExpr(*exprs)

-        return dfResult
+        return dfSummary.union(dfMeasure) if dfSummary is not None else dfMeasure

     @staticmethod
     def _is_numeric_type(dtype):
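The hunk above changes behaviour as well as shape: the old code always applied limit(rowLimit), whereas the rewrite skips the limit entirely when rowLimit is None, which the new generateTextFeatures method below relies on when it passes rowLimit=None. A minimal sketch of that path, assuming a live SparkSession named `spark`:

    # rowLimit=None now means "no limit" rather than limit(None)
    df = spark.range(5).selectExpr("id", "'measure' as measure_name")
    rowLimit = None
    dfMeasure = df.limit(rowLimit) if rowLimit is not None else df
    assert dfMeasure.count() == 5  # all rows retained when rowLimit is None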
@@ -223,6 +218,112 @@ def _compute_pattern_match_clauses(self):
         result = stmts  # "\n".join(stmts)
         return result

+    def generateTextFeatures(self, sourceDf):
+        """ Generate text features from source dataframe
+
+        Generates a set of text features for each column (analyzing the string representation of each column value)
+
+        :param sourceDf: Source dataframe
+        :return: Dataframe of text features
+        """
+        # generate named struct of text features for each column
+
+        # we need to double escape backslashes in regular expressions as they will be lost in string expansion
+        WORD_REGEX = r"\\b\\w+\\b"
+        SPACE_REGEX = r"\\s+"
+        DIGIT_REGEX = r"\\d"
+        PUNCTUATION_REGEX = r"[\\?\\.\\;\\,\\!\\{\\}\\[\\]\\(\\)\\>\\<]"
+        AT_REGEX = r"\\@"
+        PERIOD_REGEX = r"\\."
+        HTTP_REGEX = r"^http[s]?\\:\\/\\/"
+        ALPHA_REGEX = r"[a-zA-Z]"
+        ALPHA_UPPER_REGEX = r"[A-Z]"
+        ALPHA_LOWER_REGEX = r"[a-z]"
+        HEX_REGEX = r"[0-9a-fA-F]"
+
+        # for each column, extract text features from the string representation of the
+        # column value (leftmost 4096 characters only)
+        def left4k(name):
+            return f"left(string({name}), 4096)"
+
+        fieldTextFeatures = []
+
+        for colInfo in self.columnsInfo:
+            fieldTextFeatures.append(
+                strip_margins(
+                    f"""named_struct(
+                       | 'print_len', length(string({colInfo.name})),
+                       | 'word_count', size(regexp_extract_all({left4k(colInfo.name)}, '{WORD_REGEX}', 0)),
+                       | 'space_count', size(regexp_extract_all({left4k(colInfo.name)}, '{SPACE_REGEX}', 0)),
+                       | 'digit_count', size(regexp_extract_all({left4k(colInfo.name)}, '{DIGIT_REGEX}', 0)),
+                       | 'punctuation_count', size(regexp_extract_all({left4k(colInfo.name)}, '{PUNCTUATION_REGEX}', 0)),
+                       | 'at_count', size(regexp_extract_all({left4k(colInfo.name)}, '{AT_REGEX}', 0)),
+                       | 'period_count', size(regexp_extract_all({left4k(colInfo.name)}, '{PERIOD_REGEX}', 0)),
+                       | 'http_count', size(regexp_extract_all({left4k(colInfo.name)}, '{HTTP_REGEX}', 0)),
+                       | 'alpha_count', size(regexp_extract_all({left4k(colInfo.name)}, '{ALPHA_REGEX}', 0)),
+                       | 'alpha_lower_count', size(regexp_extract_all({left4k(colInfo.name)}, '{ALPHA_LOWER_REGEX}', 0)),
+                       | 'alpha_upper_count', size(regexp_extract_all({left4k(colInfo.name)}, '{ALPHA_UPPER_REGEX}', 0)),
+                       | 'hex_digit_count', size(regexp_extract_all({left4k(colInfo.name)}, '{HEX_REGEX}', 0))
+                       | )
+                       | as {colInfo.name}""", marginChar="|")
+            )
+
+        dfTextFeatures = self._addMeasureToSummary(
+            'text_features',
+            fieldExprs=fieldTextFeatures,
+            dfData=sourceDf,
+            dfSummary=None,
+            rowLimit=None)
+
+        return dfTextFeatures
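To make the double escaping concrete, a minimal standalone sketch (not taken from the diff; assumes Spark 3.1+ and a local SparkSession named `spark`) of what one generated per-column expression evaluates to:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([("hello world 42",)], ["title"])

    # the doubled backslashes collapse once inside the SQL string literal,
    # so Spark's regex engine sees \b\w+\b
    WORD_REGEX = r"\\b\\w+\\b"
    df.selectExpr(
        f"named_struct('print_len', length(string(title)), "
        f"'word_count', size(regexp_extract_all(left(string(title), 4096), '{WORD_REGEX}', 0))) as title"
    ).show(truncate=False)
    # expected: {14, 3} -- 14 printable characters, 3 words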

+    def _summarizeTextFeatures(self, textFeaturesDf):
+        """
+        Generate summary of text features
+
+        :param textFeaturesDf: Text features dataframe
+        :return: dataframe of summary text features
+        """
+        assert textFeaturesDf is not None, "textFeaturesDf must be specified"
+
+        # generate named struct of summary text features for each column
+        fieldTextFeatures = []
+
+        # TODO: use json syntax asin:print_len when migrating to Spark 10.4LTS as minimum version
+
+        for colInfo in self.columnsInfo:
+            cname = colInfo.name
+            fieldTextFeatures.append(strip_margins(
+                f"""to_json(named_struct(
+                   | 'print_len', array(min({cname}.print_len), max({cname}.print_len), avg({cname}.print_len)),
+                   | 'word_count', array(min({cname}.word_count), max({cname}.word_count), avg({cname}.word_count)),
+                   | 'space_count', array(min({cname}.space_count), max({cname}.space_count), avg({cname}.space_count)),
+                   | 'digit_count', array(min({cname}.digit_count), max({cname}.digit_count), avg({cname}.digit_count)),
+                   | 'punctuation_count', array(min({cname}.punctuation_count), max({cname}.punctuation_count),
+                   |                            avg({cname}.punctuation_count)),
+                   | 'at_count', array(min({cname}.at_count), max({cname}.at_count), avg({cname}.at_count)),
+                   | 'period_count', array(min({cname}.period_count), max({cname}.period_count),
+                   |                       avg({cname}.period_count)),
+                   | 'http_count', array(min({cname}.http_count), max({cname}.http_count), avg({cname}.http_count)),
+                   | 'alpha_count', array(min({cname}.alpha_count), max({cname}.alpha_count), avg({cname}.alpha_count)),
+                   | 'alpha_lower_count', array(min({cname}.alpha_lower_count), max({cname}.alpha_lower_count),
+                   |                            avg({cname}.alpha_lower_count)),
+                   | 'alpha_upper_count', array(min({cname}.alpha_upper_count), max({cname}.alpha_upper_count),
+                   |                            avg({cname}.alpha_upper_count)),
+                   | 'hex_digit_count', array(min({cname}.hex_digit_count), max({cname}.hex_digit_count),
+                   |                          avg({cname}.hex_digit_count))
+                   | ))
+                   | as {cname}""", marginChar="|")
+            )
+
+        dfSummaryTextFeatures = self._addMeasureToSummary(
+            'summary_text_features',
+            fieldExprs=fieldTextFeatures,
+            dfData=textFeaturesDf,
+            dfSummary=None,
+            rowLimit=1)
+
+        return dfSummaryTextFeatures
+
     def summarizeToDF(self):
         """ Generate summary analysis of data set as dataframe
@@ -368,6 +469,14 @@ def summarizeToDF(self):
                                                dfData=df_under_analysis,
                                                dfSummary=dfDataSummary)

+        logger.info("Analyzing text features")
+        dfTextFeatures = self.generateTextFeatures(self._getExpandedSourceDf())
+
+        logger.info("Summarizing text features")
+        dfTextFeaturesSummary = self._summarizeTextFeatures(dfTextFeatures)
+
+        dfDataSummary = dfDataSummary.union(dfTextFeaturesSummary)
+
         return dfDataSummary

     def summarize(self, suppressOutput=False):
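With these hunks applied, the text-feature summary rows ride along with the regular summary. A usage sketch (hedged: assumes a SparkSession `spark`, a source dataframe `df`, and that the summary keeps the `measure` discriminator column that `_addMeasureToSummary` populates from its measureName argument):

    import dbldatagen as dg

    analyzer = dg.DataAnalyzer(sparkSession=spark, df=df)
    summary_df = analyzer.summarizeToDF()
    # the new rows are tagged with the 'summary_text_features' measure name
    summary_df.where("measure = 'summary_text_features'").show()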
79 changes: 48 additions & 31 deletions tests/test_generation_from_data.py
@@ -17,7 +17,7 @@ def setupLogging():


 class TestGenerationFromData:
-    SMALL_ROW_COUNT = 10000
+    SMALL_ROW_COUNT = 1000

     @pytest.fixture
     def testLogger(self):
@@ -68,66 +68,86 @@ def generation_spec(self):
.withColumn("int_value", "int", min=100, max=200, percentNulls=0.1)
.withColumn("byte_value", "tinyint", max=127)
.withColumn("decimal_value", "decimal(10,2)", max=1000000)
.withColumn("decimal_value", "decimal(10,2)", max=1000000)
.withColumn("date_value", "date", expr="current_date()", random=True)
.withColumn("binary_value", "binary", expr="cast('spark' as binary)", random=True)

)
return spec

def test_code_generation1(self, generation_spec, setupLogging):
@pytest.fixture
def source_data_df(self, generation_spec):
df_source_data = generation_spec.build()
df_source_data.show()
return df_source_data.cache()

def test_code_generation1(self, source_data_df, setupLogging):
source_data_df.show()

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

generatedCode = analyzer.scriptDataGeneratorFromData()

for fld in df_source_data.schema:
for fld in source_data_df.schema:
assert f"withColumn('{fld.name}'" in generatedCode

# check generated code for syntax errors
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

def test_code_generation_from_schema(self, generation_spec, setupLogging):
df_source_data = generation_spec.build()
generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema)
def test_code_generation_from_schema(self, source_data_df, setupLogging):
generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(source_data_df.schema)

for fld in df_source_data.schema:
for fld in source_data_df.schema:
assert f"withColumn('{fld.name}'" in generatedCode

# check generated code for syntax errors
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

def test_summarize(self, testLogger, generation_spec):
testLogger.info("Building test data")

df_source_data = generation_spec.build()
def test_summarize(self, testLogger, source_data_df):

testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

testLogger.info("Summarizing data analyzer results")
analyzer.summarize()

def test_summarize_to_df(self, generation_spec, testLogger):
testLogger.info("Building test data")

df_source_data = generation_spec.build()

def test_summarize_to_df(self, source_data_df, testLogger):
testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

testLogger.info("Summarizing data analyzer results")
df = analyzer.summarizeToDF()

#df.show()
df.show()

def test_generate_text_features(self, source_data_df, testLogger):
testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

df_text_features = analyzer.generateTextFeatures(source_data_df).limit(10)
df_text_features.show()

#data = df_text_features.selectExpr("get_json_object(asin, '$.print_len') as asin").limit(10).collect()
data = df_text_features.selectExpr("asin.print_len as asin").limit(10).collect()
assert data[0]['asin'] is not None
print(data[0]['asin'] )

def test_summarize_text_features(self, source_data_df, testLogger):
testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

df_text_features = analyzer.generateTextFeatures(source_data_df)
df_summary_text_features = analyzer._summarizeTextFeatures(df_text_features)
df_summary_text_features.show()

data = df_summary_text_features.selectExpr("get_json_object(asin, '$.print_len') as asin").limit(10).collect()
assert data[0]['asin'] is not None
print(data[0]['asin'])

df_source_data.where("title is null or length(title) = 0").show()

@pytest.mark.parametrize("sampleString, expectedMatch",
[("0234", "digits"),
@@ -141,10 +161,8 @@ def test_summarize_to_df(self, generation_spec, testLogger):
("test_function", "identifier"),
("10.0.0.1", "ip_addr")
])
def test_match_patterns(self, sampleString, expectedMatch, generation_spec):
df_source_data = generation_spec.build()

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
def test_match_patterns(self, sampleString, expectedMatch, source_data_df):
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

pattern_match_result = ""
for k, v in analyzer._regex_patterns.items():
@@ -156,11 +174,10 @@ def test_match_patterns(self, sampleString, expectedMatch, generation_spec):

         assert pattern_match_result == expectedMatch, f"expected match to be {expectedMatch}"

-    def test_source_data_property(self, generation_spec):
-        df_source_data = generation_spec.build()
-        analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data, maxRows=1000)
+    def test_source_data_property(self, source_data_df):
+        analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df, maxRows=500)

         count_rows = analyzer.sourceSampleDf.count()
         print(count_rows)
-        assert abs(count_rows - 1000) < 100, "expected count to be close to 1000"
+        assert abs(count_rows - 500) < 50, "expected count to be close to 500"
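As the tests above illustrate, raw text features are read back with struct field access (asin.print_len) while the summarized features come back as JSON strings. A hedged standalone sketch of the JSON readback (not from the diff; assumes a SparkSession named `spark`):

    df = spark.createDataFrame([('{"print_len":[1.0,20.0,9.5]}',)], ["asin"])
    row = df.selectExpr("get_json_object(asin, '$.print_len') as asin").collect()[0]
    print(row["asin"])  # [1.0,20.0,9.5] -- the [min, max, avg] array as a JSON string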