Feature generate from data revised #204

Open · ronanstokes-db wants to merge 56 commits into master from feature_generate_from_data_revised

Changes from 1 commit (of 56 commits)
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
0671ccd
added generated docs
ronanstokes-db Apr 9, 2023
7583538
wip
ronanstokes-db Apr 9, 2023
70fc782
wip
ronanstokes-db Apr 11, 2023
8a02f07
wip
ronanstokes-db Apr 11, 2023
1acc134
wip
ronanstokes-db Apr 12, 2023
693aeac
wip
ronanstokes-db Apr 12, 2023
bd86b40
wip
ronanstokes-db Apr 12, 2023
f4e486f
wip
ronanstokes-db Apr 13, 2023
ed0540f
wip
ronanstokes-db Apr 13, 2023
73472d4
wip
ronanstokes-db Apr 13, 2023
03b6062
wip
ronanstokes-db Apr 13, 2023
54bcc84
wip
ronanstokes-db Apr 13, 2023
32f594a
wip
ronanstokes-db Apr 13, 2023
71a7b73
wip
ronanstokes-db Apr 13, 2023
42397d3
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 13, 2023
5156ded
wip
ronanstokes-db Apr 15, 2023
f8aebc1
wip
ronanstokes-db Apr 15, 2023
0243ec5
wip
ronanstokes-db Apr 15, 2023
80d26f6
wip
ronanstokes-db Apr 15, 2023
907fc65
wip
ronanstokes-db Apr 15, 2023
306a3b5
wip
ronanstokes-db Apr 15, 2023
1ceff85
wip
ronanstokes-db Apr 15, 2023
02d8ff9
wip
ronanstokes-db Apr 16, 2023
575d259
wip
ronanstokes-db Apr 17, 2023
3ecca3e
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 18, 2023
e9c707e
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 18, 2023
256c604
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Apr 20, 2023
00e7528
Merge branch 'feature_generate_from_data_revised' of https://github.c…
ronanstokes-db Apr 21, 2023
7b84aee
wip
ronanstokes-db Apr 21, 2023
b64262f
wip
ronanstokes-db Apr 21, 2023
7f19a57
wip
ronanstokes-db Apr 23, 2023
4a172d6
wip
ronanstokes-db Apr 23, 2023
9e99800
wip
ronanstokes-db Apr 23, 2023
95f7483
wip
ronanstokes-db Apr 23, 2023
58edcd3
wip
ronanstokes-db Apr 24, 2023
c9438c8
wip
ronanstokes-db Apr 24, 2023
802cc84
wip
ronanstokes-db May 9, 2023
64a7e0b
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db May 30, 2023
2b1f46a
Merge branch 'master' into feature_generate_from_data_revised
ronanstokes-db Aug 25, 2023
f61be38
Update data_analyzer.py
ronanstokes-db Aug 26, 2023
ffe47ca
Update generating_from_existing_data.rst
ronanstokes-db Aug 26, 2023
2e1b36b
merged changes from upstream
ronanstokes-db Aug 26, 2023
afefde8
corrected typo
ronanstokes-db Aug 28, 2023
98a3e63
Update test_generation_from_data.py
ronanstokes-db Aug 28, 2023
79f3463
Update data_analyzer.py
ronanstokes-db Aug 28, 2023
f1d2dab
Update data_analyzer.py
ronanstokes-db Aug 29, 2023
0648b03
Merge branch 'feature_generate_from_data_revised' of https://github.c…
ronanstokes-db Aug 29, 2023
cd0b85f
wip
ronanstokes-db Aug 29, 2023
c5aab78
wip
ronanstokes-db Aug 29, 2023
79cc979
disable warning about undocumented abstract method
ronanstokes-db Aug 29, 2023
d6f14eb
wip
ronanstokes-db Aug 31, 2023
wip
ronanstokes-db committed Apr 21, 2023
commit b64262f6409cf76befbc8bfd64bc49fd58e7f949
133 changes: 121 additions & 12 deletions dbldatagen/data_analyzer.py
@@ -12,20 +12,18 @@
"""
import logging
from collections import namedtuple
import pprint
from collections import namedtuple

import numpy as np

import pyspark.sql.functions as F
from pyspark import sql
from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType, MapType

from pyspark import sql
import pyspark.sql.functions as F

from .utils import strip_margins, json_value_from_path
from .spark_singleton import SparkSingleton
from .html_utils import HtmlUtils
from .spark_singleton import SparkSingleton
from .utils import strip_margins, json_value_from_path


class DataAnalyzer:
@@ -148,12 +146,9 @@ def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, d
         # add measures for fields
         exprs.extend(fieldExprs)

-        if dfSummary is not None:
-            dfResult = dfSummary.union(dfData.selectExpr(*exprs).limit(rowLimit))
-        else:
-            dfResult = dfData.selectExpr(*exprs).limit(rowLimit)
+        dfMeasure = dfData.selectExpr(*exprs).limit(rowLimit) if rowLimit is not None else dfData.selectExpr(*exprs)

-        return dfResult
+        return dfSummary.union(dfMeasure) if dfSummary is not None else dfMeasure

     @staticmethod
     def _is_numeric_type(dtype):
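The hunk above changes behaviour as well as shape: the old code always applied limit(rowLimit), whereas the rewrite skips the limit entirely when rowLimit is None, which the new generateTextFeatures method below relies on when it passes rowLimit=None. A minimal sketch of that path, assuming a live SparkSession named `spark`:

    # rowLimit=None now means "no limit" rather than limit(None)
    df = spark.range(5).selectExpr("id", "'measure' as measure_name")
    rowLimit = None
    dfMeasure = df.limit(rowLimit) if rowLimit is not None else df
    assert dfMeasure.count() == 5  # all rows retained when rowLimit is None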
@@ -223,6 +218,112 @@ def _compute_pattern_match_clauses(self):
         result = stmts  # "\n".join(stmts)
         return result

+    def generateTextFeatures(self, sourceDf):
+        """ Generate text features from source dataframe
+
+        Generates a set of text features for each column (analyzing the string representation of each column value)
+
+        :param sourceDf: Source dataframe
+        :return: Dataframe of text features
+        """
+        # generate named struct of text features for each column
+
+        # we need to double escape backslashes in regular expressions as they will be lost in string expansion
+        WORD_REGEX = r"\\b\\w+\\b"
+        SPACE_REGEX = r"\\s+"
+        DIGIT_REGEX = r"\\d"
+        PUNCTUATION_REGEX = r"[\\?\\.\\;\\,\\!\\{\\}\\[\\]\\(\\)\\>\\<]"
+        AT_REGEX = r"\\@"
+        PERIOD_REGEX = r"\\."
+        HTTP_REGEX = r"^http[s]?\\:\\/\\/"
+        ALPHA_REGEX = r"[a-zA-Z]"
+        ALPHA_UPPER_REGEX = r"[A-Z]"
+        ALPHA_LOWER_REGEX = r"[a-z]"
+        HEX_REGEX = r"[0-9a-fA-F]"
+
+        # for each column, extract text features from the string representation of the
+        # column value (leftmost 4096 characters only)
+        def left4k(name):
+            return f"left(string({name}), 4096)"
+
+        fieldTextFeatures = []
+
+        for colInfo in self.columnsInfo:
+            fieldTextFeatures.append(
+                strip_margins(
+                    f"""named_struct(
+                       | 'print_len', length(string({colInfo.name})),
+                       | 'word_count', size(regexp_extract_all({left4k(colInfo.name)}, '{WORD_REGEX}', 0)),
+                       | 'space_count', size(regexp_extract_all({left4k(colInfo.name)}, '{SPACE_REGEX}', 0)),
+                       | 'digit_count', size(regexp_extract_all({left4k(colInfo.name)}, '{DIGIT_REGEX}', 0)),
+                       | 'punctuation_count', size(regexp_extract_all({left4k(colInfo.name)}, '{PUNCTUATION_REGEX}', 0)),
+                       | 'at_count', size(regexp_extract_all({left4k(colInfo.name)}, '{AT_REGEX}', 0)),
+                       | 'period_count', size(regexp_extract_all({left4k(colInfo.name)}, '{PERIOD_REGEX}', 0)),
+                       | 'http_count', size(regexp_extract_all({left4k(colInfo.name)}, '{HTTP_REGEX}', 0)),
+                       | 'alpha_count', size(regexp_extract_all({left4k(colInfo.name)}, '{ALPHA_REGEX}', 0)),
+                       | 'alpha_lower_count', size(regexp_extract_all({left4k(colInfo.name)}, '{ALPHA_LOWER_REGEX}', 0)),
+                       | 'alpha_upper_count', size(regexp_extract_all({left4k(colInfo.name)}, '{ALPHA_UPPER_REGEX}', 0)),
+                       | 'hex_digit_count', size(regexp_extract_all({left4k(colInfo.name)}, '{HEX_REGEX}', 0))
+                       | )
+                       | as {colInfo.name}""", marginChar="|")
+            )
+
+        dfTextFeatures = self._addMeasureToSummary(
+            'text_features',
+            fieldExprs=fieldTextFeatures,
+            dfData=sourceDf,
+            dfSummary=None,
+            rowLimit=None)
+
+        return dfTextFeatures
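To make the double escaping concrete, a minimal standalone sketch (not taken from the diff; assumes Spark 3.1+ and a local SparkSession named `spark`) of what one generated per-column expression evaluates to:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([("hello world 42",)], ["title"])

    # the doubled backslashes collapse once inside the SQL string literal,
    # so Spark's regex engine sees \b\w+\b
    WORD_REGEX = r"\\b\\w+\\b"
    df.selectExpr(
        f"named_struct('print_len', length(string(title)), "
        f"'word_count', size(regexp_extract_all(left(string(title), 4096), '{WORD_REGEX}', 0))) as title"
    ).show(truncate=False)
    # expected: {14, 3} -- 14 printable characters, 3 words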

+    def _summarizeTextFeatures(self, textFeaturesDf):
+        """
+        Generate summary of text features
+
+        :param textFeaturesDf: Text features dataframe
+        :return: dataframe of summary text features
+        """
+        assert textFeaturesDf is not None, "textFeaturesDf must be specified"
+
+        # generate named struct of summary text features for each column
+        fieldTextFeatures = []
+
+        # TODO: use json syntax asin:print_len when migrating to Spark 10.4LTS as minimum version
+
+        for colInfo in self.columnsInfo:
+            cname = colInfo.name
+            fieldTextFeatures.append(strip_margins(
+                f"""to_json(named_struct(
+                   | 'print_len', array(min({cname}.print_len), max({cname}.print_len), avg({cname}.print_len)),
+                   | 'word_count', array(min({cname}.word_count), max({cname}.word_count), avg({cname}.word_count)),
+                   | 'space_count', array(min({cname}.space_count), max({cname}.space_count), avg({cname}.space_count)),
+                   | 'digit_count', array(min({cname}.digit_count), max({cname}.digit_count), avg({cname}.digit_count)),
+                   | 'punctuation_count', array(min({cname}.punctuation_count), max({cname}.punctuation_count),
+                   |                            avg({cname}.punctuation_count)),
+                   | 'at_count', array(min({cname}.at_count), max({cname}.at_count), avg({cname}.at_count)),
+                   | 'period_count', array(min({cname}.period_count), max({cname}.period_count),
+                   |                       avg({cname}.period_count)),
+                   | 'http_count', array(min({cname}.http_count), max({cname}.http_count), avg({cname}.http_count)),
+                   | 'alpha_count', array(min({cname}.alpha_count), max({cname}.alpha_count), avg({cname}.alpha_count)),
+                   | 'alpha_lower_count', array(min({cname}.alpha_lower_count), max({cname}.alpha_lower_count),
+                   |                            avg({cname}.alpha_lower_count)),
+                   | 'alpha_upper_count', array(min({cname}.alpha_upper_count), max({cname}.alpha_upper_count),
+                   |                            avg({cname}.alpha_upper_count)),
+                   | 'hex_digit_count', array(min({cname}.hex_digit_count), max({cname}.hex_digit_count),
+                   |                          avg({cname}.hex_digit_count))
+                   | ))
+                   | as {cname}""", marginChar="|")
+            )
+
+        dfSummaryTextFeatures = self._addMeasureToSummary(
+            'summary_text_features',
+            fieldExprs=fieldTextFeatures,
+            dfData=textFeaturesDf,
+            dfSummary=None,
+            rowLimit=1)
+
+        return dfSummaryTextFeatures
+
     def summarizeToDF(self):
         """ Generate summary analysis of data set as dataframe
@@ -368,6 +469,14 @@ def summarizeToDF(self):
                                                dfData=df_under_analysis,
                                                dfSummary=dfDataSummary)

+        logger.info("Analyzing text features")
+        dfTextFeatures = self.generateTextFeatures(self._getExpandedSourceDf())
+
+        logger.info("Summarizing text features")
+        dfTextFeaturesSummary = self._summarizeTextFeatures(dfTextFeatures)
+
+        dfDataSummary = dfDataSummary.union(dfTextFeaturesSummary)
+
         return dfDataSummary

     def summarize(self, suppressOutput=False):
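With these hunks applied, the text-feature summary rows ride along with the regular summary. A usage sketch (hedged: assumes a SparkSession `spark`, a source dataframe `df`, and that the summary keeps the `measure` discriminator column that `_addMeasureToSummary` populates from its measureName argument):

    import dbldatagen as dg

    analyzer = dg.DataAnalyzer(sparkSession=spark, df=df)
    summary_df = analyzer.summarizeToDF()
    # the new rows are tagged with the 'summary_text_features' measure name
    summary_df.where("measure = 'summary_text_features'").show()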
79 changes: 48 additions & 31 deletions tests/test_generation_from_data.py
@@ -17,7 +17,7 @@ def setupLogging():


 class TestGenerationFromData:
-    SMALL_ROW_COUNT = 10000
+    SMALL_ROW_COUNT = 1000

     @pytest.fixture
     def testLogger(self):
@@ -68,66 +68,86 @@ def generation_spec(self):
.withColumn("int_value", "int", min=100, max=200, percentNulls=0.1)
.withColumn("byte_value", "tinyint", max=127)
.withColumn("decimal_value", "decimal(10,2)", max=1000000)
.withColumn("decimal_value", "decimal(10,2)", max=1000000)
.withColumn("date_value", "date", expr="current_date()", random=True)
.withColumn("binary_value", "binary", expr="cast('spark' as binary)", random=True)

)
return spec

def test_code_generation1(self, generation_spec, setupLogging):
@pytest.fixture
def source_data_df(self, generation_spec):
df_source_data = generation_spec.build()
df_source_data.show()
return df_source_data.cache()

def test_code_generation1(self, source_data_df, setupLogging):
source_data_df.show()

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

generatedCode = analyzer.scriptDataGeneratorFromData()

for fld in df_source_data.schema:
for fld in source_data_df.schema:
assert f"withColumn('{fld.name}'" in generatedCode

# check generated code for syntax errors
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

def test_code_generation_from_schema(self, generation_spec, setupLogging):
df_source_data = generation_spec.build()
generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema)
def test_code_generation_from_schema(self, source_data_df, setupLogging):
generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(source_data_df.schema)

for fld in df_source_data.schema:
for fld in source_data_df.schema:
assert f"withColumn('{fld.name}'" in generatedCode

# check generated code for syntax errors
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

def test_summarize(self, testLogger, generation_spec):
testLogger.info("Building test data")

df_source_data = generation_spec.build()
def test_summarize(self, testLogger, source_data_df):

testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

testLogger.info("Summarizing data analyzer results")
analyzer.summarize()

def test_summarize_to_df(self, generation_spec, testLogger):
testLogger.info("Building test data")

df_source_data = generation_spec.build()

def test_summarize_to_df(self, source_data_df, testLogger):
testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

testLogger.info("Summarizing data analyzer results")
df = analyzer.summarizeToDF()

#df.show()
df.show()

def test_generate_text_features(self, source_data_df, testLogger):
testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

df_text_features = analyzer.generateTextFeatures(source_data_df).limit(10)
df_text_features.show()

#data = df_text_features.selectExpr("get_json_object(asin, '$.print_len') as asin").limit(10).collect()
data = df_text_features.selectExpr("asin.print_len as asin").limit(10).collect()
assert data[0]['asin'] is not None
print(data[0]['asin'] )

def test_summarize_text_features(self, source_data_df, testLogger):
testLogger.info("Creating data analyzer")

analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

df_text_features = analyzer.generateTextFeatures(source_data_df)
df_summary_text_features = analyzer._summarizeTextFeatures(df_text_features)
df_summary_text_features.show()

data = df_summary_text_features.selectExpr("get_json_object(asin, '$.print_len') as asin").limit(10).collect()
assert data[0]['asin'] is not None
print(data[0]['asin'])

df_source_data.where("title is null or length(title) = 0").show()

@pytest.mark.parametrize("sampleString, expectedMatch",
[("0234", "digits"),
@@ -141,10 +161,8 @@ def test_summarize_to_df(self, generation_spec, testLogger):
("test_function", "identifier"),
("10.0.0.1", "ip_addr")
])
def test_match_patterns(self, sampleString, expectedMatch, generation_spec):
df_source_data = generation_spec.build()

analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data)
def test_match_patterns(self, sampleString, expectedMatch, source_data_df):
analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df)

pattern_match_result = ""
for k, v in analyzer._regex_patterns.items():
@@ -156,11 +174,10 @@ def test_match_patterns(self, sampleString, expectedMatch, generation_spec):

         assert pattern_match_result == expectedMatch, f"expected match to be {expectedMatch}"

-    def test_source_data_property(self, generation_spec):
-        df_source_data = generation_spec.build()
-        analyzer = dg.DataAnalyzer(sparkSession=spark, df=df_source_data, maxRows=1000)
+    def test_source_data_property(self, source_data_df):
+        analyzer = dg.DataAnalyzer(sparkSession=spark, df=source_data_df, maxRows=500)

         count_rows = analyzer.sourceSampleDf.count()
         print(count_rows)
-        assert abs(count_rows - 1000) < 100, "expected count to be close to 1000"
+        assert abs(count_rows - 500) < 50, "expected count to be close to 500"
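As the tests above illustrate, raw text features are read back with struct field access (asin.print_len) while the summarized features come back as JSON strings. A hedged standalone sketch of the JSON readback (not from the diff; assumes a SparkSession named `spark`):

    df = spark.createDataFrame([('{"print_len":[1.0,20.0,9.5]}',)], ["asin"])
    row = df.selectExpr("get_json_object(asin, '$.print_len') as asin").collect()[0]
    print(row["asin"])  # [1.0,20.0,9.5] -- the [min, max, avg] array as a JSON string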