
Commit 2482dca

Feature hotfixes (#274)
* hot fixes for data analyzer issues
* hotfixes for issues in DataAnalyzer
* changed comment wording
1 parent 1113d73 commit 2482dca

10 files changed (+110 / -15 lines changed)

CHANGELOG.md (+11)

@@ -3,6 +3,17 @@
 ## Change History
 All notable changes to the Databricks Labs Data Generator will be documented in this file.
 
+
+### Version 0.3.6 Post 1
+
+#### Changed
+* Updated docs for complex data types / JSON to correct code examples
+* Updated license file in public docs
+
+#### Fixed
+* Fixed scenario where `DataAnalyzer` is used on a dataframe containing a column named `summary`
+
+
 ### Version 0.3.6
 
 #### Changed
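
For context, the `summary` fix addresses the case where a source dataframe already has a column named `summary`, which previously collided with the column of the same name produced internally by `DataFrame.describe()`. A minimal sketch of the now-working scenario, assuming an active SparkSession named `spark`:

    import dbldatagen as dg

    # hypothetical reproduction: a dataframe whose own column is named `summary`
    df = spark.range(10).withColumnRenamed("id", "summary")

    # with this fix, DataAnalyzer renames the conflicting column internally
    # before calling describe(), then restores it afterwards
    summary_df = dg.DataAnalyzer(sparkSession=spark, df=df).summarizeToDF()
    summary_df.show()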

README.md (+1 / -1)

@@ -64,7 +64,7 @@ details of use and many examples.
 
 Release notes and details of the latest changes for this specific release
 can be found in the GitHub repository
-[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.6/CHANGELOG.md)
+[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.6post1/CHANGELOG.md)
 
 # Installation

dbldatagen/_version.py (+1 / -1)

@@ -34,7 +34,7 @@ def get_version(version):
     return version_info
 
 
-__version__ = "0.3.6"  # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
+__version__ = "0.3.6post1"  # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
 __version_info__ = get_version(__version__)
 

dbldatagen/data_analyzer.py (+64 / -8)

@@ -7,14 +7,18 @@
 
 This code is experimental and both APIs and code generated is liable to change in future versions.
 """
-from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
-    TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType
+import logging
 
 import pyspark.sql as ssql
-import pyspark.sql.functions as F
+from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
+    TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType
 
-from .utils import strip_margins
 from .spark_singleton import SparkSingleton
+from .utils import strip_margins
+
+SUMMARY_FIELD_NAME = "summary"
+SUMMARY_FIELD_NAME_RENAMED = "__summary__"
+DATA_SUMMARY_FIELD_NAME = "__data_summary__"
 
 
 class DataAnalyzer:
@@ -23,6 +27,8 @@ class DataAnalyzer:
 
     :param df: Spark dataframe to analyze
     :param sparkSession: Spark session instance to use when performing spark operations
+    :param debug: If True, additional debug information is logged
+    :param verbose: If True, additional information is logged
 
     .. warning::
        Experimental
@@ -43,11 +49,17 @@ class DataAnalyzer:
             |# Column definitions are stubs only - modify to generate correct data
             |#""", '|')
 
-    def __init__(self, df=None, sparkSession=None):
+    def __init__(self, df=None, sparkSession=None, debug=False, verbose=False):
         """ Constructor:
         :param df: Dataframe to analyze
         :param sparkSession: Spark session to use
         """
+        # set up logging
+        self.verbose = verbose
+        self.debug = debug
+
+        self._setupLogger()
+
         assert df is not None, "dataframe must be supplied"
 
         self._df = df
@@ -58,6 +70,19 @@ def __init__(self, df=None, sparkSession=None):
         self._sparkSession = sparkSession
         self._dataSummary = None
 
+    def _setupLogger(self):
+        """Set up logging
+
+        This will set the logger at warning, info or debug levels depending on the instance construction parameters
+        """
+        self.logger = logging.getLogger("DataAnalyzer")
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+        elif self.verbose:
+            self.logger.setLevel(logging.INFO)
+        else:
+            self.logger.setLevel(logging.WARNING)
+
     def _displayRow(self, row):
         """Display details for row"""
         results = []
@@ -95,6 +120,31 @@ def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, d
 
         return dfResult
 
+    def _get_dataframe_describe_stats(self, df):
+        """ Get summary statistics for dataframe handling renaming of summary field if necessary"""
+        print("schema", df.schema)
+
+        src_fields = [fld.name for fld in df.schema.fields]
+        print("src_fields", src_fields)
+        renamed_summary = False
+
+        # get summary statistics handling the case where a field named 'summary' exists
+        # if the `summary` field name exists, we'll rename it to avoid a conflict
+        if SUMMARY_FIELD_NAME in src_fields:
+            renamed_summary = True
+            df = df.withColumnRenamed(SUMMARY_FIELD_NAME, SUMMARY_FIELD_NAME_RENAMED)
+
+        # The dataframe describe method produces a field named `summary`. We'll rename this to avoid conflict with
+        # any natural fields using the same name.
+        summary_df = df.describe().withColumnRenamed(SUMMARY_FIELD_NAME, DATA_SUMMARY_FIELD_NAME)
+
+        # if we renamed a field called `summary` in the data, we'll rename it back.
+        # The data summary field produced by the describe method has already been renamed so there will be no conflict.
+        if renamed_summary:
+            summary_df = summary_df.withColumnRenamed(SUMMARY_FIELD_NAME_RENAMED, SUMMARY_FIELD_NAME)
+
+        return summary_df
+
     def summarizeToDF(self):
         """ Generate summary analysis of data set as dataframe
 
@@ -154,11 +204,12 @@ def summarizeToDF(self):
                                                dfData=self._df,
                                                dfSummary=dfDataSummary)
 
-        descriptionDf = self._df.describe().where("summary in ('mean', 'stddev')")
+        descriptionDf = (self._get_dataframe_describe_stats(self._df)
+                         .where(f"{DATA_SUMMARY_FIELD_NAME} in ('mean', 'stddev')"))
         describeData = descriptionDf.collect()
 
         for row in describeData:
-            measure = row['summary']
+            measure = row[DATA_SUMMARY_FIELD_NAME]
 
             values = {k[0]: '' for k in dtypes}
 
@@ -401,7 +452,12 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):
 
         """
         assert self._df is not None
-        assert type(self._df) is ssql.DataFrame, "sourceDf must be a valid Pyspark dataframe"
+
+        if not isinstance(self._df, ssql.DataFrame):
+            self.logger.warning(strip_margins(
+                """The parameter `sourceDf` should be a valid Pyspark dataframe.
+                   |Note this warning may be a false positive when using a remote connection to a Spark cluster""",
+                '|'))
 
         if self._dataSummary is None:
             df_summary = self.summarizeToDF()
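
The rename-around-describe technique added in `_get_dataframe_describe_stats` can be shown in isolation. This is a minimal sketch, not part of the commit, using plain PySpark and the same constant values introduced above:

    from pyspark.sql import SparkSession

    SUMMARY_FIELD_NAME = "summary"
    SUMMARY_FIELD_NAME_RENAMED = "__summary__"
    DATA_SUMMARY_FIELD_NAME = "__data_summary__"

    spark = SparkSession.builder.getOrCreate()

    # a source dataframe that already contains a column named `summary`
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "summary"])

    # move the natural `summary` column out of the way, because describe()
    # emits its own `summary` column (count, mean, stddev, min, max)
    src = df.withColumnRenamed(SUMMARY_FIELD_NAME, SUMMARY_FIELD_NAME_RENAMED)

    # rename the `summary` column produced by describe() to a reserved name ...
    stats = src.describe().withColumnRenamed(SUMMARY_FIELD_NAME, DATA_SUMMARY_FIELD_NAME)

    # ... then restore the original column name; no conflict remains
    stats = stats.withColumnRenamed(SUMMARY_FIELD_NAME_RENAMED, SUMMARY_FIELD_NAME)
    stats.show()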

dbldatagen/utils.py (-2)

@@ -27,7 +27,6 @@ def deprecated(message=""):
     """
 
     # create closure around function that follows use of the decorator
-
     def deprecated_decorator(func):
         @functools.wraps(func)
         def deprecated_func(*args, **kwargs):

@@ -290,7 +289,6 @@ def split_list_matching_condition(lst, cond):
        x = ['id', 'city_name', 'id', 'city_id', 'city_pop', 'id', 'city_id', 'city_pop','city_id', 'city_pop','id']
        splitListOnCondition(x, lambda el: el == 'id')
 
-
        Result:
        `[['id'], ['city_name'], ['id'], ['city_id', 'city_pop'],
        ['id'], ['city_id', 'city_pop', 'city_id', 'city_pop'], ['id']]`

docs/source/conf.py (+1 / -1)

@@ -28,7 +28,7 @@
 author = 'Databricks Inc'
 
 # The full version, including alpha/beta/rc tags
-release = "0.3.6"  # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
+release = "0.3.6post1"  # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
 
 
 # -- General configuration ---------------------------------------------------

docs/source/generating_json_data.rst (+24)

@@ -7,6 +7,8 @@ Generating JSON and Structured Column Data
 This section explores generating JSON and structured column data. By structured columns,
 we mean columns that are some combination of `struct`, `array` and `map` of other types.
 
+*Note that some of the examples are code fragments for illustration purposes only.*
+
 Generating JSON data
 --------------------
 There are several methods for generating JSON data:

@@ -25,6 +27,7 @@ The following example illustrates the basic technique for generating JSON data f
 
    import dbldatagen as dg
 
+   device_population = 100000
 
    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']

@@ -106,6 +109,7 @@ Note that in the current release, the `expr` attribute will override other colum
 
    import dbldatagen as dg
 
+   device_population = 100000
 
    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']

@@ -221,6 +225,7 @@ functions such as `named_struct` and `to_json`.
 
    import dbldatagen as dg
 
+   device_population = 100000
 
    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']

@@ -341,6 +346,25 @@ populated.
 
 The following example illustrates this:
 
+.. code-block:: python
+
+   import dbldatagen as dg
+
+   column_count = 10
+   data_rows = 10 * 1000
+   df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=data_rows)
+              .withIdOutput()
+              .withColumn("r", FloatType(), expr="floor(rand() * 350) * (86400 + 3600)",
+                          numColumns=column_count, structType="array")
+              .withColumn("code1", "integer", minValue=100, maxValue=200)
+              .withColumn("code2", "integer", minValue=0, maxValue=10)
+              .withColumn("code3", "string", values=['one', 'two', 'three'])
+              .withColumn("code4", "string", values=['one', 'two', 'three'])
+              .withColumn("code5", dg.INFER_DATATYPE, expr="current_date()")
+              .withColumn("code6", dg.INFER_DATATYPE, expr="code1 + code2")
+              .withColumn("code7", dg.INFER_DATATYPE, expr="concat(code3, code4)")
+              )
+
 
 Using multi feature columns to generate arrays
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
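
The note added at the top of that page flags the examples as fragments. A hypothetical completion of the fragment above, showing roughly what would be needed to run it (the `FloatType` import and the `build()` call are illustrative additions, not part of the documented fragment; `spark` is assumed to be an active SparkSession):

    from pyspark.sql.types import FloatType  # used by the "r" column definition

    df = df_spec.build()   # materialize the generated rows
    df.printSchema()
    df.show(5)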

python/.bumpversion.cfg (+1 / -1)

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.6
+current_version = 0.3.6post1
 commit = False
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+){0,1}(?P<release>\D*)(?P<build>\d*)
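
As a quick sanity check (not part of the commit), the existing `parse` pattern does split the new version string into its components; a minimal sketch using Python's `re` module:

    import re

    pattern = r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+){0,1}(?P<release>\D*)(?P<build>\d*)"
    parts = re.match(pattern, "0.3.6post1").groupdict()
    # parts == {'major': '0', 'minor': '3', 'patch': '6', 'release': 'post', 'build': '1'}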

setup.py (+1 / -1)

@@ -31,7 +31,7 @@
 
 setuptools.setup(
     name="dbldatagen",
-    version="0.3.6",
+    version="0.3.6post1",
     author="Ronan Stokes, Databricks",
     description="Databricks Labs - PySpark Synthetic Data Generator",
     long_description=long_description,

tests/test_generation_from_data.py (+6)

@@ -107,3 +107,9 @@ def test_summarize_to_df(self, generation_spec, testLogger):
         df = analyzer.summarizeToDF()
 
         df.show()
+
+    def test_df_containing_summary(self):
+        df = spark.range(10).withColumnRenamed("id", "summary")
+        summary_df = dg.DataAnalyzer(sparkSession=spark, df=df).summarizeToDF()
+
+        assert summary_df.count() == 10
