From 344631ac1033779329f0aa339e128826019e11b0 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Fri, 17 Mar 2023 11:56:38 -0700 Subject: [PATCH 01/12] wip --- dbldatagen/text_generators.py | 14 +- tests/test_text_templates.py | 276 ++++++++++++++++++++++++++++++++++ 2 files changed, 288 insertions(+), 2 deletions(-) create mode 100644 tests/test_text_templates.py diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 96c03a4e..4def1f70 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +import logging #: list of hex digits for template generation _HEX_LOWER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'] @@ -243,9 +244,18 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): self._np_letters_all = np.array(_LETTERS_ALL) self._lenWords = len(self._wordList) - # get the template metadata + # get the template metadata - this will be a list of metadata entries, one for each template + # for each template, metadata will be a tuple of the number of placeholders followed by a list of random bounds + # to be computed when replacing non-static placeholders template_info = [self._prepareTemplateStrings(template, escapeSpecialMeaning=escapeSpecialChars) for template in self._templates] + + logger = logging.getLogger(__name__) + + #if logger.isEnabledFor(logging.DEBUG): + for ix, ti in template_info: + logger.info(f"templates - {ix} {ti}") + self._max_placeholders = max([ x[0] for x in template_info]) self._max_rnds_needed = max([ len(x[1]) for x in template_info]) self._placeholders_needed = [ x[0] for x in template_info] @@ -637,7 +647,7 @@ def pandasGenerateText(self, v): for m in masked_matrices: np.ma.harden_mask(m) - # expand values into placeholders + # expand values into placeholders without affecting masked values #self._applyTemplateStringsForTemplate(v.to_numpy(dtype=np.object_), #masked_base_values, 
self._applyTemplateStringsForTemplate(v, #masked_base_values, diff --git a/tests/test_text_templates.py b/tests/test_text_templates.py new file mode 100644 index 00000000..0968724b --- /dev/null +++ b/tests/test_text_templates.py @@ -0,0 +1,276 @@ +import re +import pytest +import pandas as pd +import numpy as np + +import pyspark.sql.functions as F +from pyspark.sql.types import BooleanType, DateType +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType + +import dbldatagen as dg +from dbldatagen import TemplateGenerator, TextGenerator + +# add the following if using pandas udfs +# .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1000") \ + + +spark = dg.SparkSingleton.getLocalInstance("unit tests") + +spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "20000") +spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") + + +# Test manipulation and generation of test data for a large schema +class TestTextTemplates: + testDataSpec = None + row_count = 100000 + partitions_requested = 4 + + @pytest.mark.parametrize("template_provided, escapeSpecial, useTemplateObject", + [ (r'\\w \\w|\\w A. \\w', False, False), + (r'\w \w|\w A. \w', False, False), + (r'\w A. \w', False, False), + (r'\\w \\w|\\w a. \\w', False, False), + (r'\\w \\w|\\w k. \\w', False, False), + (r'\\w \\w|\\w K. \\w', False, False), + (r'\\w \\w|\\w x. \\w', False, False), + (r'\\w \\w|\\w X. \\w', False, False), + (r'\\w \\w|\\w A. \\w', False, True), + (r'\\w \\w|\\w a. \\w', False, True), + (r'\\w \\w|\\w k. \\w', False, True), + (r'\\w \\w|\\w K. \\w', False, True), + (r'\\w \\w|\\w x. \\w', False, True), + (r'\\w \\w|\\w X. \\w', False, True), + (r'\\w \\w|\\w A. \\w', True, True), + (r'\w \w|\w \A. \w', True, True), + (r'\\w \\w|\\w a. \\w', True, True), + (r'\w \w|\w \a. \w', True, True), + (r'\\w \\w|\\w k. \\w', True, True), + (r'\\w \\w|\\w K. \\w', True, True), + (r'\\w \\w|\\w x. \\w', True, True), + (r'\\w \\w|\\w X. 
\\w', True, True), + (r'\\w a. \\w|\\w \\w', False, False), + (r'\\w k. \\w|\\w \\w', False, False), + (r'\\w a. \\w', False, True), + (r'\\w k. \\w', False, True), + (r'\w \a. \w', True, True), + (r'\w \k. \w', True, True), + + ]) + + def test_rnd_compute(self, template_provided, escapeSpecial, useTemplateObject): + template1 = TemplateGenerator(template_provided, escapeSpecialChars=escapeSpecial) + print(f"template [{template_provided}]") + + arr = np.arange(100) + + template_choices, template_rnd_bounds, template_rnds = template1._prepare_random_bounds(arr) + + assert len(template_choices) == len(template_rnds) + assert len(template_choices) == len(template_rnd_bounds) + + for ix in range(len(template_choices)): + bounds = template_rnd_bounds[ix] + rnds = template_rnds[ix] + + assert len(bounds) == len(rnds) + + for iy in range(len(bounds)): + assert bounds[iy] == -1 or (rnds[iy] < bounds[iy]) + + @pytest.mark.parametrize("template_provided, escapeSpecial, useTemplateObject", + [ (r'\\w \\w|\\w A. \\w|\w n n \w', False, False), + (r'\w \w|\w A. \w', False, False), + (r'\w A. \w', False, False), + (r'\\w \\w|\\w a. \\w', False, False), + (r'\\w \\w|\\w k. \\w', False, False), + (r'\\w \\w|\\w K. \\w', False, False), + (r'\\w \\w|\\w x. \\w', False, False), + (r'\\w \\w|\\w X. \\w', False, False), + (r'\\w \\w|\\w A. \\w', False, True), + (r'\\w \\w|\\w a. \\w', False, True), + (r'\\w \\w|\\w k. \\w', False, True), + (r'\\w \\w|\\w K. \\w', False, True), + (r'\\w \\w|\\w x. \\w', False, True), + (r'\\w \\w|\\w X. \\w', False, True), + (r'\\w \\w|\\w A. \\w', True, True), + (r'\w \w|\w \A. \w', True, True), + (r'\\w \\w|\\w a. \\w', True, True), + (r'\w \w|\w \a. \w', True, True), + (r'\\w \\w|\\w k. \\w', True, True), + (r'\\w \\w|\\w K. \\w', True, True), + (r'\\w \\w|\\w x. \\w', True, True), + (r'\\w \\w|\\w X. \\w', True, True), + (r'\\w a. \\w|\\w \\w', False, False), + (r'\\w k. \\w|\\w \\w', False, False), + (r'\\w a. \\w', False, True), + (r'\\w k. 
\\w', False, True), + (r'\w \a. \w', True, True), + (r'\w \k. \w', True, True), + + ]) + + def test_masking(self, template_provided, escapeSpecial, useTemplateObject): + template1 = TemplateGenerator(template_provided, escapeSpecialChars=escapeSpecial) + print(f"template [{template_provided}]") + + arr = np.arange(100) + + template_choices, template_rnd_bounds, template_rnds = template1._prepare_random_bounds(arr) + + assert len(template_choices) == len(template_rnds) + assert len(template_choices) == len(template_rnd_bounds) + + arr2 = np.full(shape=(arr.shape[0], 2), fill_value="testing 1 2 3 4 ", dtype=np.object_) + print(template_choices) + + template_choices_t = template_choices.T + + masked_placeholders = np.ma.MaskedArray(arr2, mask=False) + for x in range(len(template1._templates)-1): + masked_placeholders[template_choices_t != x] = np.ma.masked + np.ma.harden_mask(masked_placeholders) + + masked_placeholders[:, 0] = template1.templates[x] + masked_placeholders[:, 1] = template1.templates[x] + + np.ma.soften_mask(masked_placeholders) + masked_placeholders.mask = False + + + print(arr2) + + + + + @pytest.mark.parametrize("template_provided, escapeSpecial, useTemplateObject", + [ (r'\\w \\w|\\w A. \\w', False, False), + (r'\w \w|\w A. \w', False, False), + (r'\w A. \w', False, False), + (r'\\w \\w|\\w a. \\w', False, False), + (r'\\w \\w|\\w k. \\w', False, False), + (r'\\w \\w|\\w K. \\w', False, False), + (r'\\w \\w|\\w x. \\w', False, False), + (r'\\w \\w|\\w X. \\w', False, False), + (r'\\w \\w|\\w A. \\w', False, True), + (r'\\w \\w|\\w a. \\w', False, True), + (r'\\w \\w|\\w k. \\w', False, True), + (r'\\w \\w|\\w K. \\w', False, True), + (r'\\w \\w|\\w x. \\w', False, True), + (r'\\w \\w|\\w X. \\w', False, True), + (r'\\w \\w|\\w A. \\w', True, True), + (r'\w \w|\w \A. \w', True, True), + (r'\\w \\w|\\w a. \\w', True, True), + (r'\w \w|\w \a. \w', True, True), + (r'\\w \\w|\\w k. \\w', True, True), + (r'\\w \\w|\\w K. 
\\w', True, True), + (r'\\w \\w|\\w x. \\w', True, True), + (r'\\w \\w|\\w X. \\w', True, True), + (r'\\w a. \\w|\\w \\w', False, False), + (r'\\w k. \\w|\\w \\w', False, False), + (r'\\w a. \\w', False, True), + (r'\\w k. \\w', False, True), + (r'\w \a. \w', True, True), + (r'\w \k. \w', True, True), + + ]) + + def test_text_templates1(self, template_provided, escapeSpecial, useTemplateObject): + template1 = TemplateGenerator(template_provided, escapeSpecialChars=escapeSpecial) + print(f"template [{template_provided}]") + + print("max_placeholders", template1._max_placeholders ) + print("max_rnds", template1._max_rnds_needed) + print("placeholders", template1._placeholders_needed ) + print("bounds", template1._template_rnd_bounds) + + print("templates", template1.templates) + + arr = np.arange(100) + + template_choices, template_rnd_bounds, template_rnds = template1._prepare_random_bounds(arr) + + print("choices", template_choices) + print("rnd bounds", template_rnd_bounds) + print("template_rnds", template_rnds) + + assert len(template_choices) == len(template_rnds) + assert len(template_choices) == len(template_rnd_bounds) + + for ix in range(len(template_choices)): + bounds = template_rnd_bounds[ix] + rnds = template_rnds[ix] + + assert len(bounds) == len(rnds) + + for iy in range(len(bounds)): + assert bounds[iy] == -1 or (rnds[iy] < bounds[iy]) + + + results = template1.pandasGenerateText(arr) + print(results) + + + + + + + @pytest.mark.parametrize("template_provided, escapeSpecial, useTemplateObject", + [ (r'\\w \\w|\\w A. \\w', False, False), + (r'\w \w|\w A. \w', False, False), + (r'\w A. \w', False, False), + (r'\\w \\w|\\w a. \\w', False, False), + (r'\\w \\w|\\w k. \\w', False, False), + (r'\\w \\w|\\w K. \\w', False, False), + (r'\\w \\w|\\w x. \\w', False, False), + (r'\\w \\w|\\w X. \\w', False, False), + (r'\\w \\w|\\w A. \\w', False, True), + (r'\\w \\w|\\w a. \\w', False, True), + (r'\\w \\w|\\w k. \\w', False, True), + (r'\\w \\w|\\w K. 
\\w', False, True), + (r'\\w \\w|\\w x. \\w', False, True), + (r'\\w \\w|\\w X. \\w', False, True), + (r'\\w \\w|\\w A. \\w', True, True), + (r'\w \w|\w \A. \w', True, True), + (r'\\w \\w|\\w a. \\w', True, True), + (r'\w \w|\w \a. \w', True, True), + (r'\\w \\w|\\w k. \\w', True, True), + (r'\\w \\w|\\w K. \\w', True, True), + (r'\\w \\w|\\w x. \\w', True, True), + (r'\\w \\w|\\w X. \\w', True, True), + (r'\\w a. \\w|\\w \\w', False, False), + (r'\\w k. \\w|\\w \\w', False, False), + (r'\\w a. \\w', False, True), + (r'\\w k. \\w', False, True), + (r'\w \a. \w', True, True), + (r'\w \k. \w', True, True), + (r'\w \w|\w \w \w|\w \n \w|\w \w \w \w', True, True), + (r'\w \n \w', True, True), + + ]) + + def test_text_templates2(self, template_provided, escapeSpecial, useTemplateObject): + import dbldatagen as dg + print(f"template [{template_provided}]") + + data_rows = 10 * 1000 + + uniqueCustomers = 10 * 1000 + + dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=4) + .withColumn("customer_id", "long", uniqueValues=uniqueCustomers) + ) + + if useTemplateObject or escapeSpecial: + template1 = TemplateGenerator(template_provided, escapeSpecialChars=escapeSpecial) + dataspec = dataspec.withColumn("name", percentNulls=0.01, text=template1) + else: + dataspec = dataspec.withColumn("name", percentNulls=0.01, template=template_provided) + + df1 = dataspec.build() + df1.show() + + count = df1.where("name is not null").count() + assert count > 0 + + From b6e1eb306be69900f7a5933a98da7ab6464a5f2a Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Fri, 17 Mar 2023 14:49:45 -0700 Subject: [PATCH 02/12] merged updates to master --- CHANGELOG.md | 5 ++ README.md | 38 +++++---- dbldatagen/_version.py | 3 +- dbldatagen/data_generator.py | 2 +- dbldatagen/spark_singleton.py | 3 +- tests/test_logging.py | 149 ++++++++++++++++++++++++++++++++++ 6 files changed, 183 insertions(+), 17 deletions(-) create mode 100644 tests/test_logging.py diff --git a/CHANGELOG.md 
b/CHANGELOG.md index b5450447..e8866f1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,11 @@ ## Change History All notable changes to the Databricks Labs Data Generator will be documented in this file. +### Unreleased + +#### Changed +* Fixed use of logger in _version.py and in spark_singleton.py + ### Version 0.3.2 #### Changed diff --git a/README.md b/README.md index 2e8ff2ab..a2ec0e10 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,10 @@ [![build](https://github.com/databrickslabs/dbldatagen/workflows/build/badge.svg?branch=master)](https://github.com/databrickslabs/dbldatagen/actions?query=workflow%3Abuild+branch%3Amaster) +[![PyPi package](https://img.shields.io/pypi/v/dbldatagen?color=green)](https://pypi.org/project/dbldatagen/) [![codecov](https://codecov.io/gh/databrickslabs/dbldatagen/branch/master/graph/badge.svg)](https://codecov.io/gh/databrickslabs/dbldatagen) -[![PyPi downloads](https://img.shields.io/pypi/dm/dbldatagen?label=PyPi%20Downloads)](https://pypi.org/project/dbldatagen/) +[![PyPi downloads](https://img.shields.io/pypi/dm/dbldatagen?label=PyPi%20Downloads)](https://pypistats.org/packages/dbldatagen) +