diff --git a/README.md b/README.md
index de0dd83..073b25b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,9 @@
 # gbq-schema_to_json-schema
 
 Python library to convert google bigquery schema to jsonschema
+
+## Test
+
+```bash
+pytest -W ignore -vv .
+```
diff --git a/gbqschema_converter/__init__.py b/gbqschema_converter/__init__.py
new file mode 100644
index 0000000..bca37eb
--- /dev/null
+++ b/gbqschema_converter/__init__.py
@@ -0,0 +1,13 @@
+# Dmitry Kisler © 2020
+# www.dkisler.com
+
+r"""
+Author: Dmitry Kisler
+Email: admin@dkisler.com
+Objective: To convert Google Big Query table schema to Json Schema
+References:
+- https://cloud.google.com/bigquery/docs/schemas#creating_a_json_schema_file
+- https://json-schema.org/
+"""
+__version__ = "1.0"
+__all__ = ('__version__', 'gbqschema_to_jsonschema', 'jsonschema_to_gbqschema')
diff --git a/gbqschema_converter/gbqschema_to_jsonschema/convert.py b/gbqschema_converter/gbqschema_to_jsonschema/convert.py
new file mode 100644
index 0000000..c34799f
--- /dev/null
+++ b/gbqschema_converter/gbqschema_to_jsonschema/convert.py
@@ -0,0 +1,164 @@
+# Dmitry Kisler © 2020
+# www.dkisler.com
+
+from typing import Union, Tuple, List
+from collections import namedtuple
+from copy import deepcopy
+from google.cloud import bigquery
+import fastjsonschema
+
+
+# Json schema used to validate the input Google Big Query schema (JSON representation).
+gbq_schema = {
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "required": [
+            "name",
+            "type",
+            "mode"
+        ],
+        "properties": {
+            "description": {
+                "type": "string"
+            },
+            "name": {
+                "type": "string",
+                "examples": [
+                    "att1"
+                ]
+            },
+            "type": {
+                "type": "string",
+                "enum": [
+                    "INT",
+                    "INTEGER",
+                    "INT64",
+                    "FLOAT",
+                    "FLOAT64",
+                    "NUMERIC",
+                    "BOOL",
+                    "BOOLEAN",
+                    "STRING",
+                    "BYTES",
+                    "DATE",
+                    "DATETIME",
+                    "TIME",
+                    "TIMESTAMP"
+                ]
+            },
+            "mode": {
+                "type": "string",
+                "enum": [
+                    "REQUIRED",
+                    "NULLABLE"
+                ]
+            }
+        },
+        "additionalProperties": False,
+    },
+}
+
+validate = fastjsonschema.compile(gbq_schema)
+
+# Skeleton of the output json schema; it must be deep-copied before being filled in.
+TEMPLATE = {
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "array",
+    "items": {
+        "$ref": "#/definitions/element"
+    },
+    "definitions": {
+        "element": {
+            "type": "object",
+            "properties": {
+
+            },
+            "additionalProperties": False,
+            "required": [
+            ],
+        },
+    },
+}
+
+MapTypes = namedtuple("map_types",
+                      gbq_schema['items']['properties']['type']['enum'])
+
+# Json schema fragment for every supported Google Big Query type.
+map_types = MapTypes(
+    INT={"type": "integer"},
+    INTEGER={"type": "integer"},
+    INT64={"type": "integer"},
+    FLOAT={"type": "number"},
+    FLOAT64={"type": "number"},
+    NUMERIC={"type": "number"},
+    BOOL={"type": "boolean"},
+    BOOLEAN={"type": "boolean"},
+    STRING={"type": "string"},
+    BYTES={"type": "string"},
+    DATE={"type": "string", "format": "date"},
+    DATETIME={"type": "string", "format": "date-time"},
+    TIME={"type": "string", "format": "time"},
+    TIMESTAMP={"type": "string", "format": "date-time"}
+)
+
+
+def representation_json(gbq_schema: List[dict],
+                        additional_properties: bool = False) -> dict:
+    """Function to convert Google Big Query schema in JSON representation to json schema.
+
+    Args:
+        gbq_schema: Bigquery schema, JSON representation
+            read https://cloud.google.com/bigquery/docs/schemas#creating_a_json_schema_file
+            for details.
+        additional_properties: Json schema should contain "additionalProperties".
+
+    Returns:
+        Json schema as dict.
+
+    Raises:
+        fastjsonschema.JsonSchemaException: Error occurred if input Google Big Query schema is invalid.
+    """
+    validate(gbq_schema)
+
+    # Deep copy: a shallow copy would share the nested "properties"/"required"
+    # containers with TEMPLATE, so attributes would leak between calls.
+    output = deepcopy(TEMPLATE)
+    element_schema = output['definitions']['element']
+
+    for element in gbq_schema:
+        key = element['name']
+
+        # Copy the type definition so the shared map_types entry is never mutated.
+        attribute = dict(getattr(map_types, element['type']))
+
+        if 'description' in element:
+            attribute['description'] = element['description']
+
+        element_schema['properties'][key] = attribute
+
+        if element['mode'] == "REQUIRED":
+            element_schema['required'].append(key)
+
+    element_schema['additionalProperties'] = additional_properties
+
+    return output
+
+
+def representation_google_sdk(gbq_schema: List[bigquery.SchemaField],
+                              restrictive: bool = False) -> dict:
+    """Function to convert Google Big Query schema in Google SDK representation to json schema.
+
+    Not implemented yet: the function is a placeholder and returns None.
+
+    Args:
+        gbq_schema: bigquery schema, SDK representation
+            read https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.schema.SchemaField.html
+            for details.
+        restrictive: not used yet.
+
+    Returns:
+        json schema as dict.
+    """
+    pass
diff --git a/gbqschema_converter/jsonschema_to_gbqschema/convert.py b/gbqschema_converter/jsonschema_to_gbqschema/convert.py
new file mode 100644
index 0000000..9c83893
--- /dev/null
+++ b/gbqschema_converter/jsonschema_to_gbqschema/convert.py
@@ -0,0 +1,44 @@
+# Dmitry Kisler © 2020
+# www.dkisler.com
+
+from typing import Union, Tuple, List
+from collections import namedtuple
+from google.cloud import bigquery
+import fastjsonschema
+
+
+def representation_json(json_schema: dict) -> dict:
+    """Function to convert json schema to Google Big Query schema in JSON representation.
+
+    The conversion itself is not implemented yet: the input is only validated
+    and None is returned.
+
+    Args:
+        json_schema: Json schema
+            read https://json-schema.org/
+            for details.
+
+    Returns:
+        Google Big Query schema, JSON representation.
+
+    Raises:
+        fastjsonschema.JsonSchemaDefinitionException: Error occurred if input json schema is invalid.
+    """
+    # Compilation is used for validation only; the compiled validator is discarded.
+    fastjsonschema.compile(json_schema)
+
+
+def representation_google_sdk(json_schema: dict) -> List[bigquery.SchemaField]:
+    """Function to convert json schema to Google Big Query schema in Google SDK representation.
+
+    Not implemented yet: the function is a placeholder and returns None.
+
+    Args:
+        json_schema: json schema
+            read https://json-schema.org/
+            for details.
+
+    Returns:
+        List of SchemaField objects.
+    """
+    pass
diff --git a/gbqschema_converter/tests/test_gbqschema_to_jsonschema.py b/gbqschema_converter/tests/test_gbqschema_to_jsonschema.py
new file mode 100644
index 0000000..8b14449
--- /dev/null
+++ b/gbqschema_converter/tests/test_gbqschema_to_jsonschema.py
@@ -0,0 +1,281 @@
+import os
+import pytest
+import importlib.util
+from types import ModuleType
+import fastjsonschema
+
+
+DIR = os.path.dirname(os.path.abspath(__file__))
+MODULE = "convert"
+
+FUNCTIONS = {'representation_json', 'representation_google_sdk'}
+
+
+def load_module(module_name: str) -> ModuleType:
+    """Function to load the module.
+
+    Args:
+        module_name: module name
+
+    Returns:
+        module object
+    """
+    file_path = f"{DIR}/../gbqschema_to_jsonschema/{module_name}.py"
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def test_module_exists():
+    load_module(MODULE)
+
+
+module = load_module(MODULE)
+
+
+def test_module_miss_functions() -> None:
+    missing = FUNCTIONS.difference(set(dir(module)))
+    assert not missing, f"""Function(s) '{"', '".join(missing)}' is(are) missing."""
+    return
+
+
+def test_representation_json_validator() -> None:
+    schema_in = [
+        {
+            "description": "Att 1",
+            "name": "att_01",
+            "type": "INT64",
+            "mode": "NULLABLE"
+        },
+        {
+            "name": "att_02",
+            "type": "FFA",
+            "mode": "REQUIRED"
+        },
+    ]
+
+    with pytest.raises(fastjsonschema.JsonSchemaException) as ex:
+        module.representation_json(schema_in)
+    assert "data[1].type must be one of" in str(ex.value),\
+        "Input validation doesn't work"
+
+    schema_in = [
+        {
+            "description1": "Att 1",
+            "name": "att_01",
+            "type": "INT64",
+            "mode": "NULLABLE"
+        },
+    ]
+
+    with pytest.raises(fastjsonschema.JsonSchemaException) as ex:
+        module.representation_json(schema_in)
+    assert "data[0] must contain only specified properties" in str(ex.value),\
+        "Input validation doesn't work"
+
+    schema_in = [
+        {
+            "name": "att_01",
+            "type": "INT64",
+            "mode": "NULLABLE1"
+        },
+    ]
+
+    with pytest.raises(fastjsonschema.JsonSchemaException) as ex:
+        module.representation_json(schema_in)
+    assert "data[0].mode must be one of" in str(ex.value),\
+        "Input validation doesn't work"
+
+    return
+
+
+def test_representation_json_conversion() -> None:
+    schema_in = [
+        {
+            "description": "Att 1",
+            "name": "att_01",
+            "type": "INT64",
+            "mode": "NULLABLE"
+        },
+        {
+            "description": "Att 2",
+            "name": "att_02",
+            "type": "FLOAT64",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_03",
+            "type": "NUMERIC",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_04",
+            "type": "STRING",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_05",
+            "type": "BOOL",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_06",
+            "type": "BOOLEAN",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_07",
+            "type": "STRING",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_08",
+            "type": "DATE",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_09",
+            "type": "DATETIME",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_10",
+            "type": "TIMESTAMP",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_11",
+            "type": "TIME",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_12",
+            "type": "INT",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_13",
+            "type": "INTEGER",
+            "mode": "REQUIRED"
+        },
+        {
+            "name": "att_14",
+            "type": "FLOAT",
+            "mode": "REQUIRED"
+        },
+    ]
+
+    schema_out = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "type": "array",
+        "items": {
+            "$ref": "#/definitions/element"
+        },
+        "definitions": {
+            "element": {
+                "type": "object",
+                "properties": {
+                    "att_01": {
+                        "type": "integer",
+                        "description": "Att 1"
+                    },
+                    "att_02": {
+                        "type": "number",
+                        "description": "Att 2"
+                    },
+                    "att_03": {
+                        "type": "number"
+                    },
+                    "att_04": {
+                        "type": "string"
+                    },
+                    "att_05": {
+                        "type": "boolean"
+                    },
+                    "att_06": {
+                        "type": "boolean"
+                    },
+                    "att_07": {
+                        "type": "string"
+                    },
+                    "att_08": {
+                        "type": "string",
+                        "format": "date"
+                    },
+                    "att_09": {
+                        "type": "string",
+                        "format": "date-time"
+                    },
+                    "att_10": {
+                        "type": "string",
+                        "format": "date-time"
+                    },
+                    "att_11": {
+                        "type": "string",
+                        "format": "time"
+                    },
+                    "att_12": {
+                        "type": "integer"
+                    },
+                    "att_13": {
+                        "type": "integer"
+                    },
+                    "att_14": {
+                        "type": "number"
+                    }
+                },
+                "additionalProperties": True,
+                "required": [
+                    "att_02",
+                    "att_03",
+                    "att_04",
+                    "att_05",
+                    "att_06",
+                    "att_07",
+                    "att_08",
+                    "att_09",
+                    "att_10",
+                    "att_11",
+                    "att_12",
+                    "att_13",
+                    "att_14"
+                ]
+            }
+        }
+    }
+
+    schema_convert = module.representation_json(schema_in, True)
+
+    assert schema_convert == schema_out,\
+        "Convertion doesn't work"
+
+    return
+
+
+def test_representation_json_no_shared_state() -> None:
+    schema_in = [
+        {
+            "description": "Att 1",
+            "name": "att_01",
+            "type": "STRING",
+            "mode": "REQUIRED"
+        },
+    ]
+    schema_in_next = [
+        {
+            "name": "att_02",
+            "type": "STRING",
+            "mode": "NULLABLE"
+        },
+    ]
+
+    module.representation_json(schema_in)
+    element = module.representation_json(schema_in_next)['definitions']['element']
+
+    assert element['properties'] == {"att_02": {"type": "string"}},\
+        "Conversion results leak between calls"
+    assert element['required'] == [],\
+        "Conversion results leak between calls"
+
+    return
diff --git a/gbqschema_converter/tests/test_jsonschema_to_gbqschema.py b/gbqschema_converter/tests/test_jsonschema_to_gbqschema.py
new file mode 100644
index 0000000..1df2255
--- /dev/null
+++ b/gbqschema_converter/tests/test_jsonschema_to_gbqschema.py
@@ -0,0 +1,87 @@
+import os
+import pytest
+import importlib.util
+from types import ModuleType
+import fastjsonschema
+
+
+DIR = os.path.dirname(os.path.abspath(__file__))
+MODULE = "convert"
+
+FUNCTIONS = {'representation_json', 'representation_google_sdk'}
+
+
+def load_module(module_name: str) -> ModuleType:
+    """Function to load the module.
+
+    Args:
+        module_name: module name
+
+    Returns:
+        module object
+    """
+    file_path = f"{DIR}/../jsonschema_to_gbqschema/{module_name}.py"
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def test_module_exists():
+    load_module(MODULE)
+
+
+module = load_module(MODULE)
+
+
+def test_module_miss_functions() -> None:
+    missing = FUNCTIONS.difference(set(dir(module)))
+    assert not missing, f"""Function(s) '{"', '".join(missing)}' is(are) missing."""
+    return
+
+
+def test_representation_json_validator() -> None:
+    schema_in = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "type": "array1",
+        "items": {
+            "$ref": "#/definitions/element"
+        },
+        "definitions": {
+            "element": {
+                "type": "object",
+                "properties": {
+                    "att_01": {
+                        "type": "integer",
+                        "description": "Att 1"
+                    },
+                }
+            }
+        }
+    }
+
+    with pytest.raises(fastjsonschema.JsonSchemaDefinitionException) as ex:
+        module.representation_json(schema_in)
+    assert "Unknown type: 'array1'" in str(ex.value),\
+        "Input validation doesn't work"
+
+    schema_in = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "type": "array",
+        "items": {
+            "$ref": "#/definitions/element"
+        },
+        "definitions": {
+            "element": {
+                "type": "objects",
+                "properties": {
+                    "att_01": {"type": "integer"}
+                }
+            }
+        }
+    }
+
+    with pytest.raises(fastjsonschema.JsonSchemaDefinitionException) as ex:
+        module.representation_json(schema_in)
+    assert "Unknown type: 'objects'" in str(ex.value),\
+        "Input validation doesn't work"
diff --git a/requirements.txt b/requirements.txt
index 7cca548..119acfd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-fastjsonschema >= 2.14.4
\ No newline at end of file
+fastjsonschema >= 2.14.4
+google-cloud-bigquery >= 1.24.0
diff --git a/setup.py b/setup.py
index 80cf07c..220e75c 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 # www.dkisler.com
 
 import os
-from setuptools import setup
+from setuptools import setup, find_namespace_packages
 
 
 DIR = os.path.abspath(os.path.dirname(__file__))
@@ -12,7 +12,7 @@
 
 
 setup(
-    name='gbqschema_to_jsonschema',
+    name='gbqschema_converter',
     version='1.0',
     description="Library to convert Google BigQuery Schema to Json Schema",
     classifiers=[
@@ -25,7 +25,9 @@
     author="Dmitry Kisler",
     author_email=["admin@dkisler.com"],
     license='MIT',
-    packages=["gbqschema_to_jsonschema"],
+    packages=("gbqschema_converter",
+              "gbqschema_converter.gbqschema_to_jsonschema",
+              "gbqschema_converter.jsonschema_to_gbqschema"),
     install_requires=requirements,
     include_package_data=True,
     zip_safe=False