diff --git a/poetry.lock b/poetry.lock index 20f0b279..b2509cb4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -31,7 +31,7 @@ typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} [package.extras] doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1) ; python_version >= \"3.10\"", "uvloop (>=0.21) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\" and python_version < \"3.14\""] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"] trio = ["trio (>=0.26.1)"] [[package]] @@ -182,12 +182,12 @@ files = [ ] [package.extras] -benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] -cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] -dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "babel" @@ -202,7 +202,7 @@ files = [ ] [package.extras] -dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""] +dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"] [[package]] name = "beartype" @@ -217,10 +217,10 @@ files = [ ] [package.extras] -dev = ["autoapi (>=0.9.0)", "click", "coverage (>=5.5)", "equinox ; sys_platform == \"linux\"", "jax[cpu] ; sys_platform == \"linux\"", "jaxtyping ; sys_platform == \"linux\"", "langchain", "mypy (>=0.800) ; platform_python_implementation != \"PyPy\"", "numba ; python_version < \"3.13.0\"", "numpy ; sys_platform != \"darwin\" and platform_python_implementation != \"PyPy\"", "pandera", "pydata-sphinx-theme (<=0.7.2)", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx (>=4.2.0,<6.0.0)", "sphinx ; python_version >= \"3.8.0\"", "sphinxext-opengraph (>=0.7.5)", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)", "xarray"] +dev = ["autoapi (>=0.9.0)", "click", "coverage (>=5.5)", "equinox", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "numba", "numpy", "pandera", "pydata-sphinx-theme (<=0.7.2)", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)", "xarray"] doc-rtd = ["autoapi (>=0.9.0)", "pydata-sphinx-theme (<=0.7.2)", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)"] -test = ["click", "coverage (>=5.5)", "equinox ; sys_platform == \"linux\"", "jax[cpu] ; sys_platform == \"linux\"", "jaxtyping ; sys_platform == \"linux\"", "langchain", "mypy (>=0.800) ; platform_python_implementation != \"PyPy\"", "numba ; python_version < \"3.13.0\"", "numpy ; sys_platform != \"darwin\" and platform_python_implementation != \"PyPy\"", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx ; python_version >= \"3.8.0\"", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)", "xarray"] -test-tox = ["click", "equinox ; sys_platform == \"linux\"", "jax[cpu] ; sys_platform == \"linux\"", "jaxtyping ; sys_platform == \"linux\"", "langchain", "mypy (>=0.800) ; platform_python_implementation != \"PyPy\"", "numba ; python_version < \"3.13.0\"", "numpy ; sys_platform != \"darwin\" and platform_python_implementation != \"PyPy\"", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx ; python_version >= \"3.8.0\"", "typing-extensions (>=3.10.0.0)", "xarray"] +test = ["click", "coverage (>=5.5)", "equinox", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "numba", "numpy", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)", "xarray"] +test-tox = ["click", "equinox", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "numba", "numpy", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)", "xarray"] test-tox-coverage = ["coverage (>=5.5)"] [[package]] @@ -831,21 +831,6 @@ files = [ {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, ] -[[package]] -name = "elementpath" -version = "4.7.0" -description = "XPath 1.0/2.0/3.0/3.1 parsers and selectors for ElementTree and lxml" -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "elementpath-4.7.0-py3-none-any.whl", hash = "sha256:607804a1b4250ac448c1e2bfaec4ee1c980b0a07cfdb0d9057b57102038ed480"}, - {file = "elementpath-4.7.0.tar.gz", hash = "sha256:a2029dc8752fcfec49663d1ed1b412c6daf278c0c91938f50f63c4fe9ed1848e"}, -] - -[package.extras] -dev = ["Sphinx", "coverage", "flake8", "lxml", "lxml-stubs", "memory-profiler", "memray", "mypy", "tox", "xmlschema (>=3.3.2)"] - [[package]] name = "et-xmlfile" version = "2.0.0" @@ -871,7 +856,7 @@ files = [ ] [package.extras] -tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] [[package]] name = "fastjsonschema" @@ -964,7 +949,7 @@ files = [ [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] -typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] +typing = ["typing-extensions (>=4.12.2)"] [[package]] name = "fqdn" @@ -1162,7 +1147,7 @@ httpcore = "==1.*" idna = "*" [package.extras] -brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] +brotli = ["brotli", "brotlicffi"] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -1333,12 +1318,12 @@ files = [ zipp = ">=3.20" [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] [[package]] @@ -1354,7 +1339,7 @@ files = [ ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] @@ -1375,7 +1360,7 @@ files = [ [package.extras] docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["pygments", "pytest (>=6)", "pytest-black (>=0.3.7) ; platform_python_implementation != \"PyPy\"", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1) ; platform_python_implementation != \"PyPy\""] +testing = ["pygments", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [[package]] name = "iniconfig" @@ -1449,7 +1434,7 @@ traitlets = ">=5.13.0" [package.extras] all = ["ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole]", "ipython[test,test-extra]"] black = ["black"] -doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "tomli ; python_version < \"3.11\"", "typing_extensions"] +doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "tomli", "typing_extensions"] kernel = ["ipykernel"] matplotlib = ["matplotlib"] nbconvert = ["nbconvert"] @@ -1690,7 +1675,7 @@ traitlets = ">=5.3" [package.extras] docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko ; sys_platform == \"win32\"", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] +test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] [[package]] name = "jupyter-console" @@ -2079,7 +2064,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.5" -groups = ["main", "dev", "docs"] +groups = ["dev", "docs"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -2213,7 +2198,7 @@ tornado = ">=6.2.0" [package.extras] dev = ["hatch", "pre-commit"] docs = ["myst-parser", "nbsphinx", "pydata-sphinx-theme", "sphinx (>=1.3.6)", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["importlib-resources (>=5.0) ; python_version < \"3.10\"", "ipykernel", "jupyter-server[test] (>=2.4.0,<3)", "jupyterlab-server[test] (>=2.27.1,<3)", "nbval", "pytest (>=7.0)", "pytest-console-scripts", "pytest-timeout", "pytest-tornasync", "requests"] +test = ["importlib-resources (>=5.0)", "ipykernel", "jupyter-server[test] (>=2.4.0,<3)", "jupyterlab-server[test] (>=2.27.1,<3)", "nbval", "pytest (>=7.0)", "pytest-console-scripts", "pytest-timeout", "pytest-tornasync", "requests"] [[package]] name = "notebook-shim" @@ -2420,42 +2405,6 @@ sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-d test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] xml = ["lxml (>=4.9.2)"] -[[package]] -name = "pandera" -version = "0.22.1" -description = "A light-weight and flexible data validation and testing tool for statistical data objects." -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "pandera-0.22.1-py3-none-any.whl", hash = "sha256:2a35531b4b533ac83e606a6dcc3cd41561774ff3d872117228e931f22e72f330"}, - {file = "pandera-0.22.1.tar.gz", hash = "sha256:091ebc353383ba642e5a20ee0df763ed2059ab99cb4b2ac3e83f482de8493645"}, -] - -[package.dependencies] -numpy = ">=1.19.0" -packaging = ">=20.0" -pandas = ">=1.2.0" -polars = {version = ">=0.20.0", optional = true, markers = "extra == \"polars\""} -pydantic = "*" -typeguard = "*" -typing_inspect = ">=0.6.0" - -[package.extras] -all = ["black", "dask[dataframe]", "fastapi", "frictionless (<=4.40.8)", "geopandas", "hypothesis (>=6.92.7)", "modin", "pandas-stubs", "polars (>=0.20.0)", "pyspark[connect] (>=3.2.0)", "pyyaml (>=5.1)", "ray", "scipy", "shapely"] -dask = ["dask[dataframe]"] -fastapi = ["fastapi"] -geopandas = ["geopandas", "shapely"] -hypotheses = ["scipy"] -io = ["black", "frictionless (<=4.40.8)", "pyyaml (>=5.1)"] -modin = ["dask[dataframe]", "modin", "ray"] -modin-dask = ["dask[dataframe]", "modin"] -modin-ray = ["modin", "ray"] -mypy = ["pandas-stubs"] -polars = ["polars (>=0.20.0)"] -pyspark = ["pyspark[connect] (>=3.2.0)"] -strategies = ["hypothesis (>=6.92.7)"] - [[package]] name = "pandocfilters" version = "1.5.1" @@ -2632,7 +2581,7 @@ pyarrow = ["pyarrow (>=7.0.0)"] pydantic = ["pydantic"] sqlalchemy = ["polars[pandas]", "sqlalchemy"] style = ["great-tables (>=0.8.0)"] -timezone = ["tzdata ; platform_system == \"Windows\""] +timezone = ["tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] xlsxwriter = ["xlsxwriter"] @@ -2768,7 +2717,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] +timezone = ["tzdata"] [[package]] name = "pydantic-core" @@ -2947,7 +2896,7 @@ files = [ ] [package.extras] -dev = ["backports.zoneinfo ; python_version < \"3.9\"", "black", "build", "freezegun", "mdx_truly_sane_lists", "mike", "mkdocs", "mkdocs-awesome-pages-plugin", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-material (>=8.5)", "mkdocstrings[python]", "msgspec ; implementation_name != \"pypy\" and python_version < \"3.13\"", "msgspec-python313-pre ; implementation_name != \"pypy\" and python_version == \"3.13\"", "mypy", "orjson ; implementation_name != \"pypy\"", "pylint", "pytest", "tzdata", "validate-pyproject[all]"] +dev = ["backports.zoneinfo", "black", "build", "freezegun", "mdx_truly_sane_lists", "mike", "mkdocs", "mkdocs-awesome-pages-plugin", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-material (>=8.5)", "mkdocstrings[python]", "msgspec", "msgspec-python313-pre", "mypy", "orjson", "pylint", "pytest", "tzdata", "validate-pyproject[all]"] [[package]] name = "python-slugify" @@ -3510,9 +3459,9 @@ files = [ ] [package.extras] -nativelib = ["pyobjc-framework-Cocoa ; sys_platform == \"darwin\"", "pywin32 ; sys_platform == \"win32\""] -objc = ["pyobjc-framework-Cocoa ; sys_platform == \"darwin\""] -win32 = ["pywin32 ; sys_platform == \"win32\""] +nativelib = ["pyobjc-framework-Cocoa", "pywin32"] +objc = ["pyobjc-framework-Cocoa"] +win32 = ["pywin32"] [[package]] name = "setuptools" @@ -3527,13 +3476,13 @@ files = [ ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] -core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] +core = ["importlib_metadata (>=6)", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "shellingham" @@ -3861,25 +3810,6 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] -[[package]] -name = "typeguard" -version = "4.4.2" -description = "Run-time type checker for Python" -optional = false -python-versions = ">=3.9" -groups = ["main"] -files = [ - {file = "typeguard-4.4.2-py3-none-any.whl", hash = "sha256:77a78f11f09777aeae7fa08585f33b5f4ef0e7335af40005b0c422ed398ff48c"}, - {file = "typeguard-4.4.2.tar.gz", hash = "sha256:a6f1065813e32ef365bc3b3f503af8a96f9dd4e0033a02c28c4a4983de8c6c49"}, -] - -[package.dependencies] -typing_extensions = ">=4.10.0" - -[package.extras] -doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.3.0)"] -test = ["coverage[toml] (>=7)", "mypy (>=1.2.0) ; platform_python_implementation != \"PyPy\"", "pytest (>=7)"] - [[package]] name = "typer" version = "0.15.1" @@ -3922,22 +3852,6 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] -[[package]] -name = "typing-inspect" -version = "0.9.0" -description = "Runtime inspection utilities for typing module." -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, - {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, -] - -[package.dependencies] -mypy-extensions = ">=0.3.0" -typing-extensions = ">=3.7.4" - [[package]] name = "tzdata" version = "2025.1" @@ -3978,7 +3892,7 @@ files = [ ] [package.extras] -brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -4017,7 +3931,7 @@ platformdirs = ">=3.9.1,<5" [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [[package]] name = "watchdog" @@ -4168,26 +4082,6 @@ files = [ {file = "xlwt-1.3.0.tar.gz", hash = "sha256:c59912717a9b28f1a3c2a98fd60741014b06b043936dcecbc113eaaada156c88"}, ] -[[package]] -name = "xmlschema" -version = "3.4.3" -description = "An XML Schema validator and decoder" -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "xmlschema-3.4.3-py3-none-any.whl", hash = "sha256:eea4e5a1aac041b546ebe7b2eb68eb5eaebf5c5258e573cfc182375676b2e4e3"}, - {file = "xmlschema-3.4.3.tar.gz", hash = "sha256:0c638dac81c7d6c9da9a8d7544402c48cffe7ee0e13cc47fc0c18794d1395dfb"}, -] - -[package.dependencies] -elementpath = ">=4.4.0,<5.0.0" - -[package.extras] -codegen = ["elementpath (>=4.4.0,<5.0.0)", "jinja2"] -dev = ["Sphinx", "coverage", "elementpath (>=4.4.0,<5.0.0)", "flake8", "jinja2", "lxml", "lxml-stubs", "memory-profiler", "mypy", "sphinx-rtd-theme", "tox"] -docs = ["Sphinx", "elementpath (>=4.4.0,<5.0.0)", "jinja2", "sphinx-rtd-theme"] - [[package]] name = "zipp" version = "3.21.0" @@ -4201,14 +4095,14 @@ files = [ ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources ; python_version < \"3.9\"", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "ec1d9783f047d4c409c2fed3fa527b219d445183337968111a2fb4a272b12bf3" +content-hash = "460a05c892c36c33ccfd69c254484b392fe0ec852a292ea07e75e7a1c2952f88" diff --git a/pyproject.toml b/pyproject.toml index f16d7fe6..5f2926de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,8 +32,6 @@ requests = "^2.32.3" platformdirs = "^4.3.2" jsonschema = "^4.23.0" dacite = "^1.8.1" -xmlschema = "^3.4.3" -pandera = {extras = ["polars"], version = "^0.22.1"} [tool.poetry.group.test.dependencies] pytest = "^8.3.2" diff --git a/src/seedcase_sprout/core/sprout_checks/check_column_data_types.py b/src/seedcase_sprout/core/sprout_checks/check_column_data_types.py new file mode 100644 index 00000000..ab7fb284 --- /dev/null +++ b/src/seedcase_sprout/core/sprout_checks/check_column_data_types.py @@ -0,0 +1,216 @@ +import json + +import polars as pl + +from seedcase_sprout.core.map_data_types import FRICTIONLESS_TO_POLARS +from seedcase_sprout.core.properties import FieldType + +# https://datapackage.org/standard/table-schema/#boolean +BOOLEAN_VALUES = {"false", "False", "FALSE", "0", "true", "True", "TRUE", "1"} + + +def check_is_boolean(column_name: str) -> pl.Expr: + """Checks if the column contains only boolean values. + + Failed values are marked with False. + + Args: + column_name: The name of the column to check. + + Returns: + A Polars expression for checking the column. + """ + return pl.col(column_name).is_in(BOOLEAN_VALUES) + + +def check_is_type_by_conversion(column_name: str, data_type: FieldType) -> pl.Expr: + """Checks if the column contains only values of the given type. + + The check is done by attempting to convert (cast) the column to the + appropriate Polars data type. If it fails, the values are marked with + False. + + Args: + column_name: The name of the column to check. + data_type: The type of the column. + + Returns: + A Polars expression for checking the column. + """ + return ( + pl.col(column_name) + # Strict is false means map to None if it fails rather than give an error. + .cast(FRICTIONLESS_TO_POLARS[data_type], strict=False) + .is_not_null() + ) + + +def check_is_yearmonth(column_name: str) -> pl.Expr: + """Checks if the column contains only yearmonth values. + + Failed values are marked with False. + + Args: + column_name: The name of the column to check. + + Returns: + A Polars expression for checking the column. + """ + return ( + # Fail negative values starting with `-`. + pl.col(column_name).str.starts_with("-").not_() + & (pl.col(column_name) + "-01") + # Strict is false means map to None if it fails rather than give an error. + .str.to_date(format="%Y-%m-%d", strict=False) + .is_not_null() + ) + + +def check_is_datetime(data_frame: pl.DataFrame, column_name: str) -> pl.Expr: + """Checks if the column contains only datetime values. + + Mixing values with and without timezone information is not allowed. + Mixing values with different timezones is allowed, as they will be standardised + before saving to Parquet. + + Failed values are marked with False. + + Args: + data_frame: The data frame being operated on. + column_name: The name of the column to check. + + Returns: + A Polars expression for checking the column. + """ + first_datetime = ( + data_frame.get_column(column_name) + .drop_nulls() + # Strict is false means map to None if it fails rather than give an error. + .str.to_datetime(strict=False) + .first() + ) + has_timezone = bool(first_datetime.tzinfo) if first_datetime else False + datetime_format = "%Y-%m-%dT%H:%M:%S%.f" + ("%z" if has_timezone else "") + + return ( + # Fail negative values starting with `-`. + pl.col(column_name).str.starts_with("-").not_() + & pl.col(column_name) + .str.replace("Z", "+00:00") + .str.to_datetime(time_unit="ms", format=datetime_format, strict=False) + .dt.convert_time_zone(time_zone="UTC") + .is_not_null() + ) + + +def check_is_date(column_name: str) -> pl.Expr: + """Checks if the column contains only date values. + + Failed values are marked with False. + + Args: + column_name: The name of the column to check. + + Returns: + A Polars expression for checking the column. + """ + return ( + # Fail negative values starting with `-`. + pl.col(column_name).str.starts_with("-").not_() + & pl.col(column_name).str.to_date(format="%Y-%m-%d", strict=False).is_not_null() + ) + + +def check_is_time(column_name: str) -> pl.Expr: + """Checks if the column contains only time values. + + Failed values are marked with False. + + Args: + column_name: The name of the column to check. + + Returns: + A Polars expression for checking the column. + """ + return ( + pl.col(column_name) + .str.to_time(format="%H:%M:%S%.f", strict=False) + .is_not_null() + ) + + +# https://stackoverflow.com/a/18690202 +GEOPOINT_PATTERN = ( + r"^(?:[-+]?(?:[1-8]?\d(?:\.\d+)?|90(?:\.0+)?)),\s*" + r"(?:[-+]?(?:180(?:\.0+)?|(?:1[0-7]\d|[1-9]?\d)(?:\.\d+)?))$" +) + + +def check_is_geopoint(column_name: str) -> pl.Expr: + """Checks if the column contains only geopoint values. + + Failed values are marked with False. + + Args: + column_name: The name of the column to check. + + Returns: + A Polars expression for checking the column. + """ + return pl.col(column_name).str.contains(GEOPOINT_PATTERN) + + +def check_is_json(column_name: str, expected_type: type[list | dict]) -> pl.Expr: + """Checks if the column contains only JSON values. + + Failed values are marked with False. + + Warning: uses `map_elements` to check the formatting of each value and may run + slowly on large datasets. + + Args: + column_name: The name of the column to check. + expected_type: The expected JSON type: an object or an array. + + Returns: + A Polars expression for checking the column. + """ + return pl.col(column_name).map_elements( + lambda value: check_value_is_json(value, expected_type), + return_dtype=pl.Boolean, + ) + + +def check_value_is_json(value: str, expected_type: type[list | dict]) -> bool: + """Checks if the `value` is correctly formatted as a JSON object or array. + + Args: + value: The value to check. + expected_type: The expected JSON type: an object or an array. + + Returns: + True if the value is a correct JSON type, False otherwise. + """ + try: + return isinstance(json.loads(value), expected_type) + except json.JSONDecodeError: + return False + + +FRICTIONLESS_TO_COLUMN_CHECK = { + "boolean": check_is_boolean, + "integer": lambda col_name: check_is_type_by_conversion(col_name, "integer"), + "number": lambda col_name: check_is_type_by_conversion(col_name, "number"), + "year": lambda col_name: check_is_type_by_conversion(col_name, "year"), + "yearmonth": check_is_yearmonth, + "datetime": check_is_datetime, + "date": check_is_date, + "time": check_is_time, + "geopoint": check_is_geopoint, + "array": lambda value: check_is_json(value, list), + "object": lambda value: check_is_json(value, dict), + "geojson": lambda value: check_is_json(value, dict), + "string": lambda _: pl.lit(True), + "any": lambda _: pl.lit(True), + "duration": lambda _: pl.lit(True), +} diff --git a/src/seedcase_sprout/core/sprout_checks/check_data_types.py b/src/seedcase_sprout/core/sprout_checks/check_data_types.py deleted file mode 100644 index 28b7a295..00000000 --- a/src/seedcase_sprout/core/sprout_checks/check_data_types.py +++ /dev/null @@ -1,94 +0,0 @@ -import json -import re -import uuid - -import xmlschema - -# Data types for validating time and date values -XML_SCHEMA_TYPES = xmlschema.XMLSchema11( - '' -).types - - -def check_is_xml_type(value: str, type: str) -> bool: - """Checks if `value` is correctly formatted as an XML data type. - - The Frictionless Data Package standard follows the definitions at - https://www.w3.org/TR/xmlschema-2/ for time and date-related data types. - This function is for checking values against these XML data type definitions. - - Args: - value: The value to check. - type: The XML data type to check against. - - Returns: - True if the value is the correct type, False otherwise. - """ - try: - XML_SCHEMA_TYPES[type].decode(value) - return True - except (xmlschema.XMLSchemaDecodeError, KeyError): - return False - - -def check_is_json(value: str, expected_type: type[list | dict]) -> bool: - """Checks if the `value` is correctly formatted as a JSON object or array. - - Args: - value: The value to check. - expected_type: The expected JSON type: an object or an array. - - Returns: - True if the value is a correct JSON type, False otherwise. - """ - try: - return isinstance(json.loads(value), expected_type) - except json.JSONDecodeError: - return False - - -def check_is_geopoint(value: str) -> bool: - """Checks if the `value` is correctly formatted as a geographic point. - - Args: - value: The value to check. - - Returns: - True if the value is a geographic point, False otherwise. - """ - try: - lat, long = value.split(",") - return abs(float(lat.strip())) <= 90 and abs(float(long.strip())) <= 180 - except ValueError: - return False - - -EMAIL_PATTERN = r"^[^@\s]+@[^@\s]+\.[^@\s]+$" - - -def check_is_email(value: str) -> bool: - """Checks if `value` meets the main format constraints of email addresses. - - Args: - value: The value to check. - - Returns: - True if the value meets the main format constraints, False otherwise. - """ - return bool(re.match(EMAIL_PATTERN, value)) and len(value) <= 254 - - -def check_is_uuid(value: str) -> bool: - """Checks if `value` can be parsed as an UUID. - - Args: - value: The value to check. - - Returns: - True if the value can be parsed as a UUID, False otherwise. - """ - try: - uuid.UUID(value) - return True - except ValueError: - return False diff --git a/src/seedcase_sprout/core/sprout_checks/get_pandera_checks.py b/src/seedcase_sprout/core/sprout_checks/get_pandera_checks.py deleted file mode 100644 index b6a8cc13..00000000 --- a/src/seedcase_sprout/core/sprout_checks/get_pandera_checks.py +++ /dev/null @@ -1,193 +0,0 @@ -import pandera.polars as pa -from xmlschema.names import ( - XSD_BASE64_BINARY, - XSD_DATE, - XSD_DATETIME, - XSD_DURATION, - XSD_GYEAR, - XSD_GYEAR_MONTH, - XSD_TIME, -) - -from seedcase_sprout.core.properties import FieldProperties -from seedcase_sprout.core.sprout_checks.check_data_types import ( - check_is_email, - check_is_geopoint, - check_is_json, - check_is_uuid, - check_is_xml_type, -) - -# https://datapackage.org/standard/table-schema/#boolean -BOOLEAN_VALUES = {"false", "False", "FALSE", "0", "true", "True", "TRUE", "1"} - -STRING_FORMAT_CHECKS = { - "email": pa.Check( - check_is_email, - element_wise=True, - error="The given value doesn't seem to be a correctly formatted email address.", - ), - "binary": pa.Check( - lambda value: check_is_xml_type(value, XSD_BASE64_BINARY), - element_wise=True, - error=( - "The given value doesn't seem to be formatted correctly as binary data. " - "Binary data is expected to be Base64-encoded." - ), - ), - "uuid": pa.Check( - check_is_uuid, - element_wise=True, - error="The given value doesn't seem to be a correctly formatted UUID.", - ), -} - - -def get_pandera_checks(field: FieldProperties) -> list[pa.Check]: - """Returns the Pandera checks appropriate for the field's format and data type. - - Args: - field: The field to get the checks for. - - Returns: - The appropriate Pandera checks. - """ - if field.type == "string" and ( - format_check := STRING_FORMAT_CHECKS.get(field.format, None) - ): - return [format_check] - - match field.type: - case "boolean": - return [ - pa.Check( - lambda value: value in BOOLEAN_VALUES, - element_wise=True, - error=f"The given value needs to be one of {BOOLEAN_VALUES}.", - ) - ] - - case "time": - return [ - pa.Check( - lambda value: check_is_xml_type(value, XSD_TIME), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "time value. The expected format for time values is HH:MM:SS. " - "See https://www.w3.org/TR/xmlschema-2/#time for more " - "information." - ), - ) - ] - - case "datetime": - return [ - pa.Check( - lambda value: check_is_xml_type(value, XSD_DATETIME), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "datetime value. The expected format for datetime values is " - "YYYY-MM-DDTHH:MM:SS with optional milliseconds and time zone " - "information. See https://www.w3.org/TR/xmlschema-2/#dateTime " - "for more information." - ), - ) - ] - - case "date": - return [ - pa.Check( - lambda value: check_is_xml_type(value, XSD_DATE), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "date value. The expected format for date values is YYYY-MM-DD." - " See https://www.w3.org/TR/xmlschema-2/#date for more " - "information." - ), - ) - ] - - case "year": - return [ - pa.Check( - lambda value: check_is_xml_type(value, XSD_GYEAR), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "year value. The expected format for year values is YYYY. " - "See https://www.w3.org/TR/xmlschema-2/#gYear for more " - "information." - ), - ) - ] - - case "yearmonth": - return [ - pa.Check( - lambda value: check_is_xml_type(value, XSD_GYEAR_MONTH), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "yearmonth value. The expected format for yearmonth values is " - "YYYY-MM. See https://www.w3.org/TR/xmlschema-2/#gYearMonth " - "for more information." - ), - ) - ] - - case "duration": - return [ - pa.Check( - lambda value: check_is_xml_type(value, XSD_DURATION), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "duration value. The expected format for duration values is " - "PnYnMnDTnHnMnS. See https://www.w3.org/TR/xmlschema-2/#duration" - " for more information." - ), - ) - ] - - case "object": - return [ - pa.Check( - lambda value: check_is_json(value, dict), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "JSON object." - ), - ) - ] - - case "array": - return [ - pa.Check( - lambda value: check_is_json(value, list), - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "JSON array." - ), - ) - ] - - case "geopoint": - return [ - pa.Check( - check_is_geopoint, - element_wise=True, - error=( - "The given value doesn't seem to be a correctly formatted " - "geographical point. The expected format for geographical " - "points is LAT, LONG." - ), - ) - ] - - case _: - return [] diff --git a/src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py b/src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py deleted file mode 100644 index fac571c4..00000000 --- a/src/seedcase_sprout/core/sprout_checks/get_polars_data_type.py +++ /dev/null @@ -1,52 +0,0 @@ -import polars as pl - -from seedcase_sprout.core.properties import FieldType - - -def get_polars_data_type(field_type: FieldType | None) -> pl.DataType: - """Maps Frictionless field types to Polars data types. - - If the Frictionless field type has formatting constraints that are not included - in any specialised Polars data type, the mapping is to string. The formatting - constraints are then checked without Polars. - - Args: - field_type: The Frictionless field type to map. - - Returns: - The Polars data type the field is mapped to. - - Raises: - NotImplementedError: If Sprout doesn't yet support the Frictionless field type. - """ - match field_type: - case "geojson": - raise NotImplementedError() - # While Polars does have most of these data types, there isn't a - # perfect overlap between them and what Frictionless has, even - # if they have similar/same names for the types. For example, - # checks against date/datetimes/times types are different between - # Polars and Frictionless. Or the way booleans get treated. Polars - # may cast `123` to True, but Frictionless will indicate it is not - # a boolean. We'll slowly improve on this as we use Sprout. - case ( - "string" - | "boolean" - | "datetime" - | "date" - | "time" - | "year" - | "yearmonth" - | "duration" - | "list" - | "array" - | "object" - | "geopoint" - ): - return pl.String - case "number": - return pl.Float64 - case "integer": - return pl.Int64 - case _: - return pl.String diff --git a/src/seedcase_sprout/core/sprout_checks/resource_properties_to_pandera_schema.py b/src/seedcase_sprout/core/sprout_checks/resource_properties_to_pandera_schema.py deleted file mode 100644 index b0960bd9..00000000 --- a/src/seedcase_sprout/core/sprout_checks/resource_properties_to_pandera_schema.py +++ /dev/null @@ -1,40 +0,0 @@ -import pandera.polars as pa - -from seedcase_sprout.core.get_nested_attr import get_nested_attr -from seedcase_sprout.core.properties import FieldProperties, ResourceProperties -from seedcase_sprout.core.sprout_checks.get_pandera_checks import ( - get_pandera_checks, -) -from seedcase_sprout.core.sprout_checks.get_polars_data_type import ( - get_polars_data_type, -) - - -def resource_properties_to_pandera_schema( - resource_properties: ResourceProperties, -) -> pa.DataFrameSchema: - """Converts a set of resource properties to a Pandera schema. - - Args: - resource_properties: The resource properties to convert. - - Returns: - The resulting Pandera schema. - """ - fields: list[FieldProperties] = get_nested_attr( - resource_properties, - "schema.fields", - default=[], - ) - - columns = { - field.name: pa.Column( - dtype=get_polars_data_type(field.type), - checks=get_pandera_checks(field), - nullable=not get_nested_attr(field, "constraints.required", default=False), - coerce=True, - ) - for field in fields - } - - return pa.DataFrameSchema(columns, strict=True) diff --git a/tests/core/sprout_checks/test_check_column_data_types.py b/tests/core/sprout_checks/test_check_column_data_types.py new file mode 100644 index 00000000..a8322589 --- /dev/null +++ b/tests/core/sprout_checks/test_check_column_data_types.py @@ -0,0 +1,262 @@ +import polars as pl +from pytest import mark + +from seedcase_sprout.core.sprout_checks.check_column_data_types import ( + BOOLEAN_VALUES, + FRICTIONLESS_TO_COLUMN_CHECK, +) + +# Boolean +BOOLEAN_BAD_VALUES = ["", "yes", "maybe", "99"] +BOOLEAN_GOOD_VALUES = list(BOOLEAN_VALUES) + +# Yearmonth +YEARMONTH_BAD_VALUES = [ + "", + "2002/10", + "2002-10-10", + "2014", + "2014-01-01", + "2014-01-01-01", + "abc", + "-0100-09", + "1001-13", + "10001-11", + "10-13", +] +YEARMONTH_GOOD_VALUES = ["2014-12", "0001-01", "0000-01"] + +# Date +DATE_BAD_VALUES = [ + "", + "-0001-01-01", + "--0001-01-01", + "01-01", + "99", + "abc", + "2002-10-10-05:00", + "2002-10-10Z", + "2002-02-31", + "20022-02-02", + "2002-02-02T06:00:00", +] +DATE_GOOD_VALUES = ["2002-10-10", "0001-01-01", "0000-01-01"] + +# Time +TIME_BAD_VALUES = [ + "15:00:69", + "-15:00:00", + "2002-10-10T12:00:00", + "4", + "", + "abc", + "06:23:22Z", + "12:00:00-05:00", + "12:00:00.34-05:00", +] +TIME_GOOD_VALUES = [ + "15:00:59", + "00:00:00", + "12:00:00.3", + "12:00:00.345345", +] + +# Integer, Year +INTEGER_BAD_VALUES = ["", "12.23", "abc", "2E3", "INF", "NAN"] +INTEGER_GOOD_VALUES = ["12223", "-123", "+4", "000"] + +# Number +NUMBER_BAD_VALUES = ["", "abc", "++4", "2,00"] +NUMBER_GOOD_VALUES = [ + "123", + "123.123", + "-23", + "+45.5", + "0003", + "2.0000", + "NaN", + "NAN", + "nan", + "inf", + "INF", + "-inf", + "-INF", + "2E3", + "2E-33", +] + +# Geopoint +GEOPOINT_BAD_VALUES = [ + "", + "45", + "5 45", + "5 , 45", + "180, 90", + "91, 181", + "-91, -181", + "A, B", + "abc", + "NAN", + "INF", +] +GEOPOINT_GOOD_VALUES = [ + "90, 180", + "-90, -180", + "0, 0", + "5, 45", + "5.9999, 45.0000", + "5,45", +] + +# Array, Object, Geojson +ARRAY_GOOD_VALUES = [ + "[]", + '[{"prop1": "value"}, {"prop2": 123}]', +] +OBJECT_GOOD_VALUES = [ + "{}", + '{"outer": "value", "inner": {"prop1": 123, "prop2": [1, 2, null], "prop3": true}}', +] +OBJECT_BAD_VALUES = ["not,json,,"] + ARRAY_GOOD_VALUES +ARRAY_BAD_VALUES = ["not,json,,"] + OBJECT_GOOD_VALUES + +# Any +ANY_VALUES = ["some text", 99, "[]", "2030-12-12", True] + +# String +STRING_VALUES = ["some_text", "£$%^&*()\\''", "μῆνιν ἄειδε θεὰ", "æøåäöü"] + +# Duration +DURATION_VALUES = ["P1Y2M3DT10H30M45.343S"] + + +@mark.parametrize( + "bad_values, good_values, field_type", + [ + (BOOLEAN_BAD_VALUES, BOOLEAN_GOOD_VALUES, "boolean"), + (YEARMONTH_BAD_VALUES, YEARMONTH_GOOD_VALUES, "yearmonth"), + (DATE_BAD_VALUES, DATE_GOOD_VALUES, "date"), + (TIME_BAD_VALUES, TIME_GOOD_VALUES, "time"), + (GEOPOINT_BAD_VALUES, GEOPOINT_GOOD_VALUES, "geopoint"), + (INTEGER_BAD_VALUES, INTEGER_GOOD_VALUES, "integer"), + (INTEGER_BAD_VALUES, INTEGER_GOOD_VALUES, "year"), + (NUMBER_BAD_VALUES, NUMBER_GOOD_VALUES, "number"), + (ARRAY_BAD_VALUES, ARRAY_GOOD_VALUES, "array"), + (OBJECT_BAD_VALUES, OBJECT_GOOD_VALUES, "object"), + (OBJECT_BAD_VALUES, OBJECT_GOOD_VALUES, "geojson"), + ([], STRING_VALUES, "string"), + ([], DURATION_VALUES, "duration"), + ([], ANY_VALUES, "any"), + ], +) +def test_check_data_type(bad_values, good_values, field_type): + """Given a column with both correct and incorrect values, it should mark incorrect + values with False in another column.""" + values = bad_values + good_values + expected_fails = list(range(len(bad_values))) + df = pl.DataFrame({"my_values": values}, strict=False) + check_fn = FRICTIONLESS_TO_COLUMN_CHECK[field_type] + + df = df.with_columns(check_fn("my_values").alias("result")) + + fails = ( + df.with_row_index() + .filter(pl.col("result").not_()) + .get_column("index") + .to_list() + ) + + assert fails == expected_fails + + +# Datetime +DATETIME_BAD_VALUES_WHEN_TIMEZONE = [ + "", + "2002-10-10T12:00:00", + "2002-10-10T12:00:00", + "-0001-01-01T00:00:00", + "--0001-01-01T00:00:00", + "2023-01-01", + "2023-01-01T", + "04:04:22", + "2002-13-01T00:00:00", + "2002-11-01T99:00:00", + "2002-11-01 T 06:00:00", + "2002-10-10T17:00:00X", + "T", + "4", + "abc", +] +DATETIME_GOOD_VALUES_WHEN_TIMEZONE = [ + "2002-10-10T12:00:00+01:00", + "2002-10-10T12:00:40-05:00", + "2002-10-10T12:00:00.34Z", + "2002-10-10T12:40:00.34-04:30", + "2002-10-10T17:00:55Z", + "0000-01-01T00:00:00Z", +] + +DATETIME_BAD_VALUES_WHEN_NO_TIMEZONE = [ + "", + "2002-10-10T12:00:00+01:00", + "2002-10-10T12:00:40-05:00", + "2002-10-10T12:00:00.34Z", + "-0001-01-01T00:00:00", + "--0001-01-01T00:00:00", + "2023-01-01", + "2023-01-01T", + "04:04:22", + "2002-13-01T00:00:00", + "2002-11-01T99:00:00", + "2002-11-01 T 06:00:00", + "2002-10-10T17:00:00X", + "T", + "4", + "abc", +] +DATETIME_GOOD_VALUES_WHEN_NO_TIMEZONE = [ + "2002-10-10T12:44:10", + "2002-10-10T12:00:00.34", + "0000-01-01T00:00:00", +] + + +@mark.parametrize( + "first_value, good_values, bad_values", + [ + ( + DATETIME_GOOD_VALUES_WHEN_TIMEZONE[0], + DATETIME_GOOD_VALUES_WHEN_TIMEZONE, + DATETIME_BAD_VALUES_WHEN_TIMEZONE, + ), + ( + DATETIME_GOOD_VALUES_WHEN_NO_TIMEZONE[0], + DATETIME_GOOD_VALUES_WHEN_NO_TIMEZONE, + DATETIME_BAD_VALUES_WHEN_NO_TIMEZONE, + ), + ( + "abc", + DATETIME_GOOD_VALUES_WHEN_NO_TIMEZONE, + DATETIME_GOOD_VALUES_WHEN_TIMEZONE, + ), + ], +) +def test_check_is_datetime(first_value, good_values, bad_values): + """Given a column with both correct and incorrect datetimes, it should mark + incorrect datetimes with False in another column. The first value should decide if + the column is treated as timezone-aware or timezone-naive.""" + values = [first_value] + good_values + bad_values + expected_fails = [i for i, value in enumerate(values) if value not in good_values] + df = pl.DataFrame({"my_values": values}) + check_fn = FRICTIONLESS_TO_COLUMN_CHECK["datetime"] + + df = df.with_columns(check_fn(df, "my_values").alias("result")) + + fails = ( + df.with_row_index() + .filter(pl.col("result").not_()) + .get_column("index") + .to_list() + ) + + assert fails == expected_fails diff --git a/tests/core/sprout_checks/test_check_data_types.py b/tests/core/sprout_checks/test_check_data_types.py deleted file mode 100644 index 8c60dacc..00000000 --- a/tests/core/sprout_checks/test_check_data_types.py +++ /dev/null @@ -1,134 +0,0 @@ -from pytest import mark -from xmlschema.names import ( - XSD_BASE64_BINARY, - XSD_DATE, - XSD_DATETIME, - XSD_DURATION, - XSD_GYEAR, - XSD_GYEAR_MONTH, - XSD_TIME, -) - -from seedcase_sprout.core.sprout_checks.check_data_types import ( - check_is_email, - check_is_geopoint, - check_is_json, - check_is_uuid, - check_is_xml_type, -) - - -@mark.parametrize( - "value,xml_type,expected", - [ - ("2002-10-10T12:00:00.34-05:00", XSD_DATETIME, True), - ("2002-10-10T17:00:00X", XSD_DATETIME, False), - ("2002-10-10", XSD_DATE, True), - ("99", XSD_DATE, False), - ("15:00:59", XSD_TIME, True), - ("2002-10-10T12:00:00", XSD_TIME, False), - ("2014", XSD_GYEAR, True), - ("99", XSD_GYEAR, False), - ("2014-12", XSD_GYEAR_MONTH, True), - ("2014-13", XSD_GYEAR_MONTH, False), - ("P1Y2M3DT10H30M45.343S", XSD_DURATION, True), - ("0Y1347M0D", XSD_DURATION, False), - ("c29tZSB0ZXh0IDEyMw==", XSD_BASE64_BINARY, True), - ("some text 123", XSD_BASE64_BINARY, False), - ("2022", "unknown_type", False), - ], -) -def test_checks_xml_type(value, xml_type, expected): - """Should determine if values is a XML data types.""" - assert check_is_xml_type(value, xml_type) is expected - - -@mark.parametrize( - "json_object,expected", - [ - ("{}", True), - ( - ( - '{"outer": "value", "inner": {"prop1": 123, ' - '"prop2": [1, 2, null], "prop3": true}}' - ), - True, - ), - ("[]", False), - ("not,json,,", False), - ('"[{"prop1": "value"}, {"prop2": 123}]"', False), - ], -) -def test_checks_json_object(json_object, expected): - """Should determine if the input is a JSON object.""" - assert check_is_json(json_object, dict) is expected - - -@mark.parametrize( - "json_array,expected", - [ - ("[]", True), - ('[{"prop1": "value"}, {"prop2": 123}]', True), - ("{}", False), - ("not,json,,", False), - ('{"name": "value"}', False), - ], -) -def test_checks_json_array(json_array, expected): - """Should determine if the input is a JSON array.""" - assert check_is_json(json_array, list) is expected - - -@mark.parametrize( - "geopoint,expected", - [ - ("90, 180", True), - ("-90, -180", True), - ("0, 0", True), - ("5, 45", True), - ("5.9999, 45.0000", True), - ("5,45", True), - ("5 , 45", True), - ("5 45", False), - ("45", False), - ("", False), - ("180, 90", False), - ("91, 181", False), - ("-91, -181", False), - ("A, B", False), - ], -) -def test_checks_geopoint(geopoint, expected): - """Should determine if the input is a geopoint.""" - assert check_is_geopoint(geopoint) is expected - - -@mark.parametrize( - "email,expected", - [ - ("j_ane.d-oe99@email.co.uk", True), - ("@", False), - ("@email.co.uk", False), - ("jane@@email.co.uk", False), - ("jane@", False), - ("jane@email", False), - ("jane@email.", False), - (f"jane.doe{'x' * 256}@email.co.uk", False), - ], -) -def test_checks_email(email, expected): - """Should determine if the input is (likely to be) an email address.""" - assert check_is_email(email) is expected - - -@mark.parametrize( - "uuid,expected", - [ - ("8c085e68-a36f-4cf7-9341-cdf2b1792657", True), - ("some text", False), - ("1234", False), - ], -) -def test_checks_uuid(uuid, expected): - """Should determine if the input can be parsed as a UUID.""" - assert check_is_uuid(uuid) is expected diff --git a/tests/core/sprout_checks/test_get_pandera_checks.py b/tests/core/sprout_checks/test_get_pandera_checks.py deleted file mode 100644 index a78e3eb7..00000000 --- a/tests/core/sprout_checks/test_get_pandera_checks.py +++ /dev/null @@ -1,48 +0,0 @@ -from pytest import mark - -from seedcase_sprout.core.properties import FieldProperties -from seedcase_sprout.core.sprout_checks.get_pandera_checks import ( - get_pandera_checks, -) - - -@mark.parametrize( - "field_type", - [ - "boolean", - "time", - "datetime", - "date", - "year", - "yearmonth", - "duration", - "object", - "array", - "geopoint", - ], -) -def test_returns_check_for_field_type(field_type): - """Should return at least 1 check for the listed field types.""" - assert get_pandera_checks(FieldProperties(type=field_type)) - - -@mark.parametrize("format", ["email", "binary", "uuid"]) -def test_returns_check_for_string_format(format): - """Should return at least 1 check for the listed string formats.""" - field = FieldProperties(type="string", format=format) - - assert get_pandera_checks(field) - - -@mark.parametrize("format", ["default", None]) -def test_returns_no_checks_when_string_format_is_default(format): - """Should return no checks for string fields with the default format.""" - field = FieldProperties(type="string", format=format) - - assert get_pandera_checks(field) == [] - - -@mark.parametrize("field_type", ["any", None, "unknown"]) -def test_returns_no_checks_when_field_type_any_or_unknown(field_type): - """Should return no checks for fields whose type is unknown, any, or the default.""" - assert get_pandera_checks(FieldProperties(type=field_type)) == [] diff --git a/tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py b/tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py deleted file mode 100644 index eca14d06..00000000 --- a/tests/core/sprout_checks/test_resource_properties_to_pandera_schema.py +++ /dev/null @@ -1,117 +0,0 @@ -import polars as pl -from pytest import mark - -from seedcase_sprout.core.properties import ( - ConstraintsProperties, - FieldProperties, - ResourceProperties, - TableSchemaProperties, -) -from seedcase_sprout.core.sprout_checks.resource_properties_to_pandera_schema import ( - resource_properties_to_pandera_schema, -) - - -@mark.parametrize( - "resource_properties", - [ - ResourceProperties(), - ResourceProperties( - schema=TableSchemaProperties(), - ), - ResourceProperties( - schema=TableSchemaProperties(fields=None), - ), - ResourceProperties( - schema=TableSchemaProperties(fields=[]), - ), - ], -) -def test_converts_properties_without_fields(resource_properties): - """When the properties have no fields, the Pandera schema should have no columns.""" - schema = resource_properties_to_pandera_schema(resource_properties) - - assert schema.columns == {} - assert schema.strict - - -@mark.parametrize( - "field_type,data_type,num_checks", - [ - ("number", pl.Float64, 0), - ("integer", pl.Int64, 0), - ("string", pl.String, 0), - ("boolean", pl.String, 1), - ("object", pl.String, 1), - ("array", pl.String, 1), - ("list", pl.String, 0), - ("datetime", pl.String, 1), - ("date", pl.String, 1), - ("time", pl.String, 1), - ("year", pl.String, 1), - ("yearmonth", pl.String, 1), - ("duration", pl.String, 1), - ("geopoint", pl.String, 1), - ("any", pl.String, 0), - (None, pl.String, 0), - ], -) -def test_converts_individual_fields_correctly(field_type, data_type, num_checks): - """Should convert each type of field to a Pandera column correctly.""" - resource_properties = ResourceProperties( - schema=TableSchemaProperties( - fields=[FieldProperties(name="my_field", type=field_type)] - ) - ) - - schema = resource_properties_to_pandera_schema(resource_properties) - - assert schema.strict - assert len(schema.columns) == 1 - column = list(schema.columns.values())[0] - assert column.name == "my_field" - assert column.dtype.type == data_type - assert len(column.checks) == num_checks - assert column.coerce - assert column.nullable - assert column.required - - -def test_converts_multiple_fields(): - """Should convert multiple fields to multiple Pandera columns correctly.""" - resource_properties = ResourceProperties( - schema=TableSchemaProperties( - fields=[ - FieldProperties(name="my_date", type="date"), - FieldProperties(name="my_boolean", type="boolean"), - ] - ) - ) - - schema = resource_properties_to_pandera_schema(resource_properties) - - assert [(column.name, column.dtype.type) for column in schema.columns.values()] == [ - ("my_date", pl.String), - ("my_boolean", pl.String), - ] - - -@mark.parametrize("required,expected", [(True, False), (False, True), (None, True)]) -def test_converts_required_constraint(required, expected): - """Should convert the required constraint to Pandera's nullable correctly.""" - resource_properties = ResourceProperties( - schema=TableSchemaProperties( - fields=[ - FieldProperties( - name="my_date", - type="date", - constraints=ConstraintsProperties(required=required), - ) - ] - ) - ) - - schema = resource_properties_to_pandera_schema(resource_properties) - - column = list(schema.columns.values())[0] - assert column.nullable is expected