From 511f0674300e56c6e008c76d253dcf4c77aa973d Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 30 Jan 2025 09:40:18 -0500 Subject: [PATCH 01/21] Use open_dataset_kerchunk in roundtrip tests that don't otherwise require kerchunk --- virtualizarr/tests/__init__.py | 14 ++++++++ virtualizarr/tests/test_backend.py | 3 +- virtualizarr/tests/test_integration.py | 36 ++++++++++--------- .../test_hdf/test_hdf_integration.py | 19 ++++++---- 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index f38d5c2c..19de6345 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -1,8 +1,10 @@ import importlib import itertools +import fsspec import numpy as np import pytest +import xarray as xr from packaging.version import Version from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -105,3 +107,15 @@ def offset_from_chunk_key(ind: tuple[int, ...]) -> int: def length_from_chunk_key(ind: tuple[int, ...]) -> int: return sum(ind) + 5 + + +def open_dataset_kerchunk( + filename_or_obj: str, *, storage_options=None, **kwargs +) -> xr.Dataset: + """Equivalent to ``xr.open_dataset(..., engine="kerchunk")`` but without depending on + kerchunk library + """ + m = fsspec.filesystem( + "reference", fo=filename_or_obj, **(storage_options or {}) + ).get_mapper() + return xr.open_dataset(m, engine="zarr", consolidated=False, **kwargs) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index e4157b73..41bf9c2d 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -15,6 +15,7 @@ from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import ( has_astropy, + open_dataset_kerchunk, parametrize_over_hdf_backends, requires_hdf5plugin, requires_imagecodecs, @@ -321,7 +322,7 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend): ) tmpref = "/tmp/cmip6.json" vds.virtualize.to_kerchunk(tmpref, format="json") - dsV = xr.open_dataset(tmpref, engine="kerchunk") + dsV = open_dataset_kerchunk(tmpref) # xrt.assert_identical(dsXR, dsV) #Attribute order changes xrt.assert_equal(dsXR, dsV) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 95be3de8..4e7dd6b6 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -8,7 +8,12 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk +from virtualizarr.tests import ( + has_kerchunk, + open_dataset_kerchunk, + parametrize_over_hdf_backends, + requires_kerchunk, +) from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, ) @@ -84,8 +89,9 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] -@requires_kerchunk -@pytest.mark.parametrize("format", ["dict", "json", "parquet"]) +@pytest.mark.parametrize( + "format", ["dict", "json", "parquet"] if has_kerchunk else ["dict", "json"] +) class TestKerchunkRoundtrip: @parametrize_over_hdf_backends def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): @@ -103,14 +109,14 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): ds_refs = vds.virtualize.to_kerchunk(format=format) # use fsspec to read the dataset from the kerchunk references dict - roundtrip = xr.open_dataset(ds_refs, 
engine="kerchunk", decode_times=False) + roundtrip = open_dataset_kerchunk(ds_refs, decode_times=False) else: # write those references to disk as kerchunk references format vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = xr.open_dataset( - f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False + roundtrip = open_dataset_kerchunk( + f"{tmpdir}/refs.{format}", decode_times=False ) # assert all_close to original dataset @@ -164,16 +170,14 @@ def test_kerchunk_roundtrip_concat( ds_refs = vds.virtualize.to_kerchunk(format=format) # use fsspec to read the dataset from the kerchunk references dict - roundtrip = xr.open_dataset( - ds_refs, engine="kerchunk", decode_times=decode_times - ) + roundtrip = open_dataset_kerchunk(ds_refs, decode_times=decode_times) else: # write those references to disk as kerchunk references format vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = xr.open_dataset( - f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times + roundtrip = open_dataset_kerchunk( + f"{tmpdir}/refs.{format}", decode_times=decode_times ) if decode_times is False: @@ -214,14 +218,14 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): ds_refs = vds.virtualize.to_kerchunk(format=format) # use fsspec to read the dataset from the kerchunk references dict - roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False) + roundtrip = open_dataset_kerchunk(ds_refs, decode_times=False) else: # write those references to disk as kerchunk references format vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = xr.open_dataset( - f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False + roundtrip = open_dataset_kerchunk( + f"{tmpdir}/refs.{format}", decode_times=False ) # assert equal to original dataset @@ -265,13 +269,13 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format): ds_refs = ds.virtualize.to_kerchunk(format=format) # use fsspec to read the dataset from the kerchunk references dict - roundtrip = xr.open_dataset(ds_refs, engine="kerchunk") + roundtrip = open_dataset_kerchunk(ds_refs) else: # write those references to disk as kerchunk references format ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") + roundtrip = open_dataset_kerchunk(f"{tmpdir}/refs.{format}") assert roundtrip.a.attrs == ds.a.attrs diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py index bebc4560..e43ae1f4 100644 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py @@ -4,16 +4,21 @@ import virtualizarr from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import requires_kerchunk +from virtualizarr.tests import ( + open_dataset_kerchunk, + requires_hdf5plugin, + requires_imagecodecs, +) -@requires_kerchunk +@requires_hdf5plugin +@requires_imagecodecs class TestIntegration: @pytest.mark.xfail( reason="0 time start is being interpreted as fillvalue see issues/280" ) def 
test_filters_h5netcdf_roundtrip( - self, tmpdir, filter_encoded_roundtrip_hdf5_file, backend=HDFVirtualBackend + self, tmpdir, filter_encoded_roundtrip_hdf5_file ): ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( @@ -24,7 +29,7 @@ def test_filters_h5netcdf_roundtrip( ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) + roundtrip = open_dataset_kerchunk(kerchunk_file, decode_times=True) xrt.assert_allclose(ds, roundtrip) @pytest.mark.xfail( @@ -37,8 +42,8 @@ def test_filters_netcdf4_roundtrip( ds = xr.open_dataset(filepath) vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend) kerchunk_file = f"{tmpdir}/kerchunk.json" - vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + vds.virtualize.to_kerchunk(kerchunk_file, format="dict") + roundtrip = open_dataset_kerchunk(kerchunk_file) xrt.assert_equal(ds, roundtrip) def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): @@ -48,5 +53,5 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file ) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + roundtrip = open_dataset_kerchunk(kerchunk_file) xrt.assert_allclose(ds, roundtrip) From 61e3cff579de0310c3a23143177b6cf3101dfc8e Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 30 Jan 2025 09:59:02 -0500 Subject: [PATCH 02/21] Make it clear that integration tests require zarr-python --- virtualizarr/tests/test_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 4e7dd6b6..aac5f17b 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -13,6 +13,7 @@ open_dataset_kerchunk, parametrize_over_hdf_backends, requires_kerchunk, + requires_zarr_python, ) from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, @@ -89,6 +90,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] +@requires_zarr_python @pytest.mark.parametrize( "format", ["dict", "json", "parquet"] if has_kerchunk else ["dict", "json"] ) From bd95d6048aa134c7590cec34a7662b88074ddb01 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 30 Jan 2025 10:44:15 -0500 Subject: [PATCH 03/21] Add in-memory icechunk tests to existing roundtrip tests --- virtualizarr/tests/__init__.py | 9 ++ virtualizarr/tests/test_integration.py | 124 ++++++++++++------------- 2 files changed, 68 insertions(+), 65 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 19de6345..7fb8982d 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,6 +37,7 @@ def _importorskip( has_astropy, requires_astropy = _importorskip("astropy") +has_icechunk, requires_icechunk = _importorskip("icechunk") has_kerchunk, requires_kerchunk = _importorskip("kerchunk") has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") @@ -119,3 +120,11 @@ def open_dataset_kerchunk( "reference", fo=filename_or_obj, **(storage_options or {}) ).get_mapper() return xr.open_dataset(m, engine="zarr", 
consolidated=False, **kwargs) + + +def in_memory_icechunk_session(): + from icechunk import Repository, Storage + + repo = Repository.create(storage=Storage.new_in_memory()) + session = repo.writable_session("main") + return session diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index aac5f17b..b045a330 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -9,7 +9,9 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import ( + has_icechunk, has_kerchunk, + in_memory_icechunk_session, open_dataset_kerchunk, parametrize_over_hdf_backends, requires_kerchunk, @@ -40,16 +42,16 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): ), chunkmanifest=manifest, ) - ds = xr.Dataset({"a": (["x", "y"], marr)}) + vds = xr.Dataset({"a": (["x", "y"], marr)}) # Use accessor to write it out to kerchunk reference dict - ds_refs = ds.virtualize.to_kerchunk(format="dict") + ds_refs = vds.virtualize.to_kerchunk(format="dict") # Use dataset_from_kerchunk_refs to reconstruct the dataset roundtrip = dataset_from_kerchunk_refs(ds_refs) # Assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_equal(roundtrip, vds) @requires_kerchunk @@ -90,13 +92,55 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] +def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs): + # write those references to an in-memory kerchunk-formatted references dictionary + ds_refs = vds.virtualize.to_kerchunk(format="dict") + + # use fsspec to read the dataset from the kerchunk references dict + return open_dataset_kerchunk(ds_refs, **kwargs) + + +def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs): + # write those references to disk as kerchunk references format + vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json") + + # use fsspec to read the dataset from disk via the kerchunk references + return open_dataset_kerchunk(f"{tmpdir}/refs.json", **kwargs) + + +def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs): + # write those references to disk as kerchunk references format + vds.virtualize.to_kerchunk(f"{tmpdir}/refs.parquet", format="parquet") + + # use fsspec to read the dataset from disk via the kerchunk references + return open_dataset_kerchunk(f"{tmpdir}/refs.parquet", **kwargs) + + +def roundtrip_as_in_memory_icechunk(vds: xr.Dataset, tmpdir, **kwargs): + # write those references to an in-memory icechunk store + icechunk_session = in_memory_icechunk_session() + vds.virtualize.to_icechunk(icechunk_session.store) + icechunk_session.commit("add data") + + # read the dataset from icechunk + return xr.open_zarr( + icechunk_session.store, zarr_format=3, consolidated=False, **kwargs + ) + + @requires_zarr_python @pytest.mark.parametrize( - "format", ["dict", "json", "parquet"] if has_kerchunk else ["dict", "json"] + "roundtrip_func", + [ + roundtrip_as_kerchunk_dict, + roundtrip_as_kerchunk_json, + *([roundtrip_as_kerchunk_parquet] if has_kerchunk else []), + *([roundtrip_as_in_memory_icechunk] if has_icechunk else []), + ], ) -class TestKerchunkRoundtrip: +class TestRoundtrip: @parametrize_over_hdf_backends - def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): + def test_roundtrip_no_concat(self, tmpdir, roundtrip_func, hdf_backend): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", 
decode_times=False) @@ -106,20 +150,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): # use open_dataset_via_kerchunk to read it as references vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend) - if format == "dict": - # write those references to an in-memory kerchunk-formatted references dictionary - ds_refs = vds.virtualize.to_kerchunk(format=format) - - # use fsspec to read the dataset from the kerchunk references dict - roundtrip = open_dataset_kerchunk(ds_refs, decode_times=False) - else: - # write those references to disk as kerchunk references format - vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) - - # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = open_dataset_kerchunk( - f"{tmpdir}/refs.{format}", decode_times=False - ) + roundtrip = roundtrip_func(vds, tmpdir, decode_times=False) # assert all_close to original dataset xrt.assert_allclose(roundtrip, ds) @@ -131,7 +162,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): @parametrize_over_hdf_backends @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) def test_kerchunk_roundtrip_concat( - self, tmpdir, format, hdf_backend, decode_times, time_vars + self, tmpdir, roundtrip_func, hdf_backend, decode_times, time_vars ): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times) @@ -167,20 +198,7 @@ def test_kerchunk_roundtrip_concat( # concatenate virtually along time vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override") - if format == "dict": - # write those references to an in-memory kerchunk-formatted references dictionary - ds_refs = vds.virtualize.to_kerchunk(format=format) - - # use fsspec to read the dataset from the kerchunk references dict - roundtrip = open_dataset_kerchunk(ds_refs, decode_times=decode_times) - else: - # write those references to disk as kerchunk references format - vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) - - # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = open_dataset_kerchunk( - f"{tmpdir}/refs.{format}", decode_times=decode_times - ) + roundtrip = roundtrip_func(vds, tmpdir, decode_times=decode_times) if decode_times is False: # assert all_close to original dataset @@ -197,7 +215,7 @@ def test_kerchunk_roundtrip_concat( assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"] @parametrize_over_hdf_backends - def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): + def test_non_dimension_coordinates(self, tmpdir, roundtrip_func, hdf_backend): # regression test for GH issue #105 if hdf_backend: @@ -215,20 +233,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): assert "lat" in vds.coords assert "coordinates" not in vds.attrs - if format == "dict": - # write those references to an in-memory kerchunk-formatted references dictionary - ds_refs = vds.virtualize.to_kerchunk(format=format) - - # use fsspec to read the dataset from the kerchunk references dict - roundtrip = open_dataset_kerchunk(ds_refs, decode_times=False) - else: - # write those references to disk as kerchunk references format - vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) - - # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = open_dataset_kerchunk( - f"{tmpdir}/refs.{format}", decode_times=False - ) + roundtrip = 
roundtrip_func(vds, tmpdir) # assert equal to original dataset xrt.assert_allclose(roundtrip, ds) @@ -237,7 +242,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs - def test_datetime64_dtype_fill_value(self, tmpdir, format): + def test_datetime64_dtype_fill_value(self, tmpdir, roundtrip_func): chunks_dict = { "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100}, } @@ -255,7 +260,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format): zarr_format=2, ) marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest) - ds = xr.Dataset( + vds = xr.Dataset( { "a": xr.DataArray( marr1, @@ -266,20 +271,9 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format): } ) - if format == "dict": - # write those references to an in-memory kerchunk-formatted references dictionary - ds_refs = ds.virtualize.to_kerchunk(format=format) - - # use fsspec to read the dataset from the kerchunk references dict - roundtrip = open_dataset_kerchunk(ds_refs) - else: - # write those references to disk as kerchunk references format - ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) - - # use fsspec to read the dataset from disk via the kerchunk references - roundtrip = open_dataset_kerchunk(f"{tmpdir}/refs.{format}") + roundtrip = roundtrip_func(vds, tmpdir) - assert roundtrip.a.attrs == ds.a.attrs + assert roundtrip.a.attrs == vds.a.attrs @parametrize_over_hdf_backends From cf9fc4f9c8f9e6bf2ca3f846baf89e93cb3fc48d Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 10 Jan 2025 17:10:03 -0800 Subject: [PATCH 04/21] Playing around with icechunk / zarr / xarray upgrade --- pyproject.toml | 4 ++-- virtualizarr/writers/icechunk.py | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8421819e..0ee48ef4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ - "xarray>=2024.10.0,<2025.0.0", + "xarray>=2025.1.1", "numpy>=2.0.0", "packaging", "universal-pathlib", @@ -39,7 +39,7 @@ hdf_reader = [ "numcodecs" ] icechunk = [ - "icechunk==0.1.0a8", + "icechunk>=0.1.0a10", ] test = [ "codecov", diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 6ed535e5..f21972e3 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -211,6 +211,7 @@ def write_virtual_variable_to_icechunk( ) -> None: """Write a single virtual variable into an icechunk store""" from zarr import Array + from zarr.core.metadata.v3 import parse_codecs ma = cast(ManifestArray, var.data) zarray = ma.zarray @@ -241,15 +242,29 @@ def write_virtual_variable_to_icechunk( append_axis=append_axis, ) else: - append_axis = None + append_axis, compressors = None, None # create array if it doesn't already exist + # review https://github.com/zarr-developers/zarr-python/blob/0e1fde44b2ff3904bbe88fc4d1424d61d769dfe2/docs/user-guide/arrays.rst#compressors + # this should create a custom compressor with numcodecs, not try and replace zlib with gzip... 
+ if zarray.compressor: + compressor = zarray.compressor.copy() + compressor["configuration"] = {} + for k, v in zarray.compressor.items(): + if k == "id": + v = "gzip" if v == "zlib" else v + compressor["name"] = v + else: + compressor["configuration"][k] = v + compressor.pop(k) + compressors = parse_codecs([compressor]) + arr = group.require_array( name=name, shape=zarray.shape, - chunk_shape=zarray.chunks, + chunks=zarray.chunks, dtype=encode_dtype(zarray.dtype), - codecs=zarray._v3_codec_pipeline(), + compressors=compressors, dimension_names=var.dims, fill_value=zarray.fill_value, ) From 390efca285bd71717f6d76d4eac513acf8780ceb Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Sun, 12 Jan 2025 12:37:45 -0800 Subject: [PATCH 05/21] Passing icechunk tests --- virtualizarr/codecs.py | 2 +- virtualizarr/writers/icechunk.py | 21 +++------------------ virtualizarr/zarr.py | 24 +++++++++++++++++------- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py index ad2a3d9b..94f6d1aa 100644 --- a/virtualizarr/codecs.py +++ b/virtualizarr/codecs.py @@ -55,7 +55,7 @@ def _get_manifestarray_codecs( ) -> Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]: """Get codecs for a ManifestArray based on its zarr_format.""" if normalize_to_zarr_v3 or array.zarray.zarr_format == 3: - return array.zarray._v3_codec_pipeline() + return (array.zarray.serializer(),) + array.zarray._v3_codec_pipeline() elif array.zarray.zarr_format == 2: return array.zarray.codec else: diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index f21972e3..538eab33 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -211,7 +211,6 @@ def write_virtual_variable_to_icechunk( ) -> None: """Write a single virtual variable into an icechunk store""" from zarr import Array - from zarr.core.metadata.v3 import parse_codecs ma = cast(ManifestArray, var.data) zarray = ma.zarray @@ -242,29 +241,15 @@ def write_virtual_variable_to_icechunk( append_axis=append_axis, ) else: - append_axis, compressors = None, None + append_axis = None # create array if it doesn't already exist - - # review https://github.com/zarr-developers/zarr-python/blob/0e1fde44b2ff3904bbe88fc4d1424d61d769dfe2/docs/user-guide/arrays.rst#compressors - # this should create a custom compressor with numcodecs, not try and replace zlib with gzip... 
- if zarray.compressor: - compressor = zarray.compressor.copy() - compressor["configuration"] = {} - for k, v in zarray.compressor.items(): - if k == "id": - v = "gzip" if v == "zlib" else v - compressor["name"] = v - else: - compressor["configuration"][k] = v - compressor.pop(k) - compressors = parse_codecs([compressor]) - arr = group.require_array( name=name, shape=zarray.shape, chunks=zarray.chunks, dtype=encode_dtype(zarray.dtype), - compressors=compressors, + compressors=zarray._v3_codec_pipeline(), # compressors, + serializer=zarray.serializer(), dimension_names=var.dims, fill_value=zarray.fill_value, ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index e339a3f4..4c36e7dc 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -175,13 +175,6 @@ def _v3_codec_pipeline(self) -> Any: transpose = dict(name="transpose", configuration=dict(order=order)) codec_configs.append(transpose) - # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 - # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" - bytes = dict( - name="bytes", configuration={} - ) # TODO need to handle endianess configuration - codec_configs.append(bytes) - # Noting here that zarr v3 has very few codecs specificed in the official spec, # and that there are far more codecs in `numcodecs`. We take a gamble and assume # that the codec names and configuration are simply mapped into zarrv3 "configurables". @@ -198,6 +191,23 @@ def _v3_codec_pipeline(self) -> Any: return codec_pipeline + def serializer(self) -> Any: + """ + testing + """ + try: + from zarr.core.metadata.v3 import ( # type: ignore[import-untyped] + parse_codecs, + ) + except ImportError: + raise ImportError("zarr v3 is required to generate v3 codec pipelines") + # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 + # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" + bytes = dict( + name="bytes", configuration={} + ) # TODO need to handle endianess configuration + return parse_codecs([bytes])[0] + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the ' Date: Thu, 30 Jan 2025 11:46:33 -0500 Subject: [PATCH 06/21] Update tests to latest kerchunk --- ci/upstream.yml | 4 ++-- pyproject.toml | 2 +- virtualizarr/tests/__init__.py | 15 +-------------- virtualizarr/tests/test_backend.py | 3 +-- virtualizarr/tests/test_integration.py | 17 ++++++++++------- .../test_hdf/test_hdf_integration.py | 9 +++++---- .../tests/test_readers/test_kerchunk.py | 4 ++-- .../tests/test_writers/test_kerchunk.py | 3 ++- 8 files changed, 24 insertions(+), 33 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index ed9bf6fa..97762961 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -29,6 +29,6 @@ dependencies: - fsspec - pip - pip: - - icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency - # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) + - icechunk==0.1.0a12 # Installs python-zarr v3 as dependency + - git+https://github.com/fsspec/kerchunk.git@main - imagecodecs-numcodecs==2024.6.1 diff --git a/pyproject.toml b/pyproject.toml index 0ee48ef4..54d06b91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ hdf_reader = [ "numcodecs" ] icechunk = [ - "icechunk>=0.1.0a10", + "icechunk>=0.1.0a12", ] test = [ "codecov", diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 
7fb8982d..0b9c416e 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -1,10 +1,8 @@ import importlib import itertools -import fsspec import numpy as np import pytest -import xarray as xr from packaging.version import Version from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -39,6 +37,7 @@ def _importorskip( has_astropy, requires_astropy = _importorskip("astropy") has_icechunk, requires_icechunk = _importorskip("icechunk") has_kerchunk, requires_kerchunk = _importorskip("kerchunk") +has_fastparquet, requires_fastparquet = _importorskip("fastparquet") has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") @@ -110,18 +109,6 @@ def length_from_chunk_key(ind: tuple[int, ...]) -> int: return sum(ind) + 5 -def open_dataset_kerchunk( - filename_or_obj: str, *, storage_options=None, **kwargs -) -> xr.Dataset: - """Equivalent to ``xr.open_dataset(..., engine="kerchunk")`` but without depending on - kerchunk library - """ - m = fsspec.filesystem( - "reference", fo=filename_or_obj, **(storage_options or {}) - ).get_mapper() - return xr.open_dataset(m, engine="zarr", consolidated=False, **kwargs) - - def in_memory_icechunk_session(): from icechunk import Repository, Storage diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 41bf9c2d..e4157b73 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -15,7 +15,6 @@ from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import ( has_astropy, - open_dataset_kerchunk, parametrize_over_hdf_backends, requires_hdf5plugin, requires_imagecodecs, @@ -322,7 +321,7 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend): ) tmpref = "/tmp/cmip6.json" vds.virtualize.to_kerchunk(tmpref, format="json") - dsV = open_dataset_kerchunk(tmpref) + dsV = xr.open_dataset(tmpref, engine="kerchunk") # xrt.assert_identical(dsXR, dsV) #Attribute order changes xrt.assert_equal(dsXR, dsV) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index b045a330..5569e859 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -9,10 +9,10 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import ( + has_fastparquet, has_icechunk, has_kerchunk, in_memory_icechunk_session, - open_dataset_kerchunk, parametrize_over_hdf_backends, requires_kerchunk, requires_zarr_python, @@ -97,7 +97,7 @@ def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs): ds_refs = vds.virtualize.to_kerchunk(format="dict") # use fsspec to read the dataset from the kerchunk references dict - return open_dataset_kerchunk(ds_refs, **kwargs) + return xr.open_dataset(ds_refs, engine="kerchunk", **kwargs) def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs): @@ -105,7 +105,7 @@ def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs): vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json") # use fsspec to read the dataset from disk via the kerchunk references - return open_dataset_kerchunk(f"{tmpdir}/refs.json", **kwargs) + return xr.open_dataset(f"{tmpdir}/refs.json", engine="kerchunk", **kwargs) def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs): @@ -113,7 +113,7 @@ def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs): 
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.parquet", format="parquet") # use fsspec to read the dataset from disk via the kerchunk references - return open_dataset_kerchunk(f"{tmpdir}/refs.parquet", **kwargs) + return xr.open_dataset(f"{tmpdir}/refs.parquet", engine="kerchunk", **kwargs) def roundtrip_as_in_memory_icechunk(vds: xr.Dataset, tmpdir, **kwargs): @@ -132,9 +132,12 @@ def roundtrip_as_in_memory_icechunk(vds: xr.Dataset, tmpdir, **kwargs): @pytest.mark.parametrize( "roundtrip_func", [ - roundtrip_as_kerchunk_dict, - roundtrip_as_kerchunk_json, - *([roundtrip_as_kerchunk_parquet] if has_kerchunk else []), + *( + [roundtrip_as_kerchunk_dict, roundtrip_as_kerchunk_json] + if has_kerchunk + else [] + ), + *([roundtrip_as_kerchunk_parquet] if has_kerchunk and has_fastparquet else []), *([roundtrip_as_in_memory_icechunk] if has_icechunk else []), ], ) diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py index e43ae1f4..7496a968 100644 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py @@ -5,12 +5,13 @@ import virtualizarr from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import ( - open_dataset_kerchunk, requires_hdf5plugin, requires_imagecodecs, + requires_kerchunk, ) +@requires_kerchunk @requires_hdf5plugin @requires_imagecodecs class TestIntegration: @@ -29,7 +30,7 @@ def test_filters_h5netcdf_roundtrip( ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = open_dataset_kerchunk(kerchunk_file, decode_times=True) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) xrt.assert_allclose(ds, roundtrip) @pytest.mark.xfail( @@ -43,7 +44,7 @@ def test_filters_netcdf4_roundtrip( vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="dict") - roundtrip = open_dataset_kerchunk(kerchunk_file) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): @@ -53,5 +54,5 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file ) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = open_dataset_kerchunk(kerchunk_file) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_allclose(ds, roundtrip) diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index 89c6ba31..83f7999d 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -7,7 +7,7 @@ from virtualizarr.backend import open_virtual_dataset from virtualizarr.manifests import ManifestArray -from virtualizarr.tests import requires_kerchunk +from virtualizarr.tests import has_fastparquet, requires_kerchunk def gen_ds_refs( @@ -177,7 +177,7 @@ def test_handle_relative_paths(refs_file_factory): @requires_kerchunk @pytest.mark.parametrize( "reference_format", - ["json", "parquet", "invalid"], + ["json", "invalid", *(["parquet"] if has_fastparquet else [])], ) def test_open_virtual_dataset_existing_kerchunk_refs( tmp_path, netcdf4_virtual_dataset, reference_format diff --git 
a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py index 8cc7f825..1e9b240c 100644 --- a/virtualizarr/tests/test_writers/test_kerchunk.py +++ b/virtualizarr/tests/test_writers/test_kerchunk.py @@ -3,7 +3,7 @@ from xarray import Dataset from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.tests import requires_kerchunk +from virtualizarr.tests import requires_fastparquet, requires_kerchunk @requires_kerchunk @@ -108,6 +108,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path): } assert loaded_refs == expected_ds_refs + @requires_fastparquet def test_accessor_to_kerchunk_parquet(self, tmp_path): import ujson From 42414f06cd11e34f9a8730ca02430ef17b9f34f8 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 30 Jan 2025 12:23:30 -0500 Subject: [PATCH 07/21] Remove icechunk roundtripping --- virtualizarr/tests/__init__.py | 8 -------- virtualizarr/tests/test_integration.py | 15 --------------- 2 files changed, 23 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 0b9c416e..258a9112 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -107,11 +107,3 @@ def offset_from_chunk_key(ind: tuple[int, ...]) -> int: def length_from_chunk_key(ind: tuple[int, ...]) -> int: return sum(ind) + 5 - - -def in_memory_icechunk_session(): - from icechunk import Repository, Storage - - repo = Repository.create(storage=Storage.new_in_memory()) - session = repo.writable_session("main") - return session diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 5569e859..14cc8d3d 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -10,9 +10,7 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import ( has_fastparquet, - has_icechunk, has_kerchunk, - in_memory_icechunk_session, parametrize_over_hdf_backends, requires_kerchunk, requires_zarr_python, @@ -116,18 +114,6 @@ def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs): return xr.open_dataset(f"{tmpdir}/refs.parquet", engine="kerchunk", **kwargs) -def roundtrip_as_in_memory_icechunk(vds: xr.Dataset, tmpdir, **kwargs): - # write those references to an in-memory icechunk store - icechunk_session = in_memory_icechunk_session() - vds.virtualize.to_icechunk(icechunk_session.store) - icechunk_session.commit("add data") - - # read the dataset from icechunk - return xr.open_zarr( - icechunk_session.store, zarr_format=3, consolidated=False, **kwargs - ) - - @requires_zarr_python @pytest.mark.parametrize( "roundtrip_func", @@ -138,7 +124,6 @@ def roundtrip_as_in_memory_icechunk(vds: xr.Dataset, tmpdir, **kwargs): else [] ), *([roundtrip_as_kerchunk_parquet] if has_kerchunk and has_fastparquet else []), - *([roundtrip_as_in_memory_icechunk] if has_icechunk else []), ], ) class TestRoundtrip: From ff659b9476f23a638a359af00ef9c93386de9de1 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 15:41:56 -0800 Subject: [PATCH 08/21] Fixed some warnings --- .github/workflows/main.yml | 12 ++++----- .github/workflows/min-deps.yml | 12 ++++----- .github/workflows/upstream.yml | 12 ++++----- ci/upstream.yml | 2 +- conftest.py | 8 ++++-- docs/contributing.md | 12 +++++---- virtualizarr/readers/common.py | 47 +++++++++++++++++----------------- 7 files changed, 55 insertions(+), 50 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 
38fb4e49..fd782dd4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,13 +2,13 @@ name: CI on: push: - branches: [ "main" ] + branches: ["main"] paths-ignore: - - 'docs/**' + - "docs/**" pull_request: - branches: [ "main" ] + branches: ["main"] paths-ignore: - - 'docs/**' + - "docs/**" schedule: - cron: "0 0 * * *" @@ -42,7 +42,7 @@ jobs: - name: Install virtualizarr run: | - python -m pip install -e . --no-deps + python -m pip install -e . --no-deps - name: Conda list information run: | @@ -51,7 +51,7 @@ jobs: - name: Running Tests run: | - python -m pytest --run-network-tests --verbose --cov=virtualizarr --cov-report=xml + python -m pytest --run-network-tests --verbose --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 diff --git a/.github/workflows/min-deps.yml b/.github/workflows/min-deps.yml index c236a9ef..9d8568cb 100644 --- a/.github/workflows/min-deps.yml +++ b/.github/workflows/min-deps.yml @@ -2,13 +2,13 @@ name: min-deps on: push: - branches: [ "main" ] + branches: ["main"] paths-ignore: - - 'docs/**' + - "docs/**" pull_request: - branches: [ "main" ] + branches: ["main"] paths-ignore: - - 'docs/**' + - "docs/**" schedule: - cron: "0 0 * * *" @@ -42,7 +42,7 @@ jobs: - name: Install virtualizarr run: | - python -m pip install -e . --no-deps + python -m pip install -e . --no-deps - name: Conda list information run: | @@ -51,7 +51,7 @@ jobs: - name: Running Tests run: | - python -m pytest --verbose --cov=virtualizarr --cov-report=xml + python -m pytest --verbose --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 74867ea5..65b303b5 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -2,13 +2,13 @@ name: upstream on: push: - branches: [ "main" ] + branches: ["main"] paths-ignore: - - 'docs/**' + - "docs/**" pull_request: - branches: [ "main" ] + branches: ["main"] paths-ignore: - - 'docs/**' + - "docs/**" schedule: - cron: "0 0 * * *" @@ -42,7 +42,7 @@ jobs: - name: Install virtualizarr run: | - python -m pip install -e . --no-deps + python -m pip install -e . 
--no-deps - name: Conda list information run: | @@ -51,7 +51,7 @@ jobs: - name: Running Tests run: | - python -m pytest --verbose --cov=virtualizarr --cov-report=xml + python -m pytest --verbose --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 diff --git a/ci/upstream.yml b/ci/upstream.yml index 97762961..74f944d1 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -29,6 +29,6 @@ dependencies: - fsspec - pip - pip: - - icechunk==0.1.0a12 # Installs python-zarr v3 as dependency + - icechunk>=0.1.0a12 # Installs python-zarr v3 as dependency - git+https://github.com/fsspec/kerchunk.git@main - imagecodecs-numcodecs==2024.6.1 diff --git a/conftest.py b/conftest.py index 0781c37e..e9fe050b 100644 --- a/conftest.py +++ b/conftest.py @@ -43,7 +43,9 @@ def netcdf4_file(tmp_path: Path) -> str: # Set up example xarray dataset with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) - ds.to_netcdf(filepath, format="NETCDF4") + ds.to_netcdf( + filepath, format="NETCDF4" + ) # , encoding={"air": {"dtype": "float32"}}) return str(filepath) @@ -63,7 +65,9 @@ def netcdf4_file_with_data_in_multiple_groups(tmp_path: Path) -> str: @pytest.fixture def netcdf4_files_factory(tmp_path: Path) -> Callable: def create_netcdf4_files( - encoding: Optional[Mapping[str, Mapping[str, Any]]] = None, + encoding: Optional[Mapping[str, Mapping[str, Any]]] = { + "air": {"dtype": "float32"} + }, # Store as float32 to address SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs ) -> tuple[str, str]: filepath1 = tmp_path / "air1.nc" filepath2 = tmp_path / "air2.nc" diff --git a/docs/contributing.md b/docs/contributing.md index 45fc8599..4c9d60fa 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -16,7 +16,7 @@ python -m pytest ``` You may also add the `--run-network-tests` option, which will run additional tests -that require downloading files over the network. Skip this if you want the tests to run +that require downloading files over the network. Skip this if you want the tests to run faster or you have no internet access: ```bash @@ -24,13 +24,15 @@ python -m pytest --run-network-tests ``` Further, the `pytest-cov` plugin is a test dependency, so you can generate a test -coverage report locally, if you wish (CI will automatically do so). Here are some +coverage report locally, if you wish (CI will automatically do so). Here are some examples: +Note: `--cov` should not be needed since `include = ["virtualizarr/"]` is set in pyproject.toml `[tool.coverage.run]`. + ```bash -python -m pytest --cov=. # Terminal (text) report (--cov=term) -python -m pytest --cov=. --cov=term-missing # Terminal report showing missing coverage -python -m pytest --cov=. --cov=html # HTML report written to htmlcov/index.html +python -m pytest # Terminal (text) report (--cov=term) +python -m pytest --cov=term-missing # Terminal report showing missing coverage +python -m pytest --cov=html # HTML report written to htmlcov/index.html ``` To see all available `pytest` options added by the `pytest-cov` plugin, run diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 0a7ad36e..0bbccabc 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -50,37 +50,36 @@ def maybe_open_loadable_vars_and_indexes( ).open_file() # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. 
- ds = open_dataset( + with open_dataset( fpath, # type: ignore[arg-type] drop_variables=drop_variables, group=group, decode_times=decode_times, - ) + ) as ds: + if indexes is None: + # add default indexes by reading data from file + # but avoid creating an in-memory index for virtual variables by default + indexes = { + name: index + for name, index in ds.xindexes.items() + if name in loadable_variables + } + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation - if indexes is None: - # add default indexes by reading data from file - # but avoid creating an in-memory index for virtual variables by default - indexes = { - name: index - for name, index in ds.xindexes.items() + # TODO we should drop these earlier by using drop_variables + loadable_vars = { + str(name): var + for name, var in ds.variables.items() if name in loadable_variables } - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - # TODO we should drop these earlier by using drop_variables - loadable_vars = { - str(name): var - for name, var in ds.variables.items() - if name in loadable_variables - } - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() + + # # if we only read the indexes we can just close the file right away as nothing is lazy + # if loadable_vars == {}: + # ds.close() return loadable_vars, indexes From b3ed8fff4b4d1424e8803f2d960e640b7b575473 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 15:51:29 -0800 Subject: [PATCH 09/21] Fixed codec test --- virtualizarr/tests/test_codecs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_codecs.py b/virtualizarr/tests/test_codecs.py index 41d16b5b..23cb494e 100644 --- a/virtualizarr/tests/test_codecs.py +++ b/virtualizarr/tests/test_codecs.py @@ -58,7 +58,9 @@ def test_manifest_array_zarr_v2_normalized(self): # Get codecs and verify actual_codecs = get_codecs(manifest_array, normalize_to_zarr_v3=True) - expected_codecs = manifest_array.zarray._v3_codec_pipeline() + expected_codecs = ( + manifest_array.zarray.serializer(), + ) + manifest_array.zarray._v3_codec_pipeline() assert actual_codecs == expected_codecs @requires_zarr_python_v3 From 57be796569d7baae97c16e096f6a5601405d32fb Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 15:58:45 -0800 Subject: [PATCH 10/21] Fix warnings in test_backend.py --- conftest.py | 9 +++++++-- virtualizarr/tests/test_backend.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index e9fe050b..eb2eb987 100644 --- a/conftest.py +++ b/conftest.py @@ -44,7 +44,7 @@ def netcdf4_file(tmp_path: Path) -> str: with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) ds.to_netcdf( - filepath, format="NETCDF4" + filepath, format="NETCDF4", encoding={"air": {"dtype": "float32"}} ) # , encoding={"air": {"dtype": "float32"}}) return str(filepath) @@ -117,7 +117,12 @@ def hdf5_groups_file(tmp_path: Path) -> str: # Set up example xarray dataset with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) - ds.to_netcdf(filepath, format="NETCDF4", group="test/group") + ds.to_netcdf( + filepath, + 
format="NETCDF4", + group="test/group", + encoding={"air": {"dtype": "float32"}}, + ) return str(filepath) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index e4157b73..4e55c09e 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -152,7 +152,7 @@ def test_cftime_index(tmpdir, hdf_backend): # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) assert list(ds.coords) == list(vds.coords) - assert vds.dims == ds.dims + assert vds.sizes == ds.sizes assert vds.attrs == ds.attrs From deb15bae14c6d1324d0e70f6e5c7b8bb6f4f8775 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 16:28:12 -0800 Subject: [PATCH 11/21] Tests passing --- ci/upstream.yml | 2 +- conftest.py | 31 +++++++++++++++---- .../tests/test_writers/test_icechunk.py | 18 +++++++---- virtualizarr/tests/test_writers/test_zarr.py | 2 +- 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index 74f944d1..2a8c1da3 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -3,7 +3,7 @@ channels: - conda-forge - nodefaults dependencies: - - xarray>=2024.10.0,<2025.0.0 + - xarray>=2025.1.1 - h5netcdf - h5py - hdf5 diff --git a/conftest.py b/conftest.py index eb2eb987..0c213817 100644 --- a/conftest.py +++ b/conftest.py @@ -37,15 +37,31 @@ def empty_netcdf4_file(tmp_path: Path) -> str: @pytest.fixture -def netcdf4_file(tmp_path: Path) -> str: +def netcdf4_file_with_scale(tmp_path: Path) -> str: filepath = tmp_path / "air.nc" # Set up example xarray dataset with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) ds.to_netcdf( - filepath, format="NETCDF4", encoding={"air": {"dtype": "float32"}} - ) # , encoding={"air": {"dtype": "float32"}}) + filepath, + format="NETCDF4", + encoding={"air": {"dtype": "float32", "scale_factor": 0.01}}, + ) + + return str(filepath) + + +@pytest.fixture +def netcdf4_file(tmp_path: Path) -> str: + filepath = tmp_path / "air.nc" + + # Set up example xarray dataset + with xr.tutorial.open_dataset("air_temperature") as ds: + # Save it to disk as netCDF (in temporary directory) + air_encoding = ds["air"].encoding + air_encoding["_FillValue"] = -9999 + ds.to_netcdf(filepath, format="NETCDF4", encoding={"air": air_encoding}) return str(filepath) @@ -65,14 +81,17 @@ def netcdf4_file_with_data_in_multiple_groups(tmp_path: Path) -> str: @pytest.fixture def netcdf4_files_factory(tmp_path: Path) -> Callable: def create_netcdf4_files( - encoding: Optional[Mapping[str, Mapping[str, Any]]] = { - "air": {"dtype": "float32"} - }, # Store as float32 to address SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs + encoding: Optional[ + Mapping[str, Mapping[str, Any]] + ] = None, # Store as float32 to address SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs ) -> tuple[str, str]: filepath1 = tmp_path / "air1.nc" filepath2 = tmp_path / "air2.nc" with xr.tutorial.open_dataset("air_temperature") as ds: + if encoding is None: + encoding = {"air": ds["air"].encoding} + encoding["air"]["_FillValue"] = -9999 # Split dataset into two parts ds1 = ds.isel(time=slice(None, 1460)) ds2 = ds.isel(time=slice(1460, None)) diff --git 
a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index b5f2048e..294ff649 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -143,22 +143,28 @@ def test_set_single_virtual_ref_without_encoding( def test_set_single_virtual_ref_with_encoding( - icechunk_filestore: "IcechunkStore", netcdf4_file: Path + icechunk_filestore: "IcechunkStore", netcdf4_file_with_scale: Path ): import xarray.testing as xrt - with xr.open_dataset(netcdf4_file) as ds: + with xr.open_dataset(netcdf4_file_with_scale) as ds: # We drop the coordinates because we don't have them in the zarr test case expected_ds = ds.drop_vars(["lon", "lat", "time"]) # instead, for now just write out byte ranges explicitly manifest = ChunkManifest( - {"0.0.0": {"path": netcdf4_file, "offset": 15419, "length": 7738000}} + { + "0.0.0": { + "path": netcdf4_file_with_scale, + "offset": 15419, + "length": 15476000, + } + } ) zarray = ZArray( shape=(2920, 25, 53), chunks=(2920, 25, 53), - dtype=np.dtype("int16"), + dtype=np.dtype("float32"), compressor=None, filters=None, fill_value=None, @@ -170,7 +176,7 @@ def test_set_single_virtual_ref_with_encoding( air = xr.Variable( data=ma, dims=["time", "lat", "lon"], - encoding={"scale_factor": 0.01}, + encoding={"scale_factor": 0.01, "dtype": "float32"}, attrs=expected_ds["air"].attrs, ) vds = xr.Dataset({"air": air}, attrs=expected_ds.attrs) @@ -184,7 +190,7 @@ def test_set_single_virtual_ref_with_encoding( # check array metadata assert air_array.shape == (2920, 25, 53) assert air_array.chunks == (2920, 25, 53) - assert air_array.dtype == np.dtype("int16") + assert air_array.dtype == np.dtype("float32") assert air_array.attrs["scale_factor"] == 0.01 # check chunk references diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py index 5afa87a3..9ca281cb 100644 --- a/virtualizarr/tests/test_writers/test_zarr.py +++ b/virtualizarr/tests/test_writers/test_zarr.py @@ -42,7 +42,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset) assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) assert ( isinstance(metadata["codecs"], list) - and len(metadata["codecs"]) > 1 + and len(metadata["codecs"]) == 1 and all(isconfigurable(codec) for codec in metadata["codecs"]) ) From cecc151a21b29973f54cfcda6a3bea972c62b216 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 16:36:35 -0800 Subject: [PATCH 12/21] Remove obsolete comment --- conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 0c213817..9d2aeaec 100644 --- a/conftest.py +++ b/conftest.py @@ -81,9 +81,7 @@ def netcdf4_file_with_data_in_multiple_groups(tmp_path: Path) -> str: @pytest.fixture def netcdf4_files_factory(tmp_path: Path) -> Callable: def create_netcdf4_files( - encoding: Optional[ - Mapping[str, Mapping[str, Any]] - ] = None, # Store as float32 to address SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs + encoding: Optional[Mapping[str, Mapping[str, Any]]] = None, ) -> tuple[str, str]: filepath1 = tmp_path / "air1.nc" filepath2 = tmp_path / "air2.nc" From 6d062918bb8685fe075399fbd2ad23d03e123051 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 17:57:48 -0800 Subject: [PATCH 13/21] Add fill value to fixture --- conftest.py | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 9d2aeaec..2a4c7110 100644 --- a/conftest.py +++ b/conftest.py @@ -134,11 +134,13 @@ def hdf5_groups_file(tmp_path: Path) -> str: # Set up example xarray dataset with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) + air_encoding = ds["air"].encoding + air_encoding["_FillValue"] = -9999 ds.to_netcdf( filepath, format="NETCDF4", group="test/group", - encoding={"air": {"dtype": "float32"}}, + encoding={"air": air_encoding}, ) return str(filepath) From 4a2f294956210b4d215c51e51dac6d05b88effdb Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 17:58:06 -0800 Subject: [PATCH 14/21] Remove obsolete conditional to ds.close() --- virtualizarr/readers/common.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 0bbccabc..3c0d91fa 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -77,10 +77,6 @@ def maybe_open_loadable_vars_and_indexes( if name in loadable_variables } - # # if we only read the indexes we can just close the file right away as nothing is lazy - # if loadable_vars == {}: - # ds.close() - return loadable_vars, indexes From aa55a39095f5391010316f6da2d7c0563fb01164 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 17:58:53 -0800 Subject: [PATCH 15/21] Reset workflows with --cov --- .github/workflows/main.yml | 12 ++++++------ .github/workflows/min-deps.yml | 12 ++++++------ .github/workflows/upstream.yml | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fd782dd4..38fb4e49 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,13 +2,13 @@ name: CI on: push: - branches: ["main"] + branches: [ "main" ] paths-ignore: - - "docs/**" + - 'docs/**' pull_request: - branches: ["main"] + branches: [ "main" ] paths-ignore: - - "docs/**" + - 'docs/**' schedule: - cron: "0 0 * * *" @@ -42,7 +42,7 @@ jobs: - name: Install virtualizarr run: | - python -m pip install -e . --no-deps + python -m pip install -e . --no-deps - name: Conda list information run: | @@ -51,7 +51,7 @@ jobs: - name: Running Tests run: | - python -m pytest --run-network-tests --verbose --cov-report=xml + python -m pytest --run-network-tests --verbose --cov=virtualizarr --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 diff --git a/.github/workflows/min-deps.yml b/.github/workflows/min-deps.yml index 9d8568cb..c236a9ef 100644 --- a/.github/workflows/min-deps.yml +++ b/.github/workflows/min-deps.yml @@ -2,13 +2,13 @@ name: min-deps on: push: - branches: ["main"] + branches: [ "main" ] paths-ignore: - - "docs/**" + - 'docs/**' pull_request: - branches: ["main"] + branches: [ "main" ] paths-ignore: - - "docs/**" + - 'docs/**' schedule: - cron: "0 0 * * *" @@ -42,7 +42,7 @@ jobs: - name: Install virtualizarr run: | - python -m pip install -e . --no-deps + python -m pip install -e . 
--no-deps - name: Conda list information run: | @@ -51,7 +51,7 @@ jobs: - name: Running Tests run: | - python -m pytest --verbose --cov-report=xml + python -m pytest --verbose --cov=virtualizarr --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 65b303b5..74867ea5 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -2,13 +2,13 @@ name: upstream on: push: - branches: ["main"] + branches: [ "main" ] paths-ignore: - - "docs/**" + - 'docs/**' pull_request: - branches: ["main"] + branches: [ "main" ] paths-ignore: - - "docs/**" + - 'docs/**' schedule: - cron: "0 0 * * *" @@ -42,7 +42,7 @@ jobs: - name: Install virtualizarr run: | - python -m pip install -e . --no-deps + python -m pip install -e . --no-deps - name: Conda list information run: | @@ -51,7 +51,7 @@ jobs: - name: Running Tests run: | - python -m pytest --verbose --cov-report=xml + python -m pytest --verbose --cov=virtualizarr --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 From 83379ee9d971147cc0c6bf63e3c4f43a5c9f0c10 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 17:59:43 -0800 Subject: [PATCH 16/21] Reset conftest.py fixtures (air encoding) --- conftest.py | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/conftest.py b/conftest.py index 2a4c7110..0781c37e 100644 --- a/conftest.py +++ b/conftest.py @@ -36,22 +36,6 @@ def empty_netcdf4_file(tmp_path: Path) -> str: return str(filepath) -@pytest.fixture -def netcdf4_file_with_scale(tmp_path: Path) -> str: - filepath = tmp_path / "air.nc" - - # Set up example xarray dataset - with xr.tutorial.open_dataset("air_temperature") as ds: - # Save it to disk as netCDF (in temporary directory) - ds.to_netcdf( - filepath, - format="NETCDF4", - encoding={"air": {"dtype": "float32", "scale_factor": 0.01}}, - ) - - return str(filepath) - - @pytest.fixture def netcdf4_file(tmp_path: Path) -> str: filepath = tmp_path / "air.nc" @@ -59,9 +43,7 @@ def netcdf4_file(tmp_path: Path) -> str: # Set up example xarray dataset with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) - air_encoding = ds["air"].encoding - air_encoding["_FillValue"] = -9999 - ds.to_netcdf(filepath, format="NETCDF4", encoding={"air": air_encoding}) + ds.to_netcdf(filepath, format="NETCDF4") return str(filepath) @@ -87,9 +69,6 @@ def create_netcdf4_files( filepath2 = tmp_path / "air2.nc" with xr.tutorial.open_dataset("air_temperature") as ds: - if encoding is None: - encoding = {"air": ds["air"].encoding} - encoding["air"]["_FillValue"] = -9999 # Split dataset into two parts ds1 = ds.isel(time=slice(None, 1460)) ds2 = ds.isel(time=slice(1460, None)) @@ -134,14 +113,7 @@ def hdf5_groups_file(tmp_path: Path) -> str: # Set up example xarray dataset with xr.tutorial.open_dataset("air_temperature") as ds: # Save it to disk as netCDF (in temporary directory) - air_encoding = ds["air"].encoding - air_encoding["_FillValue"] = -9999 - ds.to_netcdf( - filepath, - format="NETCDF4", - group="test/group", - encoding={"air": air_encoding}, - ) + ds.to_netcdf(filepath, format="NETCDF4", group="test/group") return str(filepath) From 7786421d1b6d4129886e8759cb5db9fbfb27e1b8 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 18:01:34 -0800 Subject: [PATCH 17/21] Reset contributing (--cov) removed ---
docs/contributing.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/contributing.md b/docs/contributing.md index 4c9d60fa..45fc8599 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -16,7 +16,7 @@ python -m pytest ``` You may also add the `--run-network-tests` option, which will run additional tests -that require downloading files over the network. Skip this if you want the tests to run +that require downloading files over the network. Skip this if you want the tests to run faster or you have no internet access: ```bash @@ -24,15 +24,13 @@ python -m pytest --run-network-tests ``` Further, the `pytest-cov` plugin is a test dependency, so you can generate a test -coverage report locally, if you wish (CI will automatically do so). Here are some +coverage report locally, if you wish (CI will automatically do so). Here are some examples: -Note: `--cov` should not be needed since `include = ["virtualizarr/"]` is set in pyproject.toml `[tool.coverage.run]`. - ```bash -python -m pytest # Terminal (text) report (--cov=term) -python -m pytest --cov=term-missing # Terminal report showing missing coverage -python -m pytest --cov=html # HTML report written to htmlcov/index.html +python -m pytest --cov=. # Terminal (text) report (--cov=term) +python -m pytest --cov=. --cov=term-missing # Terminal report showing missing coverage +python -m pytest --cov=. --cov=html # HTML report written to htmlcov/index.html ``` To see all available `pytest` options added by the `pytest-cov` plugin, run From 45eb86bc4246e2a35f2a2570940395cc7edecc65 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 18:02:10 -0800 Subject: [PATCH 18/21] Remove context manager from readers/common.py --- virtualizarr/readers/common.py | 43 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 3c0d91fa..0a7ad36e 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -50,32 +50,37 @@ def maybe_open_loadable_vars_and_indexes( ).open_file() # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
- with open_dataset( + ds = open_dataset( fpath, # type: ignore[arg-type] drop_variables=drop_variables, group=group, decode_times=decode_times, - ) as ds: - if indexes is None: - # add default indexes by reading data from file - # but avoid creating an in-memory index for virtual variables by default - indexes = { - name: index - for name, index in ds.xindexes.items() - if name in loadable_variables - } - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation + ) - # TODO we should drop these earlier by using drop_variables - loadable_vars = { - str(name): var - for name, var in ds.variables.items() + if indexes is None: + # add default indexes by reading data from file + # but avoid creating an in-memory index for virtual variables by default + indexes = { + name: index + for name, index in ds.xindexes.items() if name in loadable_variables } + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + # TODO we should drop these earlier by using drop_variables + loadable_vars = { + str(name): var + for name, var in ds.variables.items() + if name in loadable_variables + } + + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() return loadable_vars, indexes From 0babb6510294d014de08ef07f49a2cedc2cf8510 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 18:02:43 -0800 Subject: [PATCH 19/21] Reset test_backend with ds.dims --- virtualizarr/tests/test_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 4e55c09e..e4157b73 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -152,7 +152,7 @@ def test_cftime_index(tmpdir, hdf_backend): # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) assert list(ds.coords) == list(vds.coords) - assert vds.sizes == ds.sizes + assert vds.dims == ds.dims assert vds.attrs == ds.attrs From dc4ed8b40424669a2242c07b1896196fbf66022f Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Thu, 30 Jan 2025 18:03:36 -0800 Subject: [PATCH 20/21] Reset test_icechunk (air encoding) --- .../tests/test_writers/test_icechunk.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index 294ff649..b5f2048e 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -143,28 +143,22 @@ def test_set_single_virtual_ref_with_encoding( def test_set_single_virtual_ref_with_encoding( - icechunk_filestore: "IcechunkStore", netcdf4_file_with_scale: Path + icechunk_filestore: "IcechunkStore", netcdf4_file: Path ): import xarray.testing as xrt - with xr.open_dataset(netcdf4_file_with_scale) as ds: + with xr.open_dataset(netcdf4_file) as ds: # We drop the coordinates because we don't have them in the zarr test case expected_ds = ds.drop_vars(["lon", "lat", "time"]) # instead, for now just write out byte ranges explicitly manifest = ChunkManifest( - { - "0.0.0":
{ - "path": netcdf4_file_with_scale, - "offset": 15419, - "length": 15476000, - } - } + {"0.0.0": {"path": netcdf4_file, "offset": 15419, "length": 7738000}} ) zarray = ZArray( shape=(2920, 25, 53), chunks=(2920, 25, 53), - dtype=np.dtype("float32"), + dtype=np.dtype("int16"), compressor=None, filters=None, fill_value=None, @@ -176,7 +170,7 @@ def test_set_single_virtual_ref_with_encoding( air = xr.Variable( data=ma, dims=["time", "lat", "lon"], - encoding={"scale_factor": 0.01, "dtype": "float32"}, + encoding={"scale_factor": 0.01}, attrs=expected_ds["air"].attrs, ) vds = xr.Dataset({"air": air}, attrs=expected_ds.attrs) @@ -190,7 +184,7 @@ def test_set_single_virtual_ref_with_encoding( # check array metadata assert air_array.shape == (2920, 25, 53) assert air_array.chunks == (2920, 25, 53) - assert air_array.dtype == np.dtype("float32") + assert air_array.dtype == np.dtype("int16") assert air_array.attrs["scale_factor"] == 0.01 # check chunk references From 76dcfb7ee9ba9df7f132734241696546a0c59f73 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 31 Jan 2025 13:57:05 -0500 Subject: [PATCH 21/21] Fix change that snuck in on #395 --- .../tests/test_readers/test_hdf/test_hdf_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py index 7496a968..d83fb91e 100644 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py @@ -43,7 +43,7 @@ def test_filters_netcdf4_roundtrip( ds = xr.open_dataset(filepath) vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend) kerchunk_file = f"{tmpdir}/kerchunk.json" - vds.virtualize.to_kerchunk(kerchunk_file, format="dict") + vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip)