diff --git a/ci/upstream.yml b/ci/upstream.yml
index ed9bf6fa..2a8c1da3 100644
--- a/ci/upstream.yml
+++ b/ci/upstream.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - xarray>=2024.10.0,<2025.0.0
+  - xarray>=2025.1.1
   - h5netcdf
   - h5py
   - hdf5
@@ -29,6 +29,6 @@ dependencies:
   - fsspec
   - pip
   - pip:
-    - icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency
-    # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
+    - icechunk>=0.1.0a12 # Installs python-zarr v3 as dependency
+    - git+https://github.com/fsspec/kerchunk.git@main
     - imagecodecs-numcodecs==2024.6.1
diff --git a/pyproject.toml b/pyproject.toml
index 8421819e..54d06b91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ classifiers = [
 requires-python = ">=3.10"
 dynamic = ["version"]
 dependencies = [
-    "xarray>=2024.10.0,<2025.0.0",
+    "xarray>=2025.1.1",
     "numpy>=2.0.0",
     "packaging",
     "universal-pathlib",
@@ -39,7 +39,7 @@ hdf_reader = [
     "numcodecs"
 ]
 icechunk = [
-    "icechunk==0.1.0a8",
+    "icechunk>=0.1.0a12",
 ]
 test = [
     "codecov",
diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
index ad2a3d9b..94f6d1aa 100644
--- a/virtualizarr/codecs.py
+++ b/virtualizarr/codecs.py
@@ -55,7 +55,7 @@ def _get_manifestarray_codecs(
 ) -> Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]:
     """Get codecs for a ManifestArray based on its zarr_format."""
     if normalize_to_zarr_v3 or array.zarray.zarr_format == 3:
-        return array.zarray._v3_codec_pipeline()
+        return (array.zarray.serializer(),) + array.zarray._v3_codec_pipeline()
     elif array.zarray.zarr_format == 2:
         return array.zarray.codec
     else:
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index f38d5c2c..258a9112 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -35,7 +35,9 @@ def _importorskip(
 
 
 has_astropy, requires_astropy = _importorskip("astropy")
+has_icechunk, requires_icechunk = _importorskip("icechunk")
 has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
+has_fastparquet, requires_fastparquet = _importorskip("fastparquet")
 has_s3fs, requires_s3fs = _importorskip("s3fs")
 has_scipy, requires_scipy = _importorskip("scipy")
 has_tifffile, requires_tifffile = _importorskip("tifffile")
diff --git a/virtualizarr/tests/test_codecs.py b/virtualizarr/tests/test_codecs.py
index 41d16b5b..23cb494e 100644
--- a/virtualizarr/tests/test_codecs.py
+++ b/virtualizarr/tests/test_codecs.py
@@ -58,7 +58,9 @@ def test_manifest_array_zarr_v2_normalized(self):
 
         # Get codecs and verify
         actual_codecs = get_codecs(manifest_array, normalize_to_zarr_v3=True)
-        expected_codecs = manifest_array.zarray._v3_codec_pipeline()
+        expected_codecs = (
+            manifest_array.zarray.serializer(),
+        ) + manifest_array.zarray._v3_codec_pipeline()
         assert actual_codecs == expected_codecs
 
     @requires_zarr_python_v3
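Context for the `serializer()`/`_v3_codec_pipeline()` split above: in the zarr v3 data model, exactly one `ArrayBytesCodec` (the "serializer") converts array chunks to bytes, and any `BytesBytesCodec`s (compressors) run after it. A minimal sketch of that ordering against zarr-python v3's public codec classes (illustrative, not part of this patch):

```python
# Zarr v3 codec model: one ArrayBytesCodec ("serializer") followed by
# zero or more BytesBytesCodecs ("compressors").
from zarr.codecs import BloscCodec, BytesCodec

serializer = BytesCodec(endian="little")   # ArrayBytesCodec: array -> bytes
compressors = (BloscCodec(cname="zstd"),)  # BytesBytesCodec(s): bytes -> bytes

# get_codecs(..., normalize_to_zarr_v3=True) now yields the serializer first,
# mirroring this ordering:
codecs = (serializer, *compressors)
print([type(c).__name__ for c in codecs])  # ['BytesCodec', 'BloscCodec']
```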
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 95be3de8..14cc8d3d 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -8,7 +8,13 @@
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
+from virtualizarr.tests import (
+    has_fastparquet,
+    has_kerchunk,
+    parametrize_over_hdf_backends,
+    requires_kerchunk,
+    requires_zarr_python,
+)
 from virtualizarr.translators.kerchunk import (
     dataset_from_kerchunk_refs,
 )
@@ -34,16 +40,16 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
         ),
         chunkmanifest=manifest,
     )
-    ds = xr.Dataset({"a": (["x", "y"], marr)})
+    vds = xr.Dataset({"a": (["x", "y"], marr)})
 
     # Use accessor to write it out to kerchunk reference dict
-    ds_refs = ds.virtualize.to_kerchunk(format="dict")
+    ds_refs = vds.virtualize.to_kerchunk(format="dict")
 
     # Use dataset_from_kerchunk_refs to reconstruct the dataset
     roundtrip = dataset_from_kerchunk_refs(ds_refs)
 
     # Assert equal to original dataset
-    xrt.assert_equal(roundtrip, ds)
+    xrt.assert_equal(roundtrip, vds)
 
 
 @requires_kerchunk
@@ -84,11 +90,45 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
     assert refs["refs"]["time/0"] == expected["refs"]["time/0"]
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
-class TestKerchunkRoundtrip:
+def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs):
+    # write those references to an in-memory kerchunk-formatted references dictionary
+    ds_refs = vds.virtualize.to_kerchunk(format="dict")
+
+    # use fsspec to read the dataset from the kerchunk references dict
+    return xr.open_dataset(ds_refs, engine="kerchunk", **kwargs)
+
+
+def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs):
+    # write those references to disk as kerchunk references format
+    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")
+
+    # use fsspec to read the dataset from disk via the kerchunk references
+    return xr.open_dataset(f"{tmpdir}/refs.json", engine="kerchunk", **kwargs)
+
+
+def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs):
+    # write those references to disk as kerchunk references format
+    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.parquet", format="parquet")
+
+    # use fsspec to read the dataset from disk via the kerchunk references
+    return xr.open_dataset(f"{tmpdir}/refs.parquet", engine="kerchunk", **kwargs)
+
+
+@requires_zarr_python
+@pytest.mark.parametrize(
+    "roundtrip_func",
+    [
+        *(
+            [roundtrip_as_kerchunk_dict, roundtrip_as_kerchunk_json]
+            if has_kerchunk
+            else []
+        ),
+        *([roundtrip_as_kerchunk_parquet] if has_kerchunk and has_fastparquet else []),
+    ],
+)
+class TestRoundtrip:
     @parametrize_over_hdf_backends
-    def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
+    def test_roundtrip_no_concat(self, tmpdir, roundtrip_func, hdf_backend):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
 
@@ -98,20 +138,7 @@
         # use open_dataset_via_kerchunk to read it as references
         vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)
 
-        if format == "dict":
-            # write those references to an in-memory kerchunk-formatted references dictionary
-            ds_refs = vds.virtualize.to_kerchunk(format=format)
-
-            # use fsspec to read the dataset from the kerchunk references dict
-            roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
-        else:
-            # write those references to disk as kerchunk references format
-            vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
-            # use fsspec to read the dataset from disk via the kerchunk references
-            roundtrip = xr.open_dataset(
-                f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
-            )
+        roundtrip = roundtrip_func(vds, tmpdir, decode_times=False)
 
         # assert all_close to original dataset
         xrt.assert_allclose(roundtrip, ds)
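The class-level parametrization above assembles its parameter list at collection time from the `has_*` flags, so a missing optional dependency shrinks the test matrix instead of erroring it out. A standalone sketch of the same pattern (names here are illustrative stand-ins, not from the patch):

```python
import pytest

# stand-ins for virtualizarr.tests.has_kerchunk / has_fastparquet
has_kerchunk = True
has_fastparquet = False

params = [
    *(["dict", "json"] if has_kerchunk else []),
    *(["parquet"] if has_kerchunk and has_fastparquet else []),
]


@pytest.mark.parametrize("fmt", params)
def test_roundtrip_format(fmt):
    # with has_fastparquet=False, "parquet" is never even collected
    assert fmt in {"dict", "json"}
```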
@@ -123,7 +150,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
     @parametrize_over_hdf_backends
     @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
     def test_kerchunk_roundtrip_concat(
-        self, tmpdir, format, hdf_backend, decode_times, time_vars
+        self, tmpdir, roundtrip_func, hdf_backend, decode_times, time_vars
     ):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)
@@ -159,22 +186,7 @@ def test_kerchunk_roundtrip_concat(
         # concatenate virtually along time
         vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")
 
-        if format == "dict":
-            # write those references to an in-memory kerchunk-formatted references dictionary
-            ds_refs = vds.virtualize.to_kerchunk(format=format)
-
-            # use fsspec to read the dataset from the kerchunk references dict
-            roundtrip = xr.open_dataset(
-                ds_refs, engine="kerchunk", decode_times=decode_times
-            )
-        else:
-            # write those references to disk as kerchunk references format
-            vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
-            # use fsspec to read the dataset from disk via the kerchunk references
-            roundtrip = xr.open_dataset(
-                f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
-            )
+        roundtrip = roundtrip_func(vds, tmpdir, decode_times=decode_times)
 
         if decode_times is False:
             # assert all_close to original dataset
@@ -191,7 +203,7 @@ def test_kerchunk_roundtrip_concat(
             assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]
 
     @parametrize_over_hdf_backends
-    def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
+    def test_non_dimension_coordinates(self, tmpdir, roundtrip_func, hdf_backend):
         # regression test for GH issue #105
 
         if hdf_backend:
@@ -209,20 +221,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         assert "lat" in vds.coords
         assert "coordinates" not in vds.attrs
 
-        if format == "dict":
-            # write those references to an in-memory kerchunk-formatted references dictionary
-            ds_refs = vds.virtualize.to_kerchunk(format=format)
-
-            # use fsspec to read the dataset from the kerchunk references dict
-            roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
-        else:
-            # write those references to disk as kerchunk references format
-            vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
-            # use fsspec to read the dataset from disk via the kerchunk references
-            roundtrip = xr.open_dataset(
-                f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
-            )
+        roundtrip = roundtrip_func(vds, tmpdir)
 
         # assert equal to original dataset
         xrt.assert_allclose(roundtrip, ds)
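Every `roundtrip_func` injected above shares one call shape, which is what lets the three helpers substitute for the old `format` branches. A hypothetical typing sketch of that contract (not part of the patch, just documenting the assumption the tests rely on):

```python
from typing import Protocol

import xarray as xr


class RoundtripFunc(Protocol):
    """Contract shared by the roundtrip_as_kerchunk_* helpers."""

    def __call__(self, vds: xr.Dataset, tmpdir, **kwargs) -> xr.Dataset:
        """Write vds's references, reopen them, and return the result.

        Extra kwargs (e.g. decode_times) are forwarded to xr.open_dataset.
        """
        ...
```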
@@ -231,7 +230,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         for coord in ds.coords:
             assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
 
-    def test_datetime64_dtype_fill_value(self, tmpdir, format):
+    def test_datetime64_dtype_fill_value(self, tmpdir, roundtrip_func):
         chunks_dict = {
             "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
         }
@@ -249,7 +248,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
             zarr_format=2,
         )
         marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest)
-        ds = xr.Dataset(
+        vds = xr.Dataset(
             {
                 "a": xr.DataArray(
                     marr1,
@@ -260,20 +259,9 @@
             }
         )
 
-        if format == "dict":
-            # write those references to an in-memory kerchunk-formatted references dictionary
-            ds_refs = ds.virtualize.to_kerchunk(format=format)
-
-            # use fsspec to read the dataset from the kerchunk references dict
-            roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
-        else:
-            # write those references to disk as kerchunk references format
-            ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
-            # use fsspec to read the dataset from disk via the kerchunk references
-            roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
+        roundtrip = roundtrip_func(vds, tmpdir)
 
-        assert roundtrip.a.attrs == ds.a.attrs
+        assert roundtrip.a.attrs == vds.a.attrs
 
 
 @parametrize_over_hdf_backends
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
index fdbeb4d2..8583f210 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
@@ -4,16 +4,22 @@
 import virtualizarr
 from virtualizarr.readers.hdf import HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import (
+    requires_hdf5plugin,
+    requires_imagecodecs,
+    requires_kerchunk,
+)
 
 
 @requires_kerchunk
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestIntegration:
     @pytest.mark.xfail(
         reason="0 time start is being interpreted as fillvalue see issues/280"
     )
     def test_filters_h5netcdf_roundtrip(
-        self, tmpdir, filter_encoded_roundtrip_hdf5_file, backend=HDFVirtualBackend
+        self, tmpdir, filter_encoded_roundtrip_hdf5_file
     ):
         ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
         vds = virtualizarr.open_virtual_dataset(
diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py
index 89c6ba31..83f7999d 100644
--- a/virtualizarr/tests/test_readers/test_kerchunk.py
+++ b/virtualizarr/tests/test_readers/test_kerchunk.py
@@ -7,7 +7,7 @@
 from virtualizarr.backend import open_virtual_dataset
 from virtualizarr.manifests import ManifestArray
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import has_fastparquet, requires_kerchunk
 
 
 def gen_ds_refs(
@@ -177,7 +177,7 @@ def test_handle_relative_paths(refs_file_factory):
 @requires_kerchunk
 @pytest.mark.parametrize(
     "reference_format",
-    ["json", "parquet", "invalid"],
+    ["json", "invalid", *(["parquet"] if has_fastparquet else [])],
 )
 def test_open_virtual_dataset_existing_kerchunk_refs(
     tmp_path, netcdf4_virtual_dataset, reference_format
diff --git a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py
index 8cc7f825..1e9b240c 100644
--- a/virtualizarr/tests/test_writers/test_kerchunk.py
+++ b/virtualizarr/tests/test_writers/test_kerchunk.py
@@ -3,7 +3,7 @@
 from xarray import Dataset
 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import requires_fastparquet, requires_kerchunk
 
 
 @requires_kerchunk
@@ -108,6 +108,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
         }
         assert loaded_refs == expected_ds_refs
 
+    @requires_fastparquet
     def test_accessor_to_kerchunk_parquet(self, tmp_path):
         import ujson
 
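The new `has_fastparquet`/`requires_fastparquet` pair (like the icechunk pair added in `virtualizarr/tests/__init__.py`) comes from `_importorskip`. Assuming it follows the usual xarray-style helper, the mechanics look roughly like this sketch:

```python
import importlib

import pytest


def _importorskip(modname: str):
    """Return (has_module, skip_marker) for an optional dependency."""
    try:
        importlib.import_module(modname)
        has = True
    except ImportError:
        has = False
    marker = pytest.mark.skipif(not has, reason=f"requires {modname}")
    return has, marker


# the bool gates parametrize lists; the marker gates whole tests
has_fastparquet, requires_fastparquet = _importorskip("fastparquet")
```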
diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py
index 5afa87a3..9ca281cb 100644
--- a/virtualizarr/tests/test_writers/test_zarr.py
+++ b/virtualizarr/tests/test_writers/test_zarr.py
@@ -42,7 +42,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset)
     assert isinstance(metadata["fill_value"], (bool, int, float, str, list))
     assert (
         isinstance(metadata["codecs"], list)
-        and len(metadata["codecs"]) > 1
+        and len(metadata["codecs"]) == 1
         and all(isconfigurable(codec) for codec in metadata["codecs"])
     )
 
diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py
index 6ed535e5..538eab33 100644
--- a/virtualizarr/writers/icechunk.py
+++ b/virtualizarr/writers/icechunk.py
@@ -243,13 +243,13 @@ def write_virtual_variable_to_icechunk(
     else:
         append_axis = None
     # create array if it doesn't already exist
     arr = group.require_array(
         name=name,
         shape=zarray.shape,
-        chunk_shape=zarray.chunks,
+        chunks=zarray.chunks,
         dtype=encode_dtype(zarray.dtype),
-        codecs=zarray._v3_codec_pipeline(),
+        compressors=zarray._v3_codec_pipeline(),
+        serializer=zarray.serializer(),
         dimension_names=var.dims,
         fill_value=zarray.fill_value,
     )
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index e339a3f4..4c36e7dc 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -175,13 +175,6 @@ def _v3_codec_pipeline(self) -> Any:
             transpose = dict(name="transpose", configuration=dict(order=order))
             codec_configs.append(transpose)
 
-        # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
-        # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
-        bytes = dict(
-            name="bytes", configuration={}
-        )  # TODO need to handle endianess configuration
-        codec_configs.append(bytes)
-
         # Noting here that zarr v3 has very few codecs specificed in the official spec,
         # and that there are far more codecs in `numcodecs`. We take a gamble and assume
         # that the codec names and configuration are simply mapped into zarrv3 "configurables".
@@ -198,6 +191,23 @@ def _v3_codec_pipeline(self) -> Any:
 
         return codec_pipeline
 
+    def serializer(self) -> Any:
+        """
+        Return the single zarr v3 ArrayBytesCodec ("serializer") for this array.
+        """
+        try:
+            from zarr.core.metadata.v3 import (  # type: ignore[import-untyped]
+                parse_codecs,
+            )
+        except ImportError:
+            raise ImportError("zarr v3 is required to generate v3 codec pipelines")
+        # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
+        # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
+        bytes = dict(
+            name="bytes", configuration={}
+        )  # TODO need to handle endianness configuration
+        return parse_codecs([bytes])[0]
+
 
 def encode_dtype(dtype: np.dtype) -> str:
     # TODO not sure if there is a better way to get the '
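To see concretely what the new `serializer()` returns, here is a short hedged sketch using the same `parse_codecs` entry point the patch imports from zarr-python v3 (the printed class name is the expected outcome, not asserted by the patch itself):

```python
from zarr.core.metadata.v3 import parse_codecs

# Same config the new ZArray.serializer() builds: a plain "bytes" codec with
# default (unspecified) endianness.
codec = parse_codecs([{"name": "bytes", "configuration": {}}])[0]

# parse_codecs validates the pipeline and returns codec instances; index 0 is
# the ArrayBytesCodec that require_array(serializer=...) expects in the new
# icechunk/zarr-python v3 API.
print(type(codec).__name__)  # e.g. BytesCodec
```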