Update to zarr 3 and main kerchunk (#406)
* Use open_dataset_kerchunk in roundtrip tests that don't otherwise require kerchunk

* Make it clear that integration tests require zarr-python

* Add in-memory icechunk tests to existing roundtrip tests

* Playing around with icechunk / zarr / xarray upgrade

* Passing icechunk tests

* Update tests to latest kerchunk

* Remove icechunk roundtripping

* Fixed some warnings

* Fixed codec test

* Fix warnings in test_backend.py

* Tests passing

* Remove obsolete comment

* Add fill value to fixture

* Remove obsolete conditional to ds.close()

* Reset workflows with --cov

* Reset conftest.py fixtures (air encoding)

* Reset contributing (--cov) removed

* Remove context manager from readers/common.py

* Reset test_backend with ds.dims

* Reset test_icechunk (air encoding)

* Fix change that snuck in on #395

---------

Co-authored-by: Aimee Barciauskas <aimee@developmentseed.org>
jsignell and abarciauskas-bgse authored Jan 31, 2025
1 parent 61847e9 commit 83a6f10
Showing 12 changed files with 101 additions and 92 deletions.
ci/upstream.yml: 6 changes (3 additions & 3 deletions)
@@ -3,7 +3,7 @@ channels:
- conda-forge
- nodefaults
dependencies:
- - xarray>=2024.10.0,<2025.0.0
+ - xarray>=2025.1.1
- h5netcdf
- h5py
- hdf5
@@ -29,6 +29,6 @@ dependencies:
- fsspec
- pip
- pip:
- - icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency
- # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
+ - icechunk>=0.1.0a12 # Installs zarr-python v3 as dependency
+ - git+https://github.com/fsspec/kerchunk.git@main
- imagecodecs-numcodecs==2024.6.1
pyproject.toml: 4 changes (2 additions & 2 deletions)
@@ -21,7 +21,7 @@ classifiers = [
requires-python = ">=3.10"
dynamic = ["version"]
dependencies = [
"xarray>=2024.10.0,<2025.0.0",
"xarray>=2025.1.1",
"numpy>=2.0.0",
"packaging",
"universal-pathlib",
@@ -39,7 +39,7 @@ hdf_reader = [
"numcodecs"
]
icechunk = [
"icechunk==0.1.0a8",
"icechunk>=0.1.0a12",
]
test = [
"codecov",
virtualizarr/codecs.py: 2 changes (1 addition & 1 deletion)
@@ -55,7 +55,7 @@ def _get_manifestarray_codecs(
) -> Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]:
"""Get codecs for a ManifestArray based on its zarr_format."""
if normalize_to_zarr_v3 or array.zarray.zarr_format == 3:
- return array.zarray._v3_codec_pipeline()
+ return (array.zarray.serializer(),) + array.zarray._v3_codec_pipeline()
elif array.zarray.zarr_format == 2:
return array.zarray.codec
else:
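For context: in zarr v3 the array-to-bytes step (the "serializer") is modeled separately from the rest of the codec pipeline, so the normalized branch now prepends it. A minimal sketch of exercising this; the ZArray import path and the ChunkManifest entries keyword are assumptions based on the fixtures elsewhere in this diff:

# Sketch only: import paths and constructor keywords are assumptions.
import numpy as np

from virtualizarr.codecs import get_codecs
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.zarr import ZArray  # assumed location of ZArray

manifest = ChunkManifest(
    entries={"0.0": {"path": "/foo.nc", "offset": 100, "length": 100}}
)
zarray = ZArray(shape=(2, 3), chunks=(2, 3), dtype=np.dtype("int32"), zarr_format=2)
marr = ManifestArray(zarray=zarray, chunkmanifest=manifest)

# The serializer now leads the normalized v3 codec tuple:
codecs = get_codecs(marr, normalize_to_zarr_v3=True)
assert codecs == (zarray.serializer(),) + zarray._v3_codec_pipeline()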
virtualizarr/tests/__init__.py: 2 changes (2 additions & 0 deletions)
@@ -35,7 +35,9 @@ def _importorskip(


has_astropy, requires_astropy = _importorskip("astropy")
+ has_icechunk, requires_icechunk = _importorskip("icechunk")
has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
+ has_fastparquet, requires_fastparquet = _importorskip("fastparquet")
has_s3fs, requires_s3fs = _importorskip("s3fs")
has_scipy, requires_scipy = _importorskip("scipy")
has_tifffile, requires_tifffile = _importorskip("tifffile")
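The new has_icechunk and has_fastparquet flags follow the same _importorskip convention as the existing ones: each call returns a boolean plus a pytest skip marker. A minimal sketch of the pattern (the real helper in virtualizarr/tests may differ in detail):

import importlib

import pytest


def _importorskip(modname: str) -> tuple[bool, pytest.MarkDecorator]:
    # Return (is_installed, skipif marker) for an optional dependency.
    try:
        importlib.import_module(modname)
        has = True
    except ImportError:
        has = False
    marker = pytest.mark.skipif(not has, reason=f"requires {modname}")
    return has, marker


has_icechunk, requires_icechunk = _importorskip("icechunk")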
virtualizarr/tests/test_codecs.py: 4 changes (3 additions & 1 deletion)
@@ -58,7 +58,9 @@ def test_manifest_array_zarr_v2_normalized(self):

# Get codecs and verify
actual_codecs = get_codecs(manifest_array, normalize_to_zarr_v3=True)
- expected_codecs = manifest_array.zarray._v3_codec_pipeline()
+ expected_codecs = (
+ manifest_array.zarray.serializer(),
+ ) + manifest_array.zarray._v3_codec_pipeline()
assert actual_codecs == expected_codecs

@requires_zarr_python_v3
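In zarr v3 terms, the serializer is the single array-to-bytes codec, distinct from array-to-array filters and bytes-to-bytes compressors, which is why the expected tuple now leads with it. Purely illustrative, using zarr-python v3's built-in codecs rather than VirtualiZarr's own types:

# Illustrative shape of a normalized v3 codec tuple (not VirtualiZarr API).
from zarr.codecs import BloscCodec, BytesCodec

serializer = BytesCodec()                   # array -> bytes
compressors = (BloscCodec(cname="zstd"),)   # bytes -> bytes
normalized = (serializer,) + compressors
print(normalized)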
virtualizarr/tests/test_integration.py: 126 changes (57 additions & 69 deletions)
@@ -8,7 +8,13 @@

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
- from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
+ from virtualizarr.tests import (
+ has_fastparquet,
+ has_kerchunk,
+ parametrize_over_hdf_backends,
+ requires_kerchunk,
+ requires_zarr_python,
+ )
from virtualizarr.translators.kerchunk import (
dataset_from_kerchunk_refs,
)
@@ -34,16 +40,16 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
),
chunkmanifest=manifest,
)
- ds = xr.Dataset({"a": (["x", "y"], marr)})
+ vds = xr.Dataset({"a": (["x", "y"], marr)})

# Use accessor to write it out to kerchunk reference dict
- ds_refs = ds.virtualize.to_kerchunk(format="dict")
+ ds_refs = vds.virtualize.to_kerchunk(format="dict")

# Use dataset_from_kerchunk_refs to reconstruct the dataset
roundtrip = dataset_from_kerchunk_refs(ds_refs)

# Assert equal to original dataset
- xrt.assert_equal(roundtrip, ds)
+ xrt.assert_equal(roundtrip, vds)


@requires_kerchunk
@@ -84,11 +90,45 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
assert refs["refs"]["time/0"] == expected["refs"]["time/0"]


- @requires_kerchunk
- @pytest.mark.parametrize("format", ["dict", "json", "parquet"])
- class TestKerchunkRoundtrip:
+ def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs):
+ # write those references to an in-memory kerchunk-formatted references dictionary
+ ds_refs = vds.virtualize.to_kerchunk(format="dict")
+
+ # use fsspec to read the dataset from the kerchunk references dict
+ return xr.open_dataset(ds_refs, engine="kerchunk", **kwargs)
+
+
+ def roundtrip_as_kerchunk_json(vds: xr.Dataset, tmpdir, **kwargs):
+ # write those references to disk as kerchunk references format
+ vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")
+
+ # use fsspec to read the dataset from disk via the kerchunk references
+ return xr.open_dataset(f"{tmpdir}/refs.json", engine="kerchunk", **kwargs)
+
+
+ def roundtrip_as_kerchunk_parquet(vds: xr.Dataset, tmpdir, **kwargs):
+ # write those references to disk as kerchunk references format
+ vds.virtualize.to_kerchunk(f"{tmpdir}/refs.parquet", format="parquet")
+
+ # use fsspec to read the dataset from disk via the kerchunk references
+ return xr.open_dataset(f"{tmpdir}/refs.parquet", engine="kerchunk", **kwargs)
+
+
+ @requires_zarr_python
+ @pytest.mark.parametrize(
+ "roundtrip_func",
+ [
+ *(
+ [roundtrip_as_kerchunk_dict, roundtrip_as_kerchunk_json]
+ if has_kerchunk
+ else []
+ ),
+ *([roundtrip_as_kerchunk_parquet] if has_kerchunk and has_fastparquet else []),
+ ],
+ )
+ class TestRoundtrip:
@parametrize_over_hdf_backends
- def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
+ def test_roundtrip_no_concat(self, tmpdir, roundtrip_func, hdf_backend):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)

@@ -98,20 +138,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
# use open_dataset_via_kerchunk to read it as references
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)

- if format == "dict":
- # write those references to an in-memory kerchunk-formatted references dictionary
- ds_refs = vds.virtualize.to_kerchunk(format=format)
-
- # use fsspec to read the dataset from the kerchunk references dict
- roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
- else:
- # write those references to disk as kerchunk references format
- vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
- # use fsspec to read the dataset from disk via the kerchunk references
- roundtrip = xr.open_dataset(
- f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
- )
+ roundtrip = roundtrip_func(vds, tmpdir, decode_times=False)

# assert all_close to original dataset
xrt.assert_allclose(roundtrip, ds)
@@ -123,7 +150,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
@parametrize_over_hdf_backends
@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
def test_kerchunk_roundtrip_concat(
- self, tmpdir, format, hdf_backend, decode_times, time_vars
+ self, tmpdir, roundtrip_func, hdf_backend, decode_times, time_vars
):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)
@@ -159,22 +186,7 @@ def test_kerchunk_roundtrip_concat(
# concatenate virtually along time
vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")

- if format == "dict":
- # write those references to an in-memory kerchunk-formatted references dictionary
- ds_refs = vds.virtualize.to_kerchunk(format=format)
-
- # use fsspec to read the dataset from the kerchunk references dict
- roundtrip = xr.open_dataset(
- ds_refs, engine="kerchunk", decode_times=decode_times
- )
- else:
- # write those references to disk as kerchunk references format
- vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
- # use fsspec to read the dataset from disk via the kerchunk references
- roundtrip = xr.open_dataset(
- f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
- )
+ roundtrip = roundtrip_func(vds, tmpdir, decode_times=decode_times)

if decode_times is False:
# assert all_close to original dataset
@@ -191,7 +203,7 @@ def test_kerchunk_roundtrip_concat(
assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]

@parametrize_over_hdf_backends
- def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
+ def test_non_dimension_coordinates(self, tmpdir, roundtrip_func, hdf_backend):
# regression test for GH issue #105

if hdf_backend:
@@ -209,20 +221,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
assert "lat" in vds.coords
assert "coordinates" not in vds.attrs

- if format == "dict":
- # write those references to an in-memory kerchunk-formatted references dictionary
- ds_refs = vds.virtualize.to_kerchunk(format=format)
-
- # use fsspec to read the dataset from the kerchunk references dict
- roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
- else:
- # write those references to disk as kerchunk references format
- vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
- # use fsspec to read the dataset from disk via the kerchunk references
- roundtrip = xr.open_dataset(
- f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
- )
+ roundtrip = roundtrip_func(vds, tmpdir)

# assert equal to original dataset
xrt.assert_allclose(roundtrip, ds)
@@ -231,7 +230,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
for coord in ds.coords:
assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs

- def test_datetime64_dtype_fill_value(self, tmpdir, format):
+ def test_datetime64_dtype_fill_value(self, tmpdir, roundtrip_func):
chunks_dict = {
"0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
}
@@ -249,7 +248,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
zarr_format=2,
)
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest)
- ds = xr.Dataset(
+ vds = xr.Dataset(
{
"a": xr.DataArray(
marr1,
@@ -260,20 +259,9 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
}
)

- if format == "dict":
- # write those references to an in-memory kerchunk-formatted references dictionary
- ds_refs = ds.virtualize.to_kerchunk(format=format)
-
- # use fsspec to read the dataset from the kerchunk references dict
- roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
- else:
- # write those references to disk as kerchunk references format
- ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
-
- # use fsspec to read the dataset from disk via the kerchunk references
- roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
+ roundtrip = roundtrip_func(vds, tmpdir)

- assert roundtrip.a.attrs == ds.a.attrs
+ assert roundtrip.a.attrs == vds.a.attrs


@parametrize_over_hdf_backends
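The refactor above swaps the old format string parameter for roundtrip functions, so a variant whose optional dependency is missing simply drops out of the parametrize list instead of failing at collection time. A self-contained sketch of the idiom (names here are illustrative, not VirtualiZarr API):

import pytest

has_optional_writer = False  # stand-in for a has_fastparquet-style flag


def roundtrip_via_dict(obj, tmpdir):
    return obj  # placeholder for to_kerchunk(format="dict") + open_dataset


def roundtrip_via_parquet(obj, tmpdir):
    return obj  # placeholder for the parquet-backed variant


@pytest.mark.parametrize(
    "roundtrip_func",
    [roundtrip_via_dict, *([roundtrip_via_parquet] if has_optional_writer else [])],
)
def test_roundtrip(tmpdir, roundtrip_func):
    assert roundtrip_func(42, tmpdir) == 42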
virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py: 10 changes (8 additions & 2 deletions)
@@ -4,16 +4,22 @@

import virtualizarr
from virtualizarr.readers.hdf import HDFVirtualBackend
- from virtualizarr.tests import requires_kerchunk
+ from virtualizarr.tests import (
+ requires_hdf5plugin,
+ requires_imagecodecs,
+ requires_kerchunk,
+ )


@requires_kerchunk
+ @requires_hdf5plugin
+ @requires_imagecodecs
class TestIntegration:
@pytest.mark.xfail(
reason="0 time start is being interpreted as fillvalue see issues/280"
)
def test_filters_h5netcdf_roundtrip(
- self, tmpdir, filter_encoded_roundtrip_hdf5_file, backend=HDFVirtualBackend
+ self, tmpdir, filter_encoded_roundtrip_hdf5_file
):
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
vds = virtualizarr.open_virtual_dataset(
virtualizarr/tests/test_readers/test_kerchunk.py: 4 changes (2 additions & 2 deletions)
@@ -7,7 +7,7 @@

from virtualizarr.backend import open_virtual_dataset
from virtualizarr.manifests import ManifestArray
- from virtualizarr.tests import requires_kerchunk
+ from virtualizarr.tests import has_fastparquet, requires_kerchunk


def gen_ds_refs(
@@ -177,7 +177,7 @@ def test_handle_relative_paths(refs_file_factory):
@requires_kerchunk
@pytest.mark.parametrize(
"reference_format",
["json", "parquet", "invalid"],
["json", "invalid", *(["parquet"] if has_fastparquet else [])],
)
def test_open_virtual_dataset_existing_kerchunk_refs(
tmp_path, netcdf4_virtual_dataset, reference_format
virtualizarr/tests/test_writers/test_kerchunk.py: 3 changes (2 additions & 1 deletion)
@@ -3,7 +3,7 @@
from xarray import Dataset

from virtualizarr.manifests import ChunkManifest, ManifestArray
- from virtualizarr.tests import requires_kerchunk
+ from virtualizarr.tests import requires_fastparquet, requires_kerchunk


@requires_kerchunk
@@ -108,6 +108,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
}
assert loaded_refs == expected_ds_refs

+ @requires_fastparquet
def test_accessor_to_kerchunk_parquet(self, tmp_path):
import ujson

virtualizarr/tests/test_writers/test_zarr.py: 2 changes (1 addition & 1 deletion)
@@ -42,7 +42,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset)
assert isinstance(metadata["fill_value"], (bool, int, float, str, list))
assert (
isinstance(metadata["codecs"], list)
- and len(metadata["codecs"]) > 1
+ and len(metadata["codecs"]) == 1
and all(isconfigurable(codec) for codec in metadata["codecs"])
)

virtualizarr/writers/icechunk.py: 6 changes (3 additions & 3 deletions)
@@ -243,13 +243,13 @@ def write_virtual_variable_to_icechunk(
else:
append_axis = None
# create array if it doesn't already exist
-
arr = group.require_array(
name=name,
shape=zarray.shape,
- chunk_shape=zarray.chunks,
+ chunks=zarray.chunks,
dtype=encode_dtype(zarray.dtype),
- codecs=zarray._v3_codec_pipeline(),
+ compressors=zarray._v3_codec_pipeline(), # compressors,
+ serializer=zarray.serializer(),
dimension_names=var.dims,
fill_value=zarray.fill_value,
)
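The keyword changes track zarr-python v3's array-creation API: chunk_shape becomes chunks, and the single codecs pipeline splits into compressors (bytes-to-bytes codecs) plus one serializer (the array-to-bytes codec). A runnable sketch against plain zarr-python v3; icechunk's group is assumed to expose the same zarr Group interface:

import zarr
from zarr.codecs import BytesCodec

group = zarr.group(store=zarr.storage.MemoryStore())
arr = group.require_array(
    name="a",
    shape=(2, 3),
    chunks=(2, 3),               # was chunk_shape= against earlier zarr v3 betas
    dtype="int64",
    compressors=(),              # bytes-to-bytes codecs; none in this sketch
    serializer=BytesCodec(),     # array-to-bytes codec
    dimension_names=("x", "y"),
    fill_value=0,
)
arr[:] = 0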