Switch to custom netcdf4/hdf5 backend (#395)
* Switch to custom netcdf4/hdf5 backend

* Switches the autodetected backend selection
* Updates tests to require kerchunk less often
* Only tests the kerchunk HDF reader if kerchunk is available

* Allow for kerchunk-based backend

* Rename to parametrize_over_hdf_backends

* Run group tests

* Respect dimensions without coordinates

* Fix #402 so that nested groups are ignored

* Encode #401 behavior in tests

* Fix min deps tests

* Make mypy happy

* Add to release notes

* combine two tests into one

---------

Co-authored-by: TomNicholas <tom@cworthy.org>
jsignell and TomNicholas authored Jan 30, 2025
1 parent 95fce11 commit 81a76f0
Showing 8 changed files with 107 additions and 98 deletions.
3 changes: 3 additions & 0 deletions docs/releases.rst
@@ -35,6 +35,9 @@ Breaking changes
   rather than positional or keyword. This change is breaking _only_ where arguments for
   these parameters are currently given positionally. (:issue:`341`) By
   `Chuck Daniels <https://github.com/chuckwondo>`_.
+- The default backend for netCDF4 and HDF5 is now the custom ``HDFVirtualBackend`` replacing
+  the previous default which was a wrapper around the kerchunk backend.
+  (:issue:`374`, :pull:`395`) By `Julia Signell <https://github.com/jsignell>`_.
 
 Deprecations
 ~~~~~~~~~~~~
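In practice, the behavior change described in this release note looks like the following sketch (the file name is hypothetical; both readers ship with VirtualiZarr, but the kerchunk-based one still requires kerchunk to be installed):

    from virtualizarr import open_virtual_dataset
    from virtualizarr.readers import HDF5VirtualBackend      # kerchunk-based reader
    from virtualizarr.readers.hdf import HDFVirtualBackend   # new custom reader

    # Autodetection of a netCDF4/HDF5 file now selects HDFVirtualBackend by default
    vds = open_virtual_dataset("air.nc", indexes={})

    # The previous kerchunk-based default can still be requested explicitly
    vds_kerchunk = open_virtual_dataset("air.nc", indexes={}, backend=HDF5VirtualBackend)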
6 changes: 3 additions & 3 deletions virtualizarr/backend.py
@@ -13,7 +13,7 @@
 from virtualizarr.readers import (
     DMRPPVirtualBackend,
     FITSVirtualBackend,
-    HDF5VirtualBackend,
+    HDFVirtualBackend,
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
@@ -27,9 +27,9 @@
     "kerchunk": KerchunkVirtualBackend,
     "zarr_v3": ZarrV3VirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
+    "hdf5": HDFVirtualBackend,
+    "netcdf4": HDFVirtualBackend,  # note this is the same as for hdf5
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
-    "hdf5": HDF5VirtualBackend,
-    "netcdf4": HDF5VirtualBackend,  # note this is the same as for hdf5
     "netcdf3": NetCDF3VirtualBackend,
     "tiff": TIFFVirtualBackend,
     "fits": FITSVirtualBackend,
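Autodetection works by looking the detected filetype up in this registry, so "hdf5" and "netcdf4" now resolve to the custom reader. A minimal sketch of that lookup, assuming the dict shown above is the registry (the name VIRTUAL_BACKENDS and the helper function here are illustrative, not necessarily the library's actual API):

    def select_backend(filetype: str):
        # Illustrative: normalize the detected filetype and look up its reader class
        backend = VIRTUAL_BACKENDS.get(filetype.lower())
        if backend is None:
            raise NotImplementedError(f"Unsupported file type: {filetype}")
        return backend

    select_backend("netcdf4")  # -> HDFVirtualBackend (no longer HDF5VirtualBackend)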
9 changes: 3 additions & 6 deletions virtualizarr/readers/hdf/hdf.py
@@ -361,14 +361,14 @@ def _virtual_vars_from_hdf(
         ).open_file()
         f = h5py.File(open_file, mode="r")
 
-        if group is not None:
+        if group is not None and group != "":
             g = f[group]
             group_name = group
             if not isinstance(g, h5py.Group):
                 raise ValueError("The provided group is not an HDF group")
         else:
-            g = f
-            group_name = ""
+            g = f["/"]
+            group_name = "/"
 
         variables = {}
         for key in g.keys():
@@ -381,9 +381,6 @@
                 )
                 if variable is not None:
                     variables[key] = variable
-            else:
-                raise NotImplementedError("Nested groups are not yet supported")
-
         return variables
 
     @staticmethod
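The net effect of these two hunks: group=None and group="" both resolve to the root group, and groups nested below the requested one are silently skipped (fixing #402) instead of raising NotImplementedError. A self-contained h5py sketch of the same traversal, under the assumption that variables are built only from h5py.Dataset objects (the function and file names are illustrative):

    import h5py

    def root_level_datasets(path, group=None):
        # None and "" both mean the HDF root group, mirroring the reader change
        with h5py.File(path, mode="r") as f:
            g = f[group] if group not in (None, "") else f["/"]
            if not isinstance(g, h5py.Group):
                raise ValueError("The provided group is not an HDF group")
            # Nested groups are ignored rather than raising NotImplementedError
            return [key for key in g.keys() if isinstance(g[key], h5py.Dataset)]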
7 changes: 7 additions & 0 deletions virtualizarr/tests/__init__.py
@@ -7,6 +7,8 @@
 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.manifests.manifest import join
+from virtualizarr.readers import HDF5VirtualBackend
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.zarr import ZArray, ceildiv
 
 requires_network = pytest.mark.network
@@ -42,6 +44,11 @@ def _importorskip(
 has_zarr_python, requires_zarr_python = _importorskip("zarr")
 has_zarr_python_v3, requires_zarr_python_v3 = _importorskip("zarr", "3.0.0b")
 
+parametrize_over_hdf_backends = pytest.mark.parametrize(
+    "hdf_backend",
+    [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend],
+)
+
 
 def create_manifestarray(
     shape: tuple[int, ...], chunks: tuple[int, ...]
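A test decorated with the new helper is parametrized over both readers when kerchunk is installed, and over just HDFVirtualBackend otherwise. A minimal usage sketch (the test itself is illustrative; the netcdf4_file fixture comes from the test suite):

    from virtualizarr import open_virtual_dataset
    from virtualizarr.tests import parametrize_over_hdf_backends

    @parametrize_over_hdf_backends
    def test_opens_with_each_hdf_backend(netcdf4_file, hdf_backend):
        # Runs once per entry in the backend list; the kerchunk-based
        # HDF5VirtualBackend is only included when kerchunk is importable
        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
        assert vds.variables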
108 changes: 62 additions & 46 deletions virtualizarr/tests/test_backend.py
@@ -15,7 +15,9 @@
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     has_astropy,
-    requires_kerchunk,
+    parametrize_over_hdf_backends,
+    requires_hdf5plugin,
+    requires_imagecodecs,
     requires_network,
     requires_s3fs,
     requires_scipy,
@@ -82,13 +84,14 @@ def test_FileType():
         FileType(None)
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetIndexes:
     def test_no_indexes(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert vds.indexes == {}
 
+    @requires_hdf5plugin
+    @requires_imagecodecs
     def test_create_default_indexes_for_loadable_variables(
         self, netcdf4_file, hdf_backend
     ):
@@ -122,8 +125,9 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]):
     return True
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@requires_hdf5plugin
+@requires_imagecodecs
+@parametrize_over_hdf_backends
 def test_cftime_index(tmpdir, hdf_backend):
     """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
     # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
@@ -152,8 +156,7 @@ def test_cftime_index(tmpdir, hdf_backend):
     assert vds.attrs == ds.attrs
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetAttrs:
     def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
         # regression test for GH issue #150
@@ -171,14 +174,16 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
         }
 
 
-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestDetermineCoords:
-    def test_infer_one_dimensional_coords(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert set(vds.coords) == {"time", "lat", "lon"}
 
-    def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
-        vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={})
+    def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
+        vds = open_virtual_dataset(
+            netcdf4_file_with_2d_coords, indexes={}, backend=hdf_backend
+        )
 
         expected_dimension_coords = ["ocean_time", "s_rho"]
         expected_2d_coords = ["lon_rho", "lat_rho", "h"]
@@ -189,6 +194,8 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
             + expected_2d_coords
             + expected_1d_non_dimension_coords
             + expected_scalar_coords
+            # These should not be included in coords see #401 for more information
+            + (["xi_rho", "eta_rho"] if hdf_backend == HDFVirtualBackend else [])
         )
         assert set(vds.coords) == set(expected_coords)
 
@@ -199,7 +206,7 @@ class TestReadFromS3:
     @pytest.mark.parametrize(
         "indexes", [None, {}], ids=["None index", "empty dict index"]
    )
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_anon_read_s3(self, indexes, hdf_backend):
         """Parameterized tests for empty vs supplied indexes and filetypes."""
         # TODO: Switch away from this s3 url after minIO is implemented.
@@ -217,7 +224,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):
 
 
 @requires_network
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestReadFromURL:
     @pytest.mark.parametrize(
         "filetype, url",
@@ -320,46 +327,55 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend):
         xrt.assert_equal(dsXR, dsV)
 
 
-@requires_kerchunk
-def test_open_empty_group(empty_netcdf4_file):
-    vds = open_virtual_dataset(empty_netcdf4_file, indexes={})
-    assert isinstance(vds, xr.Dataset)
-    expected = Dataset()
-    xrt.assert_identical(vds, expected)
-
-
-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetHDFGroup:
-    def test_open_subgroup(self, netcdf4_file_with_data_in_multiple_groups):
+    def test_open_empty_group(self, empty_netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(empty_netcdf4_file, indexes={}, backend=hdf_backend)
+        assert isinstance(vds, xr.Dataset)
+        expected = Dataset()
+        xrt.assert_identical(vds, expected)
+
+    def test_open_subgroup(
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="subgroup", indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group="subgroup",
+            indexes={},
+            backend=hdf_backend,
         )
-        assert list(vds.variables) == ["bar"]
+        # This should just be ["bar"] see #401 for more information
+        assert list(vds.variables) == (
+            ["bar", "dim_0"] if hdf_backend == HDFVirtualBackend else ["bar"]
+        )
         assert isinstance(vds["bar"].data, ManifestArray)
         assert vds["bar"].shape == (2,)
 
-    def test_open_root_group_manually(self, netcdf4_file_with_data_in_multiple_groups):
-        vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="", indexes={}
-        )
-        assert list(vds.variables) == ["foo"]
-        assert isinstance(vds["foo"].data, ManifestArray)
-        assert vds["foo"].shape == (3,)
-
-    def test_open_root_group_by_default(
-        self, netcdf4_file_with_data_in_multiple_groups
+    @pytest.mark.parametrize("group", ["", None])
+    def test_open_root_group(
+        self,
+        netcdf4_file_with_data_in_multiple_groups,
+        hdf_backend,
+        group,
     ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group=group,
+            indexes={},
+            backend=hdf_backend,
         )
-        assert list(vds.variables) == ["foo"]
+        # This should just be ["foo"] see #401 for more information
+        assert list(vds.variables) == (
+            ["foo", "dim_0"] if hdf_backend == HDFVirtualBackend else ["foo"]
+        )
         assert isinstance(vds["foo"].data, ManifestArray)
         assert vds["foo"].shape == (3,)
 
 
-@requires_kerchunk
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestLoadVirtualDataset:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_loadable_variables(self, netcdf4_file, hdf_backend):
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -399,18 +415,18 @@ def test_explicit_filetype_and_backend(self, netcdf4_file):
             netcdf4_file, filetype="hdf", backend=HDFVirtualBackend
         )
 
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
         if hdf_backend == HDFVirtualBackend:
-            with pytest.raises(NotImplementedError, match="Nested groups"):
-                open_virtual_dataset(hdf5_groups_file, backend=hdf_backend)
             with pytest.raises(KeyError, match="doesn't exist"):
                 open_virtual_dataset(
                     hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
                 )
         if hdf_backend == HDF5VirtualBackend:
             with pytest.raises(ValueError, match="not found in"):
-                open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
+                open_virtual_dataset(
+                    hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
+                )
 
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -443,13 +459,13 @@ def test_open_virtual_dataset_passes_expected_args(
         }
         mock_read_kerchunk.assert_called_once_with(**args)
 
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend):
         vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
         assert vds.empty.dims == ()
         assert vds.empty.attrs == {"empty": "true"}
 
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
         vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
         assert vds.scalar.dims == ()
20 changes: 7 additions & 13 deletions virtualizarr/tests/test_integration.py
@@ -8,9 +8,7 @@
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers import HDF5VirtualBackend
-from virtualizarr.readers.hdf import HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
 from virtualizarr.translators.kerchunk import (
     dataset_from_kerchunk_refs,
 )
@@ -61,7 +59,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
         ),
     ],
 )
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_numpy_arrays_to_inlined_kerchunk_refs(
     netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
 ):
@@ -89,7 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
 @requires_kerchunk
 @pytest.mark.parametrize("format", ["dict", "json", "parquet"])
 class TestKerchunkRoundtrip:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
@@ -122,7 +120,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         for coord in ds.coords:
             assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
 
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
     def test_kerchunk_roundtrip_concat(
         self, tmpdir, format, hdf_backend, decode_times, time_vars
@@ -192,7 +190,7 @@ def test_kerchunk_roundtrip_concat(
         assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
         assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]
 
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
    def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         # regression test for GH issue #105
 
@@ -278,8 +276,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
         assert roundtrip.a.attrs == ds.a.attrs
 
 
-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_open_scalar_variable(tmpdir, hdf_backend):
     # regression test for GH issue #100
 
@@ -290,9 +287,8 @@ def test_open_scalar_variable(tmpdir, hdf_backend):
     assert vds["a"].shape == ()
 
 
+@parametrize_over_hdf_backends
 class TestPathsToURIs:
-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
 
@@ -302,8 +298,6 @@ def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         path = manifest["0.0.0"]["path"]
         assert path == expected_path
 
-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend):
         relative_path = relpath(netcdf4_file)
         vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend)
